def test_pylearn2_training():
    # Construct the model
    mlp = MLP(activations=[Sigmoid(), Sigmoid()], dims=[784, 100, 784],
              weights_init=IsotropicGaussian(), biases_init=Constant(0.01))
    mlp.initialize()
    cost = SquaredError()

    block_cost = BlocksCost(cost)
    block_model = BlocksModel(mlp, (VectorSpace(dim=784), 'features'))

    # Load the data
    rng = numpy.random.RandomState(14)
    train_dataset = random_dense_design_matrix(rng, 1024, 784, 10)
    valid_dataset = random_dense_design_matrix(rng, 1024, 784, 10)

    # Silence Pylearn2's logger
    logger = logging.getLogger(pylearn2.__name__)
    logger.setLevel(logging.ERROR)

    # Training algorithm
    sgd = SGD(learning_rate=0.01, cost=block_cost, batch_size=128,
              monitoring_dataset=valid_dataset)
    train = Train(train_dataset, block_model, algorithm=sgd)
    train.main_loop(time_budget=3)
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Sigmoid(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.params[0]]
    bias = [brick1.b, brick2.params[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using the each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert parameters != role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by brick classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)
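# Hedged follow-on sketch (not from the original test): once variables have
# been filtered, a typical use is building a regularizer from the result.
# WEIGHT is the standard blocks.roles role; `l2_penalty` is our name, and the
# graph `cg` is the one constructed in the test above.
from blocks.roles import WEIGHT

weights = VariableFilter(roles=[WEIGHT])(cg.variables)
l2_penalty = sum((w ** 2).sum() for w in weights)  # scalar Theano expression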
def __init__(self, input_dim, hidden_dim, **kwargs):
    super(VariationalAutoEncoder, self).__init__(**kwargs)
    encoder_mlp = MLP([Sigmoid(), Identity()], [input_dim, 101, None])
    decoder_mlp = MLP([Sigmoid(), Sigmoid()], [hidden_dim, 101, input_dim])

    self.hidden_dim = hidden_dim
    self.encoder = VAEEncoder(encoder_mlp, hidden_dim)
    self.decoder = VAEDecoder(decoder_mlp)
    self.children = [self.encoder, self.decoder]
def __init__(self, visible_dim, hidden_dim, activation=Sigmoid(), **kwargs):
    # Note: the Sigmoid() default is evaluated once at definition time, so
    # every Rbm built with the default shares that single brick instance.
    super(Rbm, self).__init__(**kwargs)
    self.hidden_dim = hidden_dim
    self.visible_dim = visible_dim
    self.activation = activation
    self.children = [activation]
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    super(GatedRecurrent, self).__init__(**kwargs)
    self.dim = dim
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Sigmoid()
    self.activation = activation
    self.gate_activation = gate_activation
    self.children = [activation, gate_activation]
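# Hedged usage sketch: constructing the brick above without explicit
# activations, so the Tanh/Sigmoid defaults from __init__ are filled in.
# (dim=100 is an arbitrary example value.)
gru = GatedRecurrent(dim=100)
assert isinstance(gru.activation, Tanh)
assert isinstance(gru.gate_activation, Sigmoid)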
def initialize_rbm(Wrbm=None, bh=None, bv=None):
    rbm = Rbm(visible_dim=88, hidden_dim=256, activation=Sigmoid(),
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0.1), name='rbm2l')
    rbm.allocate()
    rbm.initialize()
    if Wrbm is not None:
        rbm.W.set_value(Wrbm)
    if bv is not None:
        rbm.bv.set_value(bv)
    if bh is not None:
        rbm.bh.set_value(bh)
    return rbm
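# Hedged usage sketch: warm-starting the RBM from saved parameters. The .npy
# file names are hypothetical; the array shapes must match the 88 visible and
# 256 hidden units fixed inside initialize_rbm above.
import numpy
Wrbm = numpy.load('rbm_W.npy')   # assumed shape (88, 256)
bv = numpy.load('rbm_bv.npy')    # assumed shape (88,)
bh = numpy.load('rbm_bh.npy')    # assumed shape (256,)
rbm = initialize_rbm(Wrbm=Wrbm, bh=bh, bv=bv)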
def __init__(self, activation, gate_activation, dim,
             use_update_gate=True, use_reset_gate=True, **kwargs):
    super(GatedRecurrent, self).__init__(**kwargs)
    self.dim = dim
    self.use_update_gate = use_update_gate
    self.use_reset_gate = use_reset_gate
    if not activation:
        activation = Identity()
    if not gate_activation:
        gate_activation = Sigmoid()
    self.activation = activation
    self.gate_activation = gate_activation
    self.children = [activation, gate_activation]
def __init__(self, activation, gate_activation, dim,
             use_update_gate=True, use_reset_gate=True, **kwargs):
    super(GatedRecurrent, self).__init__(**kwargs)
    if not activation:
        activation = Identity()
    if not gate_activation:
        gate_activation = Sigmoid()
    update_instance(self, locals())
    self.children = [activation, gate_activation]
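# Hedged note: `update_instance(self, locals())` in the variant above comes
# from the old blocks.utils module. A rough sketch of its assumed behavior --
# copy each local into an attribute, skipping `self` and `kwargs`:
def update_instance(instance, namespace):
    for name, value in namespace.items():
        if name not in ('self', 'kwargs'):
            setattr(instance, name, value)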
def __init__(self, dims=(88, 100, 100), **kwargs):
    super(Rnn, self).__init__(**kwargs)
    self.dims = dims
    self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1],
                                  weights_init=IsotropicGaussian(0.01),
                                  #biases_init=Constant(0.0),
                                  use_bias=False,
                                  name="input_transform")
    self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(),
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0.0),
                                     use_bias=True,
                                     name="gru_rnn_layer")
    # TODO: find a way to set the output dim automatically (an LSTM needs
    # 4 * dim inputs, a plain RNN only dim).
    self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4,
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0),
                               use_bias=False,
                               name="h2h_transform")
    self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(),
                           weights_init=IsotropicGaussian(0.01),
                           biases_init=Constant(0.0),
                           use_bias=True,
                           name="lstm_rnn_layer")
    self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]],
                             weights_init=IsotropicGaussian(0.01),
                             use_bias=True,
                             biases_init=Constant(0.0),
                             name="out_layer")
    self.children = [self.input_transform, self.gru_layer, self.linear_trans,
                     self.lstm_layer, self.out_transform]
def __init__(self, visible_dim, hidden_dim, rnn_dimensions=(128, 128),
             **kwargs):
    super(Rnnrbm, self).__init__(**kwargs)
    self.rnn_dimensions = rnn_dimensions
    self.visible_dim = visible_dim
    self.hidden_dim = hidden_dim
    #self.in_layer = Linear(input_dim=input_dim, output_dim=rnn_dimension * 4,
    #                       weights_init=IsotropicGaussian(0.01),
    #                       biases_init=Constant(0.0), use_bias=False,
    #                       name="in_layer")
    self.rbm = Rbm(visible_dim=visible_dim, hidden_dim=hidden_dim,
                   activation=Sigmoid(),
                   weights_init=IsotropicGaussian(0.1),
                   biases_init=Constant(0.1), name='rbm')
    self.uv = Linear(input_dim=rnn_dimensions[-1], output_dim=visible_dim,
                     weights_init=IsotropicGaussian(0.0001),
                     biases_init=Constant(0.001), use_bias=True, name='uv')
    self.uh = Linear(input_dim=rnn_dimensions[-1], output_dim=hidden_dim,
                     weights_init=IsotropicGaussian(0.0001),
                     biases_init=Constant(0.001), use_bias=True, name='uh')
    self.rnn = Rnn([visible_dim] + list(rnn_dimensions), name='rnn')
    self.children = [self.rbm, self.uv, self.uh,
                     self.rnn] + self.rnn.children._items
def main_run(_config, _log):
    from collections import namedtuple
    c = namedtuple("Config", _config.keys())(*_config.values())
    _log.info("Running with " + str(_config))

    import theano
    from theano import tensor as T
    import numpy as np

    from dataset import IMDBText, GloveTransformer

    from blocks.initialization import (Uniform, Constant, IsotropicGaussian,
                                       NdarrayInitialization, Identity,
                                       Orthogonal)
    from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent
    from blocks.bricks.parallel import Fork
    from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier
    from blocks import bricks

    from blocks.extensions import Printing, Timing
    from blocks.extensions.monitoring import (DataStreamMonitoring,
                                              TrainingDataMonitoring)
    from blocks.extensions.plot import Plot
    from plot import PlotHistogram

    from blocks.algorithms import (GradientDescent, Adam, Scale, StepClipping,
                                   CompositeRule, AdaDelta)
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.main_loop import MainLoop
    from blocks.model import Model

    from cuboid.algorithms import AdaM, NAG
    from cuboid.extensions import EpochProgress

    from fuel.streams import DataStream, ServerDataStream
    from fuel.transformers import Padding
    from fuel.schemes import ShuffledScheme
    from Conv1D import Conv1D, MaxPooling1D
    from schemes import BatchwiseShuffledScheme
    from bricks import WeightedSigmoid, GatedRecurrentFull

    from multiprocessing import Process
    import fuel
    import logging
    from initialization import SumInitialization
    from transformers import DropSources

    global train_p
    global test_p

    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x = x + m.mean() * 0

    dropout_variables = []

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    gloveMapping = Linear(
        input_dim=embedding_size,
        output_dim=c.rnn_input_dim,
        weights_init=Orthogonal(),
        #weights_init=IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="gloveMapping")
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="gloveRec").apply(o)
    dropout_variables.append(o)

    summed_mapped_glove = o.sum(axis=1)  # sum out the sequence dimension

    glove_out = Linear(
        input_dim=c.rnn_input_dim,
        output_dim=1,
        weights_init=IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="mapping_to_output")
    glove_out.initialize()
    deeply_sup_0 = glove_out.apply(summed_mapped_glove)
    deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0)

    input_dim = c.rnn_input_dim
    hidden_dim = c.rnn_dim

    gru = GatedRecurrentFull(
        hidden_dim=hidden_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=SumInitialization(
            [Identity(1.0), IsotropicGaussian(c.wstd)]),
        state_to_reset_init=IsotropicGaussian(c.wstd),
        state_to_update_init=IsotropicGaussian(c.wstd),
        input_to_state_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            biases_init=Constant(0.0)),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-2.0)),
            biases_init=Constant(-1.0)),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-3.0)),
            biases_init=Constant(-2.0)))
    gru.initialize()

    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"

    o = rnn_out[-1]
    #o = rnn_out.mean(axis=1)
    dropout_variables.append(o)

    score_layer = Linear(
        input_dim=hidden_dim,
        output_dim=1,
        weights_init=IsotropicGaussian(std=c.wstd),
        biases_init=Constant(0.),
        name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)
    #probs = deeply_sup_probs

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    #cost_deeply_sup0 = -(y * T.log(deeply_sup_probs)
    #                     + (1 - y) * T.log(1 - deeply_sup_probs)).mean()
    #cost += cost_deeply_sup0 * c.deeply_factor
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            Adam(learning_rate=0.002, beta1=0.1, beta2=0.001),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'cuda0_train': 5557,
        'cuda0_test': 5558,
        'opencl0:0_train': 5557,
        'opencl0:0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 40

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)
        n_examples = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_examples, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_examples,
                                         batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=scheme)

        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(data_stream=glove, mask_sources=('features',))
        padded = DropSources(padded, ['features_mask'])
        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'targets'), port=test_port)

    print "setting up model"
    n_examples = 25000
    print "Batches per epoch", n_examples // batch_size + 1
    monitor_rate = 50

    # ======
    model = Model(cg.outputs[0])
    extensions = []
    extensions.append(
        EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(
        TrainingDataMonitoring(
            [cost, misclassification],
            prefix='train',
            every_n_batches=monitor_rate))
    extensions.append(
        DataStreamMonitoring(
            [cost, misclassification],
            data_stream=test_stream,
            prefix='test',
            after_epoch=True,
            before_first_epoch=False))
    extensions.append(Timing())
    extensions.append(Printing())
    extensions.append(
        Plot(theano.config.device + "_result",
             channels=[['train_cost'], ['train_misclassification']],
             every_n_batches=monitor_rate))

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def main():
    x_raw = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x = x + m.mean() * 0

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    # Vaguely normalize. Kept as a separate variable so the raw input can
    # still be handed to theano.function below (the original rebound `x`,
    # which theano.function rejects as a non-root input).
    x = x_raw / 3.0 - .5

    #gloveMapping = Linear(input_dim=embedding_size, output_dim=128,
    #                      weights_init=Orthogonal(),
    #                      biases_init=Constant(0.0), name="gloveMapping")
    #gloveMapping.initialize()
    #o = gloveMapping.apply(x)
    #o = Rectifier(name="gloveRec").apply(o)
    o = x

    input_dim = 300
    gru = GatedRecurrentFull(
        hidden_dim=input_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=IsotropicGaussian(0.02),
        state_to_reset_init=IsotropicGaussian(0.02),
        state_to_update_init=IsotropicGaussian(0.02),
        input_to_state_transform=Linear(input_dim=input_dim,
                                        output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(input_dim=input_dim,
                                         output_dim=input_dim,
                                         weights_init=IsotropicGaussian(0.02),
                                         biases_init=Constant(0.0)),
        input_to_reset_transform=Linear(input_dim=input_dim,
                                        output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)))
    gru.initialize()

    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"

    o = rnn_out[-1]
    #o = rnn_out.mean(axis=1)

    score_layer = Linear(
        input_dim=300,
        output_dim=1,
        weights_init=IsotropicGaussian(std=wstd),
        biases_init=Constant(0.),
        use_bias=True,
        name="linear_score")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    # Tag parameter names with their owning brick for readable graphs.
    for p in params:
        p.name += "___" + p.tag.annotations[0].name

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))
    #algorithm.initialize()

    print params
    f = theano.function([x_raw, y], algorithm.cost)
    ipdb.set_trace()

    print "making plots"
    #theano.printing.pydotprint(algorithm.cost, outfile='unopt.png')
    theano.printing.pydotprint(f, outfile='opt.png', scan_graphs=True)
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    x = x + m.mean() * 0  # keep the otherwise-unused mask in the graph

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    # vaguely normalize
    x = x / 3.0 - .5

    rnn_in = x.dimshuffle(1, 0, 2)

    # Adapt the tensor to the Passage-style layer interface.
    class Stub(object):
        def output(self, dropout_active=False):
            return rnn_in

    l_in = Stub()
    l_in.size = 300

    layer = GatedRecurrentPassage(size=300, gate_activation='sigmoid')
    layer.connect(l_in)

    from blocks.roles import add_role, WEIGHT, INITIAL_STATE
    print layer.params
    for l in layer.params:
        add_role(l, WEIGHT)

    rnn_out = layer.output()
    o = rnn_out
    #o = rnn_out[-1, :, :]
    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    score_layer = Linear(input_dim=300, output_dim=1,
                         weights_init=IsotropicGaussian(std=wstd),
                         biases_init=Constant(0.), name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters
    print params
    print "Len params", len(params)

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 32

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)
        n_examples = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_examples, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_examples,
                                         batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=scheme)

        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(data_stream=glove, mask_sources=('features',))
        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()
### Identity testing
from blocks.initialization import Identity, IsotropicGaussian
from blocks import bricks
from blocks.bricks import Sigmoid

dim = 2
floatX = theano.config.floatX
x = tensor.tensor3('input')

gru = GatedRecurrentFull(
    hidden_dim=dim,
    state_to_state_init=Identity(1.),
    #state_to_reset_init=Identity(1.),
    state_to_reset_init=IsotropicGaussian(0.2),
    state_to_update_init=Identity(1.0),
    activation=bricks.Identity(1.0),
    gate_activation=Sigmoid(),
    input_to_state_transform=Linear(
        input_dim=dim, output_dim=dim,
        weights_init=Identity(1.0),
        #weights_init=IsotropicGaussian(0.02),
        biases_init=Constant(0.0)),
    input_to_update_transform=Linear(
        input_dim=dim, output_dim=dim,
        #weights_init=Constant(0.0),
        weights_init=IsotropicGaussian(0.02),
        biases_init=Constant(2.0)),
    input_to_reset_transform=Linear(
        input_dim=dim, output_dim=dim,
        # The original snippet breaks off here; the two lines below are an
        # assumed completion mirroring the update transform above.
        weights_init=IsotropicGaussian(0.02),
        biases_init=Constant(0.0)))
n_u = 225        # input vector size (not time at this point)
n_y = 225        # output vector size
n_h = 500        # number of hidden units
iteration = 300  # number of epochs of gradient descent

print "Building Model"

# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim=n_u, output_dim=n_h, name="first_layer")
rnn = SimpleRecurrent(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = rnn.apply(x_transform)
predict = sigm.apply(linear2.apply(h))

# Only for generation: a single step from a given state, B x h_dim
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = rnn.apply(x_transform, h_initial, iterate=False)
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'

# Cost function
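# Hedged continuation: the snippet above breaks off at "# Cost function".
# A squared-error cost in the style of the clockwork example further down
# (which uses SquaredError().apply(predict, target)); the step rule below
# is an assumed minimal completion, not the original code.
cost = SquaredError().apply(predict, target)
cost.name = 'cost'

cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, params=cg.parameters,
                            step_rule=Scale(learning_rate=0.01))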
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    embedding_size = 300
    #glove_version = "vectors.6B.100d.txt"
    glove_version = "glove.6B.300d.txt"

    rnn = LSTM(dim=embedding_size, activation=Tanh(),
               weights_init=IsotropicGaussian(std=0.02))
    rnn.initialize()

    wstd = 0.02

    score_layer = Linear(input_dim=128, output_dim=1,
                         weights_init=IsotropicGaussian(std=wstd),
                         biases_init=Constant(0.), name="linear2")
    score_layer.initialize()

    gloveMapping = Linear(input_dim=embedding_size, output_dim=embedding_size,
                          weights_init=IsotropicGaussian(std=wstd),
                          biases_init=Constant(0.0), name="gloveMapping")
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="rectifyGlove").apply(o)

    # Initialize the LSTM forget-gate biases to 4 so the gates start open.
    forget_bias = np.zeros((embedding_size * 4), dtype=theano.config.floatX)
    forget_bias[embedding_size:embedding_size * 2] = 4.0
    toLSTM = Linear(input_dim=embedding_size, output_dim=embedding_size * 4,
                    weights_init=IsotropicGaussian(std=wstd),
                    biases_init=Constant(forget_bias),
                    #biases_init=Constant(0.0),
                    name="ToLSTM")
    toLSTM.initialize()

    rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m),
                                      mask=m)

    # Mean-pool the states over time, ignoring padded positions.
    #rnn_out = rnn_states[:, -1, :]
    rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) \
        / m.sum(axis=1).dimshuffle(0, 'x')

    hidden = Linear(input_dim=embedding_size, output_dim=128,
                    weights_init=Uniform(std=0.01), biases_init=Constant(0.))
    hidden.initialize()
    o = hidden.apply(rnn_out)
    o = Rectifier().apply(o)

    hidden = Linear(input_dim=128, output_dim=128,
                    weights_init=IsotropicGaussian(std=0.02),
                    biases_init=Constant(0.), name="hiddenmap2")
    hidden.initialize()
    o = hidden.apply(o)
    o = Rectifier(name="rec2").apply(o)

    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            AdaM(),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    train_dataset = IMDBText('train')
    test_dataset = IMDBText('test')
    batch_size = 16

    n_train = train_dataset.num_examples
    train_stream = DataStream(
        dataset=train_dataset,
        iteration_scheme=ShuffledScheme(examples=n_train,
                                        batch_size=batch_size))
    glove = GloveTransformer(glove_version, data_stream=train_stream)
    train_padded = Padding(data_stream=glove, mask_sources=('features',))

    n_test = test_dataset.num_examples  # the original reused n_train here
    test_stream = DataStream(
        dataset=test_dataset,
        iteration_scheme=ShuffledScheme(examples=n_test,
                                        batch_size=batch_size))
    glove = GloveTransformer(glove_version, data_stream=test_stream)
    test_padded = Padding(data_stream=glove, mask_sources=('features',))

    print "setting up model"
    lstm_norm = rnn.W_state.norm(2)
    lstm_norm.name = "lstm_norm"
    pre_norm = gloveMapping.W.norm(2)
    pre_norm.name = "pre_norm"

    # ======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(
        batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification, lstm_norm, pre_norm],
        prefix='train', after_epoch=True))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded, prefix='test', after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())
    extensions.append(Plot("norms",
                           channels=[['train_lstm_norm', 'train_pre_norm']],
                           after_epoch=True))
    extensions.append(Plot("result",
                           channels=[['train_cost',
                                      'train_misclassification']],
                           after_epoch=True))

    main_loop = MainLoop(model=model, data_stream=train_padded,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')
    x_int = x.astype(dtype='int32').T

    train_dataset = IMDB()
    # Sort examples by length so batches contain similarly sized sequences.
    idx_sort = numpy.argsort(
        [len(s) for s in train_dataset.indexables[
            train_dataset.sources.index('features')]])
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 100
    linear_embedding = LookupTable(length=n_voc, dim=4 * n_h,
                                   weights_init=Uniform(std=0.01),
                                   biases_init=Constant(0.))
    linear_embedding.initialize()

    # Intended forget-gate bias initialization; note it is never passed to
    # the LSTM below, which keeps its Constant(0.) biases as written.
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.

    rnn = LSTM(dim=n_h, weights_init=Uniform(std=0.01),
               biases_init=Constant(0.))
    rnn.initialize()

    score_layer = Linear(input_dim=n_h, output_dim=1,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
    # Despite the name, this takes the final hidden state, not a mean pool.
    rnn_out_mean_pooled = rnn_out[0][-1]

    probs = Sigmoid().apply(score_layer.apply(rnn_out_mean_pooled))

    cost = -(y * tensor.log(probs)
             + (1 - y) * tensor.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5)
                         + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(components=[StepClipping(threshold=10.),
                                            Adam()]))

    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train), batch_size=10)),
        mask_sources=('features',))
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train, n_train + n_valid), batch_size=10)),
        mask_sources=('features',))
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(n_train + n_valid,
                               train_dataset.num_examples),
                batch_size=10)),
        mask_sources=('features',))

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], test_data_stream, prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], valid_data_stream, prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))
    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters, after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model, data_stream=train_data_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
def __init__(self):
    srng = MRG_RandomStreams(seed=123)
    X = T.matrix('features')
    self.X = X

    #drop = Dropout(p_drop=0.5)
    #o = drop.apply(X)
    o = X
    self.noisy = o

    #n_hidden = 64
    n_hidden = 128
    n_zs = 2
    self.n_zs = n_zs
    self.n_hidden = n_hidden

    # Encoder: two Tanh layers, then linear heads for mu and log sigma.
    l = Linear(input_dim=28 * 28, output_dim=n_hidden,
               weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    l.initialize()
    o = l.apply(o)
    o = Tanh().apply(o)

    l = Linear(input_dim=n_hidden, output_dim=n_hidden,
               weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    l.initialize()
    o = l.apply(o)
    o = Tanh().apply(o)

    l = Linear(input_dim=n_hidden, output_dim=n_zs,
               weights_init=IsotropicGaussian(0.101), biases_init=Constant(0))
    l.initialize()
    mu_encoder = l.apply(o)

    l = Linear(input_dim=n_hidden, output_dim=n_zs,
               weights_init=IsotropicGaussian(0.1), biases_init=Constant(0))
    l.initialize()
    log_sigma_encoder = l.apply(o)

    # Reparameterization trick: z = mu + eps * sigma.
    eps = srng.normal(log_sigma_encoder.shape)
    z = eps * T.exp(log_sigma_encoder) + mu_encoder

    z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden,
                            weights_init=IsotropicGaussian(0.1),
                            biases_init=Constant(0))
    z_to_h1_decode.initialize()

    h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden,
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0))
    h1_decode_to_h_decode.initialize()

    h_decode_produce = Linear(input_dim=n_hidden, output_dim=28 * 28,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0), name="linear4")
    h_decode_produce.initialize()

    seq = Sequence([
        z_to_h1_decode.apply, Tanh().apply,
        h1_decode_to_h_decode.apply, Tanh().apply,
        h_decode_produce.apply, Sigmoid().apply
    ])
    seq.initialize()
    self.produced = seq.apply(z)

    # Regular old mean-squared reconstruction error.
    self.cost = T.sum(T.sqr(self.produced - X))
    #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, X))
    self.cost.name = "cost"

    # Computed with L = 1, i.e. a single sample of `produced`.
    logpxz = T.sum(-1 * log_sigma_encoder * T.log(2 * np.pi)
                   - T.sqr((self.produced - X) / (2 * T.exp(log_sigma_encoder))))
    self.variational_cost = -0.5 * T.sum(
        1 + 2 * log_sigma_encoder - mu_encoder * mu_encoder
        - T.exp(2 * log_sigma_encoder)) + logpxz
    self.variational_cost.name = "variational_cost"

    self.Z = T.matrix('z')
    self.sampled = seq.apply(self.Z)

    # Give each brick a unique name so parameter names don't collide.
    cg = ComputationGraph([self.variational_cost])
    bricks = [get_brick(var) for var in cg.variables + cg.scan_variables
              if get_brick(var)]
    for i, b in enumerate(bricks):
        b.name = b.name + "_" + str(i)
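# Hedged usage sketch: drawing samples from the decoder defined above. The
# class name `Vae` is assumed (only its __init__ appears here), and
# `sample_fn` is our name; `Z`, `sampled`, and `n_zs` come from __init__.
import numpy as np
import theano

vae = Vae()
sample_fn = theano.function([vae.Z], vae.sampled)
z = np.random.randn(16, vae.n_zs).astype(theano.config.floatX)
digits = sample_fn(z)  # 16 generated samples, flattened 28x28 each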
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    # Bag-of-embeddings baseline: sum the word vectors over the sequence.
    o = x.sum(axis=1) + m.mean() * 0

    score_layer = Linear(input_dim=300, output_dim=1,
                         weights_init=IsotropicGaussian(std=0.02),
                         biases_init=Constant(0.), name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 16

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)
        n_examples = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_examples, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_examples,
                                         batch_size=batch_size)
        stream = DataStream(dataset=dataset, iteration_scheme=scheme)

        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(data_stream=glove, mask_sources=('features',))
        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'),
                                    port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'),
                                   port=test_port)

    print "setting up model"
    n_examples = 25000

    # ======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(
        batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))
    #extensions.append(DataStreamMonitoring(
    #    [cost, misclassification], data_stream=test_stream,
    #    prefix='test', after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())
    extensions.append(Plot(theano.config.device + "_result",
                           channels=[['train_cost']], after_epoch=True))

    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    x = m.mean() + x  # keep the (otherwise unused) mask in the graph

    #embedding_size = 300
    #glove_version = "glove.6B.300d.txt"
    embedding_size = 50
    glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    conv1 = Conv1D(filter_length=5, num_filters=128, input_dim=embedding_size,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0))
    conv1.initialize()
    o = conv1.apply(x)
    o = Rectifier(name="conv1rec").apply(o)
    o = MaxPooling1D(pooling_length=5).apply(o)

    conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0), step=3, name="conv2")
    conv2.initialize()
    o = conv2.apply(o)
    o = Rectifier(name="conv2rec").apply(o)

    conv3 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0), step=3, name="conv3")
    conv3.initialize()
    o = conv3.apply(o)
    o = Rectifier(name="conv3rec").apply(o)

    fork = Fork(weights_init=IsotropicGaussian(0.02),
                biases_init=Constant(0.),
                input_dim=128,
                output_dims=[128] * 3,
                output_names=['inputs', 'reset_inputs', 'update_inputs'])
    fork.initialize()
    inputs, reset_inputs, update_inputs = fork.apply(o)

    # The GRU on top is disabled; mean-pool the conv features instead.
    out = o.mean(axis=1)
    #gru = GatedRecurrent(dim=128,
    #                     weights_init=IsotropicGaussian(0.02),
    #                     biases_init=IsotropicGaussian(0.0))
    #gru.initialize()
    #states = gru.apply(inputs=inputs, reset_inputs=reset_inputs,
    #                   update_inputs=update_inputs)
    #out = states[:, -1, :]

    hidden = Linear(input_dim=128, output_dim=128,
                    weights_init=Uniform(std=0.01), biases_init=Constant(0.))
    hidden.initialize()
    o = hidden.apply(out)
    o = Rectifier().apply(o)

    score_layer = Linear(input_dim=128, output_dim=1,
                         weights_init=IsotropicGaussian(std=wstd),
                         biases_init=Constant(0.), name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            AdaM(),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 16

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set)
        n_examples = dataset.num_examples
        stream = DataStream(
            dataset=dataset,
            iteration_scheme=ShuffledScheme(examples=n_examples,
                                            batch_size=batch_size))

        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(data_stream=glove, mask_sources=('features',))
        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'),
                                    port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'),
                                   port=test_port)

    print "setting up model"
    n_examples = 25000

    # ======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(
        batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], data_stream=test_stream,
        prefix='test', after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())
    #extensions.append(Plot("norms",
    #                       channels=[['train_lstm_norm', 'train_pre_norm']],
    #                       after_epoch=True))
    extensions.append(Plot(
        theano.config.device + "_result",
        channels=[['test_misclassification', 'train_misclassification']],
        after_epoch=True))

    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
def __init__(self):
    srng = MRG_RandomStreams(seed=123)
    X = T.matrix('features')
    self.X = X

    #drop = Dropout(p_drop=0.5)
    #o = drop.apply(X)
    o = (X - 128) / 128.0  # scale pixel values to roughly [-1, 1]
    self.scaled = o

    #n_hidden = 64
    n_hidden = 2048 * 2
    n_zs = 1024
    self.n_zs = n_zs
    self.n_hidden = n_hidden

    l = Linear(input_dim=32 * 32 * 3, output_dim=n_hidden,
               weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    l.initialize()
    o = l.apply(o)
    o = Rectifier().apply(o)

    l = Linear(input_dim=n_hidden, output_dim=n_hidden,
               weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    l.initialize()
    o = l.apply(o)
    o = Rectifier().apply(o)

    l = Linear(input_dim=n_hidden, output_dim=n_zs,
               weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    l.initialize()
    mu_encoder = l.apply(o)

    l = Linear(input_dim=n_hidden, output_dim=n_zs,
               weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
    l.initialize()
    log_sigma_encoder = l.apply(o)

    # Reparameterization trick: z = mu + eps * sigma.
    eps = srng.normal(log_sigma_encoder.shape)
    z = eps * T.exp(log_sigma_encoder) + mu_encoder

    z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0))
    z_to_h1_decode.initialize()

    h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden,
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0))
    h1_decode_to_h_decode.initialize()

    # (The original defined this layer twice, verbatim; once is enough.)
    h_decode_produce = Linear(input_dim=n_hidden, output_dim=32 * 32 * 3,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0), name="linear4")
    h_decode_produce.initialize()

    seq = Sequence([
        z_to_h1_decode.apply, Rectifier().apply,
        h1_decode_to_h_decode.apply, Rectifier().apply,
        h_decode_produce.apply, Sigmoid().apply
    ])
    seq.initialize()
    self.produced = seq.apply(z)

    self.cost = T.mean(T.sqr(self.produced - self.scaled))
    #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, self.scaled))
    self.cost.name = "cost"

    self.variational_cost = -0.5 * T.mean(
        1 + 2 * log_sigma_encoder - mu_encoder * mu_encoder
        - T.exp(2 * log_sigma_encoder)) + self.cost
    self.variational_cost.name = "variational_cost"

    self.Z = T.matrix('z')
    self.sampled = seq.apply(self.Z)

    # Give each brick a unique name so parameter names don't collide.
    cg = ComputationGraph([self.variational_cost])
    bricks = [get_brick(var) for var in cg.variables + cg.scan_variables
              if get_brick(var)]
    for i, b in enumerate(bricks):
        b.name = b.name + "_" + str(i)
one_time = tensor.wscalar('one_time')
h_initial = tensor.matrix('h_initial', dtype=floatX)

# Build the model
clockwork = ClockWork(input_dim=n_u, module=module, periods=periods,
                      unit=unit, activation=Sigmoid(), name="clockwork_rnn")
linear = Linear(input_dim=unit * module, output_dim=n_y, name="output_layer")

h = clockwork.apply(x, time)
predict = Sigmoid().apply(linear.apply(h))

# Only for generation: a single step from a given state, B x h_dim
h_testing = clockwork.apply(one_x, one_time, h_initial, iterate=False)
y_hat_testing = Sigmoid().apply(linear.apply(h_testing))
y_hat_testing.name = 'y_hat_testing'

# Cost function
cost = SquaredError().apply(predict, target)

# Initialization
for brick in (clockwork, linear):
    brick.weights_init = initialization.IsotropicGaussian(0.1)
    brick.biases_init = initialization.Constant(0)
    brick.initialize()

cg = ComputationGraph(cost)
print(VariableFilter(roles=[WEIGHT, BIAS])(cg.variables))

# Training process
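# Hedged continuation: the snippet stops at "# Training process". A minimal
# loop in the style of the other examples in this file; `train_stream` and
# the step sizes below are assumptions, not the original code.
algorithm = GradientDescent(
    cost=cost, params=cg.parameters,
    step_rule=CompositeRule([StepClipping(threshold=10.),
                             Scale(learning_rate=0.01)]))
main_loop = MainLoop(model=Model(cost), data_stream=train_stream,
                     algorithm=algorithm,
                     extensions=[TrainingDataMonitoring([cost],
                                                        prefix='train',
                                                        after_epoch=True),
                                 Printing()])
main_loop.run()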
def main():
    x = T.imatrix('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x_int = x.astype(dtype='int32').T
    x_int = x.T

    train_dataset = IMDB('train')
    n_voc = len(train_dataset.dict.keys())
    n_h = 2

    lookup = LookupTable(length=n_voc + 2, dim=n_h * 4,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    lookup.initialize()

    #rnn = SimpleRecurrent(dim=n_h, activation=Tanh(),
    #                      weights_init=Uniform(std=0.01),
    #                      biases_init=Constant(0.))
    rnn = LSTM(dim=n_h, activation=Tanh(),
               weights_init=Uniform(std=0.01),
               biases_init=Constant(0.))
    rnn.initialize()

    score_layer = Linear(input_dim=n_h, output_dim=1,
                         weights_init=Uniform(std=0.01),
                         biases_init=Constant(0.))
    score_layer.initialize()

    embedding = lookup.apply(x_int) * T.shape_padright(m.T)
    rnn_states = rnn.apply(embedding, mask=m.T)
    #rnn_states, rnn_cells = rnn.apply(embedding)
    # Take the final hidden state (not a mean pool, despite the name).
    rnn_out_mean_pooled = rnn_states[-1]
    #rnn_out_mean_pooled = rnn_states.mean()

    probs = Sigmoid().apply(score_layer.apply(rnn_out_mean_pooled))

    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            Adam(),
            #AdaDelta(),
        ]))

    # ========
    test_dataset = IMDB('test')
    batch_size = 64

    n_train = train_dataset.num_examples
    train_stream = DataStream(
        dataset=train_dataset,
        iteration_scheme=ShuffledScheme(examples=n_train,
                                        batch_size=batch_size))
    train_padded = Padding(data_stream=train_stream,
                           mask_sources=('features',))

    n_test = test_dataset.num_examples  # the original reused n_train here
    test_stream = DataStream(
        dataset=test_dataset,
        iteration_scheme=ShuffledScheme(examples=n_test,
                                        batch_size=batch_size))
    test_padded = Padding(data_stream=test_stream,
                          mask_sources=('features',))

    # ======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(
        batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], data_stream=test_padded,
        prefix='test', after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())

    main_loop = MainLoop(model=model, data_stream=train_padded,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()