def __init__(self, batch_size, output_length, visual_dim, word_dim,
             visual_feature_dim, question_feature_dim, joint_dim,
             memory_dim, output_dim, fc1_dim, fc2_dim, voc_size):
    # the video encoder
    self.video_encoder = visualEncoder(visual_dim, visual_feature_dim)
    self.sentence_encoder = questionEncoder(word_dim, question_feature_dim)
    self.toJoint = embeddingLayer(2 * question_feature_dim,
                                  2 * visual_feature_dim, joint_dim)
    self.rewatcher = videoAttentionLayer(joint_dim, memory_dim, output_dim)
    self.seq_gen = seqDecoder(joint_dim, output_dim, fc1_dim, fc2_dim)
    self.softmax_layer = Softmax()
    self.bs = batch_size
    self.output_length = output_length
    self.voc_size = voc_size
def __init__(self, config, **kwargs):
    super(Model, self).__init__(config, **kwargs)
    self.dest_mlp = MLP(
        activations=[Rectifier() for _ in config.dim_hidden_dest] + [Softmax()],
        dims=[config.dim_hidden[-1]] + config.dim_hidden_dest + [config.dim_output_dest],
        name='dest_mlp')
    self.time_mlp = MLP(
        activations=[Rectifier() for _ in config.dim_hidden_time] + [Softmax()],
        dims=[config.dim_hidden[-1]] + config.dim_hidden_time + [config.dim_output_time],
        name='time_mlp')
    self.dest_classes = theano.shared(
        numpy.array(config.dest_tgtcls, dtype=theano.config.floatX),
        name='dest_classes')
    self.time_classes = theano.shared(
        numpy.array(config.time_tgtcls, dtype=theano.config.floatX),
        name='time_classes')
    self.inputs.append('input_time')
    self.children.extend([self.dest_mlp, self.time_mlp])
def build_training(lr=0.002, model=None):
    x = T.tensor4('x')
    y = T.imatrix()
    if model is None:
        model = build_model()
    y_prev = model.apply(x)
    y_softmax = Softmax().apply(y_prev)

    ##### prediction #####
    # numerically stable cross-entropy computed from the pre-softmax scores
    # cost = CategoricalCrossEntropy().apply(y.flatten(), y_prev).mean()
    cost = Softmax().categorical_cross_entropy(y.flatten(), y_prev).mean()
    error = MisclassificationRate().apply(y.flatten(), y_softmax).mean()

    W, B = get_Params(y_prev)
    params = W + B
    regularizer_full = sum([w.norm(2) for w in W[0:2]])
    regularizer_conv = sum([w.norm(2) for w in W[2:]])
    # optional weight decay, e.g.: cost = cost + 0.01 * regularizer_conv

    updates, updates_init = RMSProp(cost, params, lr)
    # alternatives: Adam(cost, params, lr) or Sgd(cost, params, lr)

    train_function = theano.function([x, y], cost, updates=updates,
                                     allow_input_downcast=True)
    valid_function = theano.function([x, y], cost, allow_input_downcast=True)
    test_function = theano.function([x, y], error, allow_input_downcast=True)
    reinit = theano.function([], T.zeros((1,)), updates=updates_init)
    observation = theano.function([], [w.norm(2) for w in W])

    return train_function, valid_function, test_function, model, reinit
def __init__(self, feature_dim, hidden_dim, output_dim):
    self.image_embed = Linear(input_dim=feature_dim, output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0), use_bias=False,
                              name='image_embed')
    self.word_embed = Linear(input_dim=feature_dim, output_dim=hidden_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0), use_bias=False,
                             name='word_embed')
    self.r_embed = Linear(input_dim=feature_dim, output_dim=hidden_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0), use_bias=False,
                          name='r_embed')
    self.m_to_s = Linear(input_dim=hidden_dim, output_dim=1,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0), use_bias=False,
                         name='m_to_s')
    self.attention_dist = Softmax(name='attention_dist_softmax')
    self.r_to_r = Linear(input_dim=feature_dim, output_dim=feature_dim,
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0), use_bias=False,
                         name='r_to_r')
    # self.r_to_g = Linear(input_dim=feature_dim, output_dim=output_dim,
    #                      weights_init=IsotropicGaussian(0.01),
    #                      biases_init=Constant(0), use_bias=False,
    #                      name='r_to_g')
    self.image_embed.initialize()
    self.word_embed.initialize()
    self.r_embed.initialize()
    self.m_to_s.initialize()
    self.r_to_r.initialize()
    # self.r_to_g.initialize()

    # the sequence to sequence LSTM
    self.seq = LSTM(output_dim, name='rewatcher_seq',
                    weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0))
    self.seq_embed = Linear(feature_dim, output_dim * 4,
                            name='rewatcher_seq_embed',
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0), use_bias=False)
    self.seq.initialize()
    self.seq_embed.initialize()
def maxout_vae_mnist_test(path_vae_mnist):
    # load vae model on mnist
    vae_mnist = load(path_vae_mnist)
    maxout = Maxout()
    x = T.matrix('features')
    y = T.imatrix('targets')
    batch_size = 128
    z, _ = vae_mnist.sampler.sample(vae_mnist.encoder_mlp.apply(x))
    predict = maxout.apply(z)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    y_hat = Softmax().apply(predict)
    cost.name = 'cost'
    cg = ComputationGraph(cost)

    temp = cg.parameters
    for t, i in zip(temp, range(len(temp))):
        t.name = t.name + str(i) + "maxout"

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y, y_hat)

    # training
    step_rule = RMSProp(0.01, 0.9)
    # step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST("test")

    data_stream_train = Flatten(DataStream.default_stream(
        train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))
    data_stream_test = Flatten(DataStream.default_stream(
        test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], data_stream=data_stream_train, prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test, prefix="test")

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=50),
                  Printing(every_n_epochs=1)]

    main_loop = MainLoop(data_stream=data_stream_train,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()

    # save here
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(maxout, f)
def __init__(self, config, **kwargs):
    super(Model, self).__init__(config, output_dim=config.tgtcls.shape[0], **kwargs)
    self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX),
                                 name='classes')
    self.softmax = Softmax()
    self.children.append(self.softmax)
def onestepContextAttn(hContextAttn):
    preContextatt = attentionmlpContext.apply(hContextAttn)
    attContextsoft = Softmax()
    attContextpyx = attContextsoft.apply(preContextatt.flatten())
    attContextpred = attContextpyx.flatten()
    attcontext = T.mul(hContextAttn.dimshuffle(1, 0), attContextpred).dimshuffle(1, 0)
    return attcontext
def onestepEncAttn(hEncAttn):
    preEncattn = attentionmlpEnc.apply(hEncAttn)
    attEncsoft = Softmax()
    attEncpyx = attEncsoft.apply(preEncattn.flatten())
    attEncpred = attEncpyx.flatten()
    attenc = T.mul(hEncAttn.dimshuffle(1, 0), attEncpred).dimshuffle(1, 0)
    return attenc
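Both one-step helpers above follow the same soft-attention pattern: score each hidden state with a small MLP, normalize the scores with a softmax, and reweight the states by the resulting distribution. A minimal NumPy sketch of that computation (the names and the toy scoring function are illustrative, not taken from the snippets above):

import numpy as np

def attention_step(h, score_fn):
    """h: (timesteps, dim) hidden states; score_fn maps h to (timesteps,) scores."""
    scores = score_fn(h)                           # unnormalized attention scores
    scores = scores - scores.max()                 # stabilize the exponentials
    alpha = np.exp(scores) / np.exp(scores).sum()  # softmax over timesteps
    return h * alpha[:, None]                      # each state scaled by its weight

h = np.random.randn(5, 8)
weighted = attention_step(h, lambda states: states.sum(axis=1))
print(weighted.shape)  # (5, 8)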
def get_config(config):
    config1 = {}
    if config == '5layers':
        config1['num_epochs'] = 150
        config1['num_channels'] = 3
        config1['image_shape'] = (192, 192)
        config1['filter_size'] = [(5, 5), (5, 5), (5, 5), (5, 5), (5, 5)]
        config1['num_filter'] = [32, 48, 64, 128, 256]
        config1['pooling_sizes'] = [(2, 2), (2, 2), (2, 2), (2, 2), (2, 2)]
        config1['mlp_hiddens'] = [1000, 100]
        config1['output_size'] = 2
        config1['batch_size'] = 16
        config1['activation'] = [Rectifier() for _ in config1['num_filter']]
        config1['mlp_activation'] = [Rectifier().apply for _ in config1['mlp_hiddens']] + [Softmax().apply]
        config1['num_batches'] = None
    elif config == '4layers':
        config1['num_epochs'] = 100
        config1['num_channels'] = 3
        config1['image_shape'] = (160, 160)
        config1['filter_size'] = [(5, 5), (5, 5), (5, 5), (5, 5)]
        config1['num_filter'] = [32, 64, 128, 128]
        config1['pooling_sizes'] = [(2, 2), (2, 2), (2, 2), (2, 2)]
        config1['mlp_hiddens'] = [1000, 100]
        config1['output_size'] = 2
        config1['batch_size'] = 32
        config1['activation'] = [Rectifier() for _ in config1['num_filter']]
        config1['mlp_activation'] = [Rectifier().apply for _ in config1['mlp_hiddens']] + [Softmax().apply]
        config1['num_batches'] = None
    else:
        config1['num_epochs'] = 100
        config1['num_channels'] = 3
        config1['image_shape'] = (128, 128)
        config1['filter_size'] = [(5, 5), (5, 5), (5, 5)]
        config1['num_filter'] = [20, 50, 80]
        config1['pooling_sizes'] = [(2, 2), (2, 2), (2, 2)]
        config1['mlp_hiddens'] = [1000]
        config1['output_size'] = 2
        config1['batch_size'] = 64
        config1['activation'] = [Rectifier() for _ in config1['num_filter']]
        config1['mlp_activation'] = [Rectifier().apply for _ in config1['mlp_hiddens']] + [Softmax().apply]
        config1['num_batches'] = 11000
        if config == 'test':
            print("Test run...")
            config1['test'] = True
        else:
            print("Using default config..")
    return config1
def __init__(self, config, **kwargs):
    super(Model, self).__init__(config, rec_input_len=4,
                                output_dim=config.tgtcls.shape[0], **kwargs)
    self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX),
                                 name='classes')
    self.softmax = Softmax()
    self.sequences.extend(['latitude_lag', 'longitude_lag'])
    self.children.append(self.softmax)
def __init__(self, hidden_dim, n_classes, **kwargs):
    super(SingleSoftmax, self).__init__(**kwargs)

    self.hidden_dim = hidden_dim
    self.n_classes = n_classes

    self.mlp = MLP(activations=[Rectifier(), Softmax()],
                   dims=[hidden_dim, hidden_dim / 2, self.n_classes],
                   weights_init=Orthogonal(),
                   biases_init=Constant(0))
    self.softmax = Softmax()

    self.children = [self.mlp, self.softmax]
def __init__(self, config, prefix_encoder, candidate_encoder, **kwargs):
    super(MemoryNetworkBase, self).__init__(**kwargs)

    self.prefix_encoder = prefix_encoder
    self.candidate_encoder = candidate_encoder
    self.config = config

    self.softmax = Softmax()
    self.children = [self.softmax, prefix_encoder, candidate_encoder]

    self.inputs = self.prefix_encoder.apply.inputs \
        + ['candidate_%s' % x for x in self.candidate_encoder.apply.inputs] \
        + ['candidate_destination_latitude', 'candidate_destination_longitude']
def build_pretrain_model(self, data_dict, hyper_params):
    """
    pretrain-method specific; constructs an SCE net;
    works with any network structure of the pipeline
    :param data_dict:
    :param hyper_params:
    :return:
    """
    from theano import tensor
    from blocks.model import Model

    # Note: this has to match the sources defined in the dataset
    indices = [tensor.ivector('{}_indices'.format(i)) for i in range(3)]

    pipeline = self.encoder_pipeline_factory.build_pipeline(
        input_shape=data_dict.get_value().shape, params=hyper_params)

    # compute feature representation
    rep = [pipeline.apply(data_dict[indices[i]]) for i in range(3)]

    # flatten representations
    rep = [r.flatten(ndim=2) for r in rep]

    # compute similarities
    rval = []
    for i in range(1, 3):
        r = (rep[0] * rep[i]).sum(axis=1)  # element-wise multiplication and row sum
        r = tensor.reshape(r, (r.shape[0], 1))
        rval.append(r)
    rval = tensor.concatenate(rval, axis=1)

    # optional softmax layer (normalization to sum = 1)
    if 'apply_softmax' in hyper_params and hyper_params['apply_softmax']:  # default=False
        from blocks.bricks import Softmax
        rval = Softmax().apply(rval)

    # optional argmax (int output instead of scores)
    if 'return_probs' in hyper_params and hyper_params['return_probs'] is False:  # default=True
        rval = rval.argmax(axis=1)

    return Model(rval)
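The similarity block above reduces to dotting one anchor representation against two candidates and optionally softmax-normalizing the pair of scores. A NumPy sketch of just that piece, with illustrative shapes:

import numpy as np

anchor = np.random.randn(4, 16)   # batch of anchor representations
cand_a = np.random.randn(4, 16)   # first candidate batch
cand_b = np.random.randn(4, 16)   # second candidate batch

scores = np.stack([(anchor * cand_a).sum(axis=1),
                   (anchor * cand_b).sum(axis=1)], axis=1)   # shape (4, 2)

# optional softmax so each row sums to 1
scores = scores - scores.max(axis=1, keepdims=True)
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
print(probs.sum(axis=1))  # -> [1. 1. 1. 1.]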
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus(x).apply(x).eval({x: x_val}), rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax(x).apply(x).eval({x: x_val}).flatten(), rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic(x).apply(x).eval({x: x_val}), rtol=1e-6)

    leaky_out_1 = x_val - 0.5
    leaky_out_1[leaky_out_1 < 0] *= 0.01
    assert_allclose(leaky_out_1,
                    LeakyRectifier().apply(x).eval({x: x_val - 0.5}), rtol=1e-5)

    leaky_out_2 = x_val - 0.5
    leaky_out_2[leaky_out_2 < 0] *= 0.05
    assert_allclose(leaky_out_2,
                    LeakyRectifier(leak=0.05).apply(x).eval({x: x_val - 0.5}),
                    rtol=1e-5)
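The Softmax assertion in that test checks the standard normalized-exponential definition. The same check in plain NumPy, for reference:

import numpy as np

x_val = np.random.rand(8).astype('float32')
softmax = np.exp(x_val) / np.exp(x_val).sum()

assert np.allclose(softmax.sum(), 1.0)   # a softmax output sums to one
assert (softmax > 0).all()               # and every entry is positive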
def setup_model():
    # shape: T x B x F
    input_ = T.tensor3('features')
    # shape: B
    target = T.lvector('targets')
    model = LSTMAttention(input_dim=10000,
                          dim=500,
                          mlp_hidden_dims=[2000, 500, 4],
                          batch_size=100,
                          image_shape=(100, 100),
                          patch_shape=(28, 28),
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
    model.initialize()
    h, c = model.apply(input_)
    classifier = MLP([Rectifier(), Softmax()], [500, 100, 10],
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0))
    classifier.initialize()

    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    error_rate = MisclassificationRate().apply(target, probabilities)

    return cost, error_rate
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[['test_final_cost',
                       'test_misclassificationrate_apply_error_rate'],
                      ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()
def build_model(images, labels):
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3, 3), 64, (150, 150))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    top_mlp = MLP([LeakyRectifier(name='non_linear_9'),
                   LeakyRectifier(name='non_linear_10'),
                   Softmax(name='non_linear_11')],
                  [conv_out_dim, 2048, 612, 10],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(1))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply,
                                  flattener.apply,
                                  top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    return cost
def __init__(self, config):
    self.X = T.tensor4("features")

    c = config

    seq = BrickSequence(input_dim=(3, 32, 32), bricks=[
        conv3(c['n_l1']),
        conv3(c['n_l2']),
        max_pool(),
        conv3(c['n_l3']),
        conv3(c['n_l4']),
        max_pool(),
        # conv3(10),
        # conv3(10),
        Flattener(),
        linear(c['n_l5']),
        Softmax()
    ])

    seq.initialize()

    self.pred = seq.apply(self.X)
    self.Y = T.imatrix("targets")

    self.cost = CategoricalCrossEntropy().apply(self.Y.flatten(), self.pred)
    self.cost.name = "cost"

    self.accur = 1.0 - MisclassificationRate().apply(self.Y.flatten(), self.pred)
    self.accur.name = "accur"
class SingleSoftmax(Initializable):
    def __init__(self, hidden_dim, n_classes, **kwargs):
        super(SingleSoftmax, self).__init__(**kwargs)

        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        self.mlp = MLP(activations=[Rectifier(), Softmax()],
                       dims=[hidden_dim, hidden_dim / 2, self.n_classes],
                       weights_init=Orthogonal(),
                       biases_init=Constant(0))
        self.softmax = Softmax()

        self.children = [self.mlp, self.softmax]

    # some day: @application(...)
    # def feedback(self, h)

    @application(inputs=['cs', 'y'], outputs=['cost'])
    def cost(self, cs, y, n_patches):
        energies = [self.mlp.apply(cs[:, t, :])
                    for t in xrange(n_patches)]
        cross_entropies = [self.softmax.categorical_cross_entropy(y.flatten(), energy)
                           for energy in energies]
        error_rates = [T.neq(y, energy.argmax(axis=1)).mean(axis=0)
                       for energy in energies]
        # train on final prediction
        cost = util.named(cross_entropies[-1], "cost")
        # monitor final prediction
        self.add_auxiliary_variable(cross_entropies[-1], name="cross_entropy")
        self.add_auxiliary_variable(error_rates[-1], name="error_rate")
        return cost
def construct_model(input_dim, output_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[input_dim] + hidden_dims + [2])

    weights = mlp.apply(r)
    final = tensor.dot(x, weights)

    cost = Softmax().categorical_cross_entropy(y, final).mean()

    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in [mlp]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply noise
    cg = ComputationGraph([cost, error_rate])
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    apply_noise(cg, noise_vars, noise_std)
    [cost, error_rate] = cg.outputs

    return cost, error_rate
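Several snippets in this listing pass raw scores rather than probabilities to Softmax().categorical_cross_entropy, which folds the softmax into the loss for numerical stability. A small NumPy sketch of the identity being relied on:

import numpy as np

scores = np.array([2.0, 0.5, -1.0])   # pre-softmax scores for one example
target = 0                            # index of the correct class

# cross-entropy computed from explicit probabilities ...
probs = np.exp(scores) / np.exp(scores).sum()
nll_from_probs = -np.log(probs[target])

# ... equals the log-sum-exp form used by a fused softmax cross-entropy
nll_fused = np.log(np.exp(scores).sum()) - scores[target]

assert np.isclose(nll_from_probs, nll_fused)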
def create_lenet_5():
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #    feature_maps = [20, 50]
    #    mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]

    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))

    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()

    return convnet
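One caveat if this snippet were run under Python 3 (an assumption; the surrounding code is written in Python 2 style): zip() returns a lazy iterator there, so the size pairings are usually materialized explicitly before being handed to the network constructor:

conv_sizes = [5, 5]
pool_sizes = [2, 2]
filter_sizes = list(zip(conv_sizes, conv_sizes))    # [(5, 5), (5, 5)]
pooling_sizes = list(zip(pool_sizes, pool_sizes))   # [(2, 2), (2, 2)]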
def __init__(self, **kwargs): conv_layers = [ Convolutional(filter_size=(3, 3), num_filters=64, border_mode=(1, 1), name='conv_1'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=64, border_mode=(1, 1), name='conv_2'), Rectifier(), MaxPooling((2, 2), step=(2, 2), name='pool_2'), Convolutional(filter_size=(3, 3), num_filters=128, border_mode=(1, 1), name='conv_3'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=128, border_mode=(1, 1), name='conv_4'), Rectifier(), MaxPooling((2, 2), step=(2, 2), name='pool_4'), Convolutional(filter_size=(3, 3), num_filters=256, border_mode=(1, 1), name='conv_5'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=256, border_mode=(1, 1), name='conv_6'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=256, border_mode=(1, 1), name='conv_7'), Rectifier(), MaxPooling((2, 2), step=(2, 2), name='pool_7'), Convolutional(filter_size=(3, 3), num_filters=512, border_mode=(1, 1), name='conv_8'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=512, border_mode=(1, 1), name='conv_9'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=512, border_mode=(1, 1), name='conv_10'), Rectifier(), MaxPooling((2, 2), step=(2, 2), name='pool_10'), Convolutional(filter_size=(3, 3), num_filters=512, border_mode=(1, 1), name='conv_11'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=512, border_mode=(1, 1), name='conv_12'), Rectifier(), Convolutional(filter_size=(3, 3), num_filters=512, border_mode=(1, 1), name='conv_13'), Rectifier(), MaxPooling((2, 2), step=(2, 2), name='pool_13'), ] mlp = MLP([Rectifier(name='fc_14'), Rectifier('fc_15'), Softmax()], [25088, 4096, 4096, 1000], ) conv_sequence = ConvolutionalSequence( conv_layers, 3, image_size=(224, 224)) super(VGGNet, self).__init__( [conv_sequence.apply, Flattener().apply, mlp.apply], **kwargs)
def __init__(self, image_shape=None, output_size=None, noise_batch_size=None, noise_without_rectifier=False, noise_after_rectifier=False, **kwargs): self.num_channels = 3 self.image_shape = image_shape or (32, 32) self.output_size = output_size or 10 self.noise_batch_size = noise_batch_size conv_parameters = [(96, 3, 1, 'half', Convolutional), (96, 3, 1, 'half', Convolutional), (96, 3, 2, 'half', NoisyConvolutional), (192, 3, 1, 'half', Convolutional), (192, 3, 1, 'half', Convolutional), (192, 3, 2, 'half', NoisyConvolutional), (192, 3, 1, 'half', Convolutional), (192, 1, 1, 'valid', Convolutional), (10, 1, 1, 'valid', Convolutional)] fc_layer = 10 self.convolutions = [] layers = [] for i, (num_filters, filter_size, conv_step, border_mode, cls) in enumerate(conv_parameters): if cls == NoisyConvolutional and noise_after_rectifier: cls = NoisyConvolutional2 layer = cls(filter_size=(filter_size, filter_size), num_filters=num_filters, step=(conv_step, conv_step), border_mode=border_mode, tied_biases=True, name='conv_{}'.format(i)) if cls == NoisyConvolutional or cls == NoisyConvolutional2: layer.noise_batch_size = self.noise_batch_size self.convolutions.append(layer) layers.append(layer) if cls != NoisyConvolutional2 and not noise_without_rectifier: layers.append(Rectifier()) self.conv_sequence = ConvolutionalSequence(layers, self.num_channels, image_size=self.image_shape) # The AllConvNet applies average pooling to combine top-level # features across the image. self.flattener = GlobalAverageFlattener() # Then it inserts one final 10-way FC layer before softmax # self.top_mlp = MLP([Rectifier(), Softmax()], # [conv_parameters[-1][0], fc_layer, self.output_size]) self.top_softmax = Softmax() application_methods = [ self.conv_sequence.apply, self.flattener.apply, self.top_softmax.apply ] super(NoisyAllConvNet, self).__init__(application_methods, **kwargs)
def __init__(self, hidden_dim, n_classes, batch_normalize, **kwargs):
    super(SingleSoftmax, self).__init__(**kwargs)

    self.hidden_dim = hidden_dim
    self.n_classes = n_classes

    self.mlp = masonry.construct_mlp(
        activations=[None, Identity()],
        input_dim=hidden_dim,
        hidden_dims=[hidden_dim / 2, self.n_classes],
        batch_normalize=batch_normalize,
        weights_init=Orthogonal(),
        biases_init=Constant(0))
    self.softmax = Softmax()

    self.children = [self.mlp, self.softmax]
class Model(RNN):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(config, rec_input_len=4,
                                    output_dim=config.tgtcls.shape[0], **kwargs)
        self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX),
                                     name='classes')
        self.softmax = Softmax()
        self.sequences.extend(['latitude_lag', 'longitude_lag'])
        self.children.append(self.softmax)

    def before_predict_all(self, kwargs):
        super(Model, self).before_predict_all(kwargs)
        kwargs['latitude_lag'] = tensor.extra_ops.repeat(kwargs['latitude'], 2, axis=0)
        kwargs['longitude_lag'] = tensor.extra_ops.repeat(kwargs['longitude'], 2, axis=0)

    def process_rto(self, rto):
        return tensor.dot(self.softmax.apply(rto), self.classes)

    def rec_input(self, latitude, longitude, latitude_lag, longitude_lag, **kwargs):
        return (tensor.shape_padright(latitude),
                tensor.shape_padright(longitude),
                tensor.shape_padright(latitude_lag),
                tensor.shape_padright(longitude_lag))
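process_rto above turns raw class scores into a destination estimate: softmax the scores into a distribution over target clusters, then take the expectation over the cluster centroids stored in self.classes. A NumPy sketch of that reduction (the centroid values and shapes are illustrative only):

import numpy as np

centroids = np.array([[41.15, -8.61],    # one (lat, lon) centroid per class
                      [41.20, -8.65],
                      [41.10, -8.58]])
scores = np.array([[0.2, 1.5, -0.3]])    # pre-softmax scores for one trip

probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
destination = probs.dot(centroids)       # expected (lat, lon) under the distribution
print(destination)                       # shape (1, 2)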
class MemoryNetworkBase(Initializable):
    def __init__(self, config, prefix_encoder, candidate_encoder, **kwargs):
        super(MemoryNetworkBase, self).__init__(**kwargs)

        self.prefix_encoder = prefix_encoder
        self.candidate_encoder = candidate_encoder
        self.config = config

        self.softmax = Softmax()
        self.children = [self.softmax, prefix_encoder, candidate_encoder]

        self.inputs = self.prefix_encoder.apply.inputs \
            + ['candidate_%s' % x for x in self.candidate_encoder.apply.inputs] \
            + ['candidate_destination_latitude', 'candidate_destination_longitude']

    def candidate_destination(self, **kwargs):
        return tensor.concatenate(
            (tensor.shape_padright(kwargs['candidate_destination_latitude']),
             tensor.shape_padright(kwargs['candidate_destination_longitude'])),
            axis=1)

    @application(outputs=['cost'])
    def cost(self, **kwargs):
        y_hat = self.predict(**kwargs)
        y = tensor.concatenate((kwargs['destination_latitude'][:, None],
                                kwargs['destination_longitude'][:, None]), axis=1)
        return error.erdist(y_hat, y).mean()

    @application(outputs=['destination'])
    def predict(self, **kwargs):
        prefix_representation = self.prefix_encoder.apply(
            **{x: kwargs[x] for x in self.prefix_encoder.apply.inputs})
        candidate_representation = self.candidate_encoder.apply(
            **{x: kwargs['candidate_' + x] for x in self.candidate_encoder.apply.inputs})

        if self.config.normalize_representation:
            prefix_representation = prefix_representation \
                / tensor.sqrt((prefix_representation ** 2).sum(axis=1, keepdims=True))
            candidate_representation = candidate_representation \
                / tensor.sqrt((candidate_representation ** 2).sum(axis=1, keepdims=True))

        similarity_score = tensor.dot(prefix_representation, candidate_representation.T)
        similarity = self.softmax.apply(similarity_score)

        return tensor.dot(similarity, self.candidate_destination(**kwargs))

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']
def build_model(images, labels):
    # Construct a bottom convolutional sequence
    bottom_conv_sequence = convolutional_sequence((3, 3), 16, (160, 160))
    bottom_conv_sequence._push_allocation_config()

    # Flatten layer
    flattener = Flattener()

    # Construct a top MLP
    conv_out_dim = numpy.prod(bottom_conv_sequence.get_dim('output'))
    # top_mlp = MLP([Rectifier(name='non_linear_9'), Softmax(name='non_linear_11')],
    #               [conv_out_dim, 1024, 10],
    #               weights_init=IsotropicGaussian(), biases_init=Constant(0))
    top_mlp = BatchNormalizedMLP([Rectifier(name='non_linear_9'),
                                  Softmax(name='non_linear_11')],
                                 [conv_out_dim, 1024, 10],
                                 weights_init=IsotropicGaussian(),
                                 biases_init=Constant(0))

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([bottom_conv_sequence.apply,
                                  flattener.apply,
                                  top_mlp.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost_noreg = CategoricalCrossEntropy().apply(labels.flatten(), prediction)

    # add regularization
    selector = Selector([top_mlp])
    Ws = selector.get_parameters('W')
    mlp_brick_name = 'batchnormalizedmlp'
    W0 = Ws['/%s/linear_0.W' % mlp_brick_name]
    W1 = Ws['/%s/linear_1.W' % mlp_brick_name]
    cost = cost_noreg + .01 * (W0 ** 2).mean() + .01 * (W1 ** 2).mean()

    return cost
def training(repo, learning_rate, batch_size, filenames):

    print 'LOAD DATA'
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = load_datasets_mnist(repo, filenames)

    print 'BUILD MODEL'
    train_f, valid_f, test_f, model, fisher, params = build_training()

    x_train = x_train[:1000]
    y_train = y_train[:1000]

    x = T.tensor4()
    y = T.imatrix()
    output = model.apply(x)
    output = output.reshape((x.shape[0], model.get_dim('output')))  # TODO: get_dim('name') for Architecture
    cost = Softmax().categorical_cross_entropy(y.flatten(), output).mean()
    cg = ComputationGraph(cost)
    inputs_conv = VariableFilter(roles=[INPUT], bricks=[Convolutional])(cg)
    outputs_conv = VariableFilter(roles=[OUTPUT], bricks=[Convolutional])(cg)
    inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg)
    outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg)
    dico = OrderedDict([('conv_output', outputs_conv[0])])
    [grad_s] = T.grad(cost, outputs_conv)
    dico['conv_output'] = grad_s
    f = theano.function([x, y], grad_s, allow_input_downcast=True,
                        on_unused_input='ignore')
    print np.mean(f(x_train[:10], y_train[:10]))
class rewatching:
    def __init__(self, batch_size, output_length, visual_dim, word_dim,
                 visual_feature_dim, question_feature_dim, joint_dim,
                 memory_dim, output_dim, fc1_dim, fc2_dim, voc_size):
        # the video encoder
        self.video_encoder = visualEncoder(visual_dim, visual_feature_dim)
        self.sentence_encoder = questionEncoder(word_dim, question_feature_dim)
        self.toJoint = embeddingLayer(2 * question_feature_dim,
                                      2 * visual_feature_dim, joint_dim)
        self.rewatcher = impatientLayer(joint_dim, memory_dim, output_dim)
        self.seq_gen = seqDecoder(joint_dim, output_dim, fc1_dim, fc2_dim)
        self.softmax_layer = Softmax()
        self.bs = batch_size
        self.output_length = output_length
        self.voc_size = voc_size

    def build_model(self, frame, q, q_rev, mask, maskMat, mask01, padding):
        bs = self.bs
        # visual dim -> visual feature dim
        video_embedding = self.video_encoder.apply(frame)
        # word_dim -> question feature dim
        question_embedding, u1, u2 = self.sentence_encoder.apply(q, q_rev, mask, bs)
        # -> joint_dim
        questionJoint, videoJoint, u = self.toJoint.apply(
            words=question_embedding, video=video_embedding, u1=u1, u2=u2)
        # bs x joint_dim, bs x output_dim
        question = questionJoint[:, -1, :]
        # video = videoJoint[:, -1, :]
        r_q, seq_r_q = self.rewatcher.apply(videoJoint, questionJoint, mask, bs)
        fc_r = self.seq_gen.apply(self.output_length, r_q, question, padding)
        fc = fc_r.reshape((self.bs * self.output_length, self.voc_size))
        self.softmax_result = self.softmax_layer.apply(fc)
        self.pred = T.argmax(self.softmax_result, axis=1)
        self.pred = self.pred.reshape((self.bs, self.output_length))

    # groundtruth_: batch_size x output_length
    # mask_01: (batch_size x output_length)
    # this mask is a 0-1 matrix where 0 indicates the padding area of the answer
    def loss(self, groundtruth_, mask_01):
        mask = mask_01.flatten()
        gt = groundtruth_.flatten()
        self.p = self.softmax_result[T.arange(self.bs * self.output_length), gt]
        self.cost_ = T.log(self.p + 1e-20)
        self.cost = -T.sum(self.cost_ * mask) / self.bs
        self.cost.name = 'softmax_cost'
        return self.cost

    def error(self, groundtruth, mask_01):
        return T.neq(T.sum(T.neq(self.pred, groundtruth) * mask_01, axis=1), 0).sum() / self.bs

    def predict(self):
        return self.pred
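The loss method above is a masked sequence negative log-likelihood: it picks the softmax probability of each ground-truth token, zeroes out positions that belong to answer padding, and averages over the batch. A NumPy sketch of the same computation with toy shapes:

import numpy as np

bs, length, voc = 2, 3, 5
probs = np.full((bs * length, voc), 0.2)          # pretend softmax output; rows sum to 1
gt = np.array([1, 4, 0, 2, 2, 3])                 # flattened ground-truth token ids
mask = np.array([1, 1, 0, 1, 1, 1], dtype=float)  # 0 marks padded answer positions

p = probs[np.arange(bs * length), gt]             # probability of each correct token
cost = -np.sum(np.log(p + 1e-20) * mask) / bs     # masked NLL, averaged over the batch
print(cost)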
def __init__(self, hidden_dim, n_classes, **kwargs):
    super(Emitter, self).__init__(**kwargs)

    self.hidden_dim = hidden_dim
    self.n_classes = n_classes

    # TODO: use TensorLinear or some such
    self.emitters = [MLP(activations=[Rectifier(), Identity()],
                         dims=[hidden_dim, hidden_dim / 2, n],
                         name="mlp_%i" % i,
                         weights_init=Orthogonal(),
                         biases_init=Constant(0))
                     for i, n in enumerate(self.n_classes)]
    self.softmax = Softmax()

    self.children = self.emitters + [self.softmax]
class Emitter(Initializable): def __init__(self, hidden_dim, n_classes, **kwargs): super(Emitter, self).__init__(**kwargs) self.hidden_dim = hidden_dim self.n_classes = n_classes # TODO: use TensorLinear or some such self.emitters = [ MLP(activations=[Rectifier(), Identity()], dims=[hidden_dim, hidden_dim / 2, n], name="mlp_%i" % i, weights_init=Orthogonal(), biases_init=Constant(0)) for i, n in enumerate(self.n_classes) ] self.softmax = Softmax() self.children = self.emitters + [self.softmax] # some day: @application(...) def feedback(self, h) @application(inputs=['cs', 'y'], outputs=['cost']) def cost(self, cs, y, n_patches): max_length = len(self.n_classes) - 1 _length_masks = theano.shared(np.tril( np.ones((max_length, max_length), dtype='int8')), name='shared_length_masks') lengths = y[:, -1] length_masks = _length_masks[lengths] mean_cross_entropies = [] error_rates = [] for t in xrange(n_patches): energies = [ emitter.apply(cs[:, t, :]) for emitter in self.emitters ] mean_cross_entropies.append( sum( self.softmax.categorical_cross_entropy(y[:, i], energy) # to avoid punishing predictions of nonexistent digits: * (length_masks[:, i] if i < max_length else 1) for i, energy in enumerate(energies)).mean()) # FIXME: do proper logprob-minimizing prediction of length error_rates.append( T.stack(*[ T.neq(y[:, i], energy.argmax(axis=1)) # to avoid punishing predictions of nonexistent digits: * (length_masks[:, i] if i < max_length else 1) for i, energy in enumerate(energies) ]).any(axis=0).mean()) self.add_auxiliary_variable(mean_cross_entropies[-1], name="cross_entropy") self.add_auxiliary_variable(error_rates[-1], name="error_rate") # minimize the mean cross entropy over time and over batch cost = mean_cross_entropies[-1] return cost
class Model(RNN):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(config, output_dim=config.tgtcls.shape[0], **kwargs)
        self.classes = theano.shared(numpy.array(config.tgtcls, dtype=theano.config.floatX),
                                     name='classes')
        self.softmax = Softmax()
        self.children.append(self.softmax)

    def process_rto(self, rto):
        return tensor.dot(self.softmax.apply(rto), self.classes)
class Emitter(Initializable): def __init__(self, hidden_dim, n_classes, batch_normalize, **kwargs): super(Emitter, self).__init__(**kwargs) self.hidden_dim = hidden_dim self.n_classes = n_classes # TODO: use TensorLinear or some such self.emitters = [ masonry.construct_mlp( activations=[None, Identity()], input_dim=hidden_dim, hidden_dims=[hidden_dim/2, n], name="mlp_%i" % i, batch_normalize=batch_normalize, initargs=dict(weights_init=Orthogonal(), biases_init=Constant(0))) for i, n in enumerate(self.n_classes)] self.softmax = Softmax() self.children = self.emitters + [self.softmax] # some day: @application(...) def feedback(self, h) @application(inputs=['cs', 'y'], outputs=['cost']) def cost(self, cs, y, n_patches): max_length = len(self.n_classes) - 1 _length_masks = theano.shared( np.tril(np.ones((max_length, max_length), dtype='int8')), name='shared_length_masks') lengths = y[:, -1] length_masks = _length_masks[lengths] mean_cross_entropies = [] error_rates = [] for t in xrange(n_patches): energies = [emitter.apply(cs[:, t, :]) for emitter in self.emitters] mean_cross_entropies.append( sum(self.softmax.categorical_cross_entropy(y[:, i], energy) # to avoid punishing predictions of nonexistent digits: * (length_masks[:, i] if i < max_length else 1) for i, energy in enumerate(energies)).mean()) # FIXME: do proper logprob-minimizing prediction of length error_rates.append( T.stack(*[T.neq(y[:, i], energy.argmax(axis=1)) # to avoid punishing predictions of nonexistent digits: * (length_masks[:, i] if i < max_length else 1) for i, energy in enumerate(energies)]).any(axis=0).mean()) self.add_auxiliary_variable(mean_cross_entropies[-1], name="cross_entropy") self.add_auxiliary_variable(error_rates[-1], name="error_rate") # minimize the mean cross entropy over time and over batch cost = mean_cross_entropies[-1] return cost
class SoftmaxLinear(Initializable):
    def __init__(self, input_dim, output_dim, **kwargs):
        super(SoftmaxLinear, self).__init__(**kwargs)
        self.linear = Linear(input_dim=input_dim, output_dim=output_dim)
        self.softmax = Softmax()

        self.children = [self.linear, self.softmax]

    def apply(self, input_):
        output = self.softmax.apply(self.linear.apply(input_))
        return output
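A hedged usage sketch for the brick above (assuming the usual Blocks imports; the initialization schemes are illustrative). Because SoftmaxLinear is Initializable, weights_init and biases_init passed to it are pushed down to the inner Linear when initialize() is called:

from theano import tensor
from blocks.initialization import IsotropicGaussian, Constant

# SoftmaxLinear is the class defined above
sm_lin = SoftmaxLinear(input_dim=100, output_dim=10,
                       weights_init=IsotropicGaussian(0.01),
                       biases_init=Constant(0))
sm_lin.initialize()

x = tensor.matrix('features')
probs = sm_lin.apply(x)   # each row is a class distribution summing to one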
def __init__(self, hidden_dim, n_classes, batch_normalize, **kwargs):
    super(Emitter, self).__init__(**kwargs)

    self.hidden_dim = hidden_dim
    self.n_classes = n_classes

    # TODO: use TensorLinear or some such
    self.emitters = [masonry.construct_mlp(
                         activations=[None, Identity()],
                         input_dim=hidden_dim,
                         hidden_dims=[hidden_dim / 2, n],
                         name="mlp_%i" % i,
                         batch_normalize=batch_normalize,
                         initargs=dict(weights_init=Orthogonal(),
                                       biases_init=Constant(0)))
                     for i, n in enumerate(self.n_classes)]
    self.softmax = Softmax()

    self.children = self.emitters + [self.softmax]
def __init__(self, io_dim, hidden_dims, cond_cert, activation=None, **kwargs):
    super(CCHLSTM, self).__init__(**kwargs)

    self.cond_cert = cond_cert

    self.io_dim = io_dim
    self.hidden_dims = hidden_dims

    self.children = []
    self.layers = []

    self.softmax = Softmax()
    self.children.append(self.softmax)

    for i, d in enumerate(hidden_dims):
        i0 = LookupTable(length=io_dim,
                         dim=4 * d,
                         name='i0-%d' % i)
        self.children.append(i0)

        if i > 0:
            i1 = Linear(input_dim=hidden_dims[i - 1],
                        output_dim=4 * d,
                        name='i1-%d' % i)
            self.children.append(i1)
        else:
            i1 = None

        lstm = LSTM(dim=d, activation=activation,
                    name='LSTM-%d' % i)
        self.children.append(lstm)

        o = Linear(input_dim=d,
                   output_dim=io_dim,
                   name='o-%d' % i)
        self.children.append(o)

        self.layers.append((i0, i1, lstm, o))
def __init__(self, config, **kwargs):
    super(Model, self).__init__(**kwargs)
    self.config = config

    self.context_embedder = ContextEmbedder(config)

    self.prefix_encoder = MLP(
        activations=[Rectifier() for _ in config.prefix_encoder.dim_hidden]
                    + [config.representation_activation()],
        dims=[config.prefix_encoder.dim_input]
             + config.prefix_encoder.dim_hidden
             + [config.representation_size],
        name="prefix_encoder")
    self.candidate_encoder = MLP(
        activations=[Rectifier() for _ in config.candidate_encoder.dim_hidden]
                    + [config.representation_activation()],
        dims=[config.candidate_encoder.dim_input]
             + config.candidate_encoder.dim_hidden
             + [config.representation_size],
        name="candidate_encoder")
    self.softmax = Softmax()

    self.prefix_extremities = {
        "%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis
        for side in ["first", "last"] for axis in [0, 1]}
    self.candidate_extremities = {
        "candidate_%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis
        for side in ["first", "last"] for axis in [0, 1]}

    self.inputs = (self.context_embedder.inputs
                   + ["candidate_%s" % k for k in self.context_embedder.inputs]
                   + self.prefix_extremities.keys()
                   + self.candidate_extremities.keys())
    self.children = [self.context_embedder, self.prefix_encoder,
                     self.candidate_encoder, self.softmax]
class Model(Initializable): def __init__(self, config, **kwargs): super(Model, self).__init__(**kwargs) self.config = config self.context_embedder = ContextEmbedder(config) self.prefix_encoder = MLP( activations=[Rectifier() for _ in config.prefix_encoder.dim_hidden] + [config.representation_activation()], dims=[config.prefix_encoder.dim_input] + config.prefix_encoder.dim_hidden + [config.representation_size], name="prefix_encoder", ) self.candidate_encoder = MLP( activations=[Rectifier() for _ in config.candidate_encoder.dim_hidden] + [config.representation_activation()], dims=[config.candidate_encoder.dim_input] + config.candidate_encoder.dim_hidden + [config.representation_size], name="candidate_encoder", ) self.softmax = Softmax() self.prefix_extremities = { "%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis for side in ["first", "last"] for axis in [0, 1] } self.candidate_extremities = { "candidate_%s_k_%s" % (side, ["latitude", "longitude"][axis]): axis for side in ["first", "last"] for axis in [0, 1] } self.inputs = ( self.context_embedder.inputs + ["candidate_%s" % k for k in self.context_embedder.inputs] + self.prefix_extremities.keys() + self.candidate_extremities.keys() ) self.children = [self.context_embedder, self.prefix_encoder, self.candidate_encoder, self.softmax] def _push_initialization_config(self): for (mlp, config) in [ [self.prefix_encoder, self.config.prefix_encoder], [self.candidate_encoder, self.config.candidate_encoder], ]: mlp.weights_init = config.weights_init mlp.biases_init = config.biases_init @application(outputs=["destination"]) def predict(self, **kwargs): prefix_embeddings = tuple(self.context_embedder.apply(**{k: kwargs[k] for k in self.context_embedder.inputs})) prefix_extremities = tuple( (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v] for k, v in self.prefix_extremities.items() ) prefix_inputs = tensor.concatenate(prefix_extremities + prefix_embeddings, axis=1) prefix_representation = self.prefix_encoder.apply(prefix_inputs) if self.config.normalize_representation: prefix_representation = prefix_representation / tensor.sqrt( (prefix_representation ** 2).sum(axis=1, keepdims=True) ) candidate_embeddings = tuple( self.context_embedder.apply(**{k: kwargs["candidate_%s" % k] for k in self.context_embedder.inputs}) ) candidate_extremities = tuple( (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v] for k, v in self.candidate_extremities.items() ) candidate_inputs = tensor.concatenate(candidate_extremities + candidate_embeddings, axis=1) candidate_representation = self.candidate_encoder.apply(candidate_inputs) if self.config.normalize_representation: candidate_representation = candidate_representation / tensor.sqrt( (candidate_representation ** 2).sum(axis=1, keepdims=True) ) similarity_score = tensor.dot(prefix_representation, candidate_representation.T) similarity = self.softmax.apply(similarity_score) candidate_destination = tensor.concatenate( ( tensor.shape_padright(kwargs["candidate_last_k_latitude"][:, -1]), tensor.shape_padright(kwargs["candidate_last_k_longitude"][:, -1]), ), axis=1, ) return tensor.dot(similarity, candidate_destination) @predict.property("inputs") def predict_inputs(self): return self.inputs @application(outputs=["cost"]) def cost(self, **kwargs): y_hat = self.predict(**kwargs) y = tensor.concatenate( (kwargs["destination_latitude"][:, None], kwargs["destination_longitude"][:, None]), axis=1 ) return error.erdist(y_hat, y).mean() @cost.property("inputs") def cost_inputs(self): return self.inputs + 
["destination_latitude", "destination_longitude"]
rect = Rectifier()
mlp = MLP(dims=[784, 1200, 1200, 200],
          activations=[rect, rect, rect],
          seed=10)
mlp.weights_init = Uniform(0.0, 0.01)
mlp.biases_init = Constant(0.0)
mlp.initialize()

lin = Linear(200, 10, use_bias=True)
lin.weights_init = Uniform(0.0, 0.01)
lin.biases_init = Constant(0.0)
lin.initialize()

train_out = lin.apply(mlp.apply(flat_x))
test_out = lin.apply(mlp.apply(flat_x))

sm = Softmax(name='softmax')
loss = sm.categorical_cross_entropy(flat_y, train_out).mean()
loss.name = 'nll'
misclass = MisclassificationRate().apply(flat_y, train_out)
misclass.name = 'misclass'

test_loss = sm.categorical_cross_entropy(flat_y, test_out).mean()
test_loss.name = 'nll'
test_misclass = MisclassificationRate().apply(flat_y, test_out)
test_misclass.name = 'misclass'

model = Model(loss)

######################
# Data
######################
def build_model_vanilla(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence( [lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh()) for _ in range(layers)] rnn = RecurrentStack(transitions, skip_connections=skip_connections) # If skip_connections: dim = layers * state_dim # else: dim = state_dim output_layer = Linear( input_dim=skip_connections * layers * state_dim + (1 - skip_connections) * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs'] = pre_rnn init_states[d] = theano.shared( numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # We have # h = [state, state_1, state_2 ...] if layers > 1 # h = state if layers == 1 # If we have skip connections, concatenate all the states # Else only consider the state of the highest layer last_states = {} if layers > 1: # Save all the last states for d in range(layers): last_states[d] = h[d][-1, :, :] if skip_connections: h = tensor.concatenate(h, axis=2) else: h = h[-1] else: last_states[0] = h[-1, :, :] h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
def main(save_to, cost_name, learning_rate, momentum, num_epochs): mlp = MLP([None], [784, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') scores = mlp.apply(x) batch_size = y.shape[0] indices = tensor.arange(y.shape[0]) target_scores = tensor.set_subtensor( tensor.zeros((batch_size, 10))[indices, y.flatten()], 1) score_diff = scores - target_scores # Logistic Regression if cost_name == 'lr': cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean() # MSE elif cost_name == 'mse': cost = (score_diff ** 2).mean() # Perceptron elif cost_name == 'perceptron': cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean() # TLE elif cost_name == 'minmin': cost = abs(score_diff[indices, y.flatten()]).mean() cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean() # TLEcut elif cost_name == 'minmin_cut': # Score of the groundtruth should be greater or equal than its target score cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean() # Score of the prediction should be less or equal than its actual score cost += tensor.maximum(0, score_diff[indices, scores.argmax(axis=1)]).mean() # TLE2 elif cost_name == 'minmin2': cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()]) ** 2).mean() cost += ((score_diff[tensor.arange(y.shape[0]), scores.argmax(axis=1)]) ** 2).mean() # Direct loss minimization elif cost_name == 'direct': epsilon = 0.1 cost = (- scores[indices, (scores + epsilon * target_scores).argmax(axis=1)] + scores[indices, scores.argmax(axis=1)]).mean() cost /= epsilon elif cost_name == 'svm': cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] - scores[indices, y.flatten()]).mean() else: raise ValueError("Unknown cost " + cost) error_rate = MisclassificationRate().apply(y.flatten(), scores) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) cost.name = 'cost' mnist_train = MNIST(("train",)) mnist_test = MNIST(("test",)) if learning_rate == None: learning_rate = 0.0001 if momentum == None: momentum = 0.0 rule = Momentum(learning_rate=learning_rate, momentum=momentum) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=rule) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate], Flatten( DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features',)), prefix="test"), # CallbackExtension( # lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9), # after_epoch=True), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate], prefix="train", after_epoch=True), Checkpoint(save_to), Printing()] if BLOCKS_EXTRAS_AVAILABLE: extensions.append(Plot( 'MNIST example', channels=[ ['test_cost', 'test_error_rate'], ['train_total_gradient_norm']])) main_loop = MainLoop( algorithm, Flatten( DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features',)), model=Model(cost), extensions=extensions) main_loop.run() df = pandas.DataFrame.from_dict(main_loop.log, orient='index') res = {'cost' : cost_name, 'learning_rate' : learning_rate, 'momentum' : momentum, 'train_cost' : df.train_cost.iloc[-1], 'test_cost' : df.test_cost.iloc[-1], 'best_test_cost' : df.test_cost.min(), 'train_error' : df.train_error_rate.iloc[-1], 'test_error' : df.test_error_rate.iloc[-1], 
'best_test_error' : df.test_error_rate.min()} res = {k: float(v) if isinstance(v, numpy.ndarray) else v for k, v in res.items()} json.dump(res, sys.stdout) sys.stdout.flush()
def main(): # # # # # # # # # # # # Modeling Building # # # # # # # # # # # # # ConvOp requires input be a 4D tensor x = tensor.tensor4("features") y = tensor.ivector("targets") # Convolutional Layers # ==================== # "Improving neural networks by preventing co-adaptation of feature detectors" # conv_layers = [ # # ConvolutionalLayer(activiation, filter_size, num_filters, pooling_size, name) # ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l1') # , ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l2') # , ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l3') # ] # "VGGNet" conv_layers = [ ConvolutionalActivation(Rectifier().apply, (3,3), 64, border_mode='full', name='l1') , ConvolutionalLayer(Rectifier().apply, (3,3), 64, (2,2), border_mode='full', name='l2') , ConvolutionalActivation(Rectifier().apply, (3,3), 128, border_mode='full', name='l3') , ConvolutionalLayer(Rectifier().apply, (3,3), 128, (2,2), border_mode='full', name='l4') , ConvolutionalActivation(Rectifier().apply, (3,3), 256, border_mode='full', name='l5') , ConvolutionalLayer(Rectifier().apply, (3,3), 256, (2,2), border_mode='full', name='l6') ] # Bake my own # conv_layers = [ # # ConvolutionalLayer(activiation, filter_size, num_filters, pooling_size, name) # ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l1') # , ConvolutionalLayer(Rectifier().apply, (3,3), 128, (2,2), border_mode='full', name='l2') # , ConvolutionalActivation(Rectifier().apply, (3,3), 256, border_mode='full', name='l3') # , ConvolutionalLayer(Rectifier().apply, (3,3), 256, (2,2), border_mode='full', name='l4') # ] convnet = ConvolutionalSequence( conv_layers, num_channels=3, image_size=(32,32), weights_init=IsotropicGaussian(0.1), biases_init=Constant(0) ) convnet.initialize() output_dim = np.prod(convnet.get_dim('output')) # Fully Connected Layers # ====================== conv_features = convnet.apply(x) features = Flattener().apply(conv_features) mlp = MLP( activations=[Rectifier()]*2+[None] , dims=[output_dim, 256, 256, 10] , weights_init=IsotropicGaussian(0.01) , biases_init=Constant(0) ) mlp.initialize() y_hat = mlp.apply(features) # print y_hat.shape.eval({x: np.zeros((1, 3, 32, 32), dtype=theano.config.floatX)}) # Numerically Stable Softmax cost = Softmax().categorical_cross_entropy(y, y_hat) error_rate = MisclassificationRate().apply(y, y_hat) cg = ComputationGraph(cost) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables) l2_regularization = 0.005 * sum((W**2).sum() for W in weights) cost = cost + l2_regularization cost.name = 'cost_with_regularization' # Print sizes to check print("Representation sizes:") for layer in convnet.layers: print(layer.get_dim('input_')) # # # # # # # # # # # # Modeling Training # # # # # # # # # # # # # Figure out data source train = CIFAR10("train") test = CIFAR10("test") # Load Data Using Fuel train_stream = DataStream.default_stream( dataset=train , iteration_scheme=SequentialScheme(train.num_examples, batch_size=128)) test_stream = DataStream.default_stream( dataset=test , iteration_scheme=SequentialScheme(test.num_examples, batch_size=1024)) # Train algorithm = GradientDescent( cost=cost , params=cg.parameters , step_rule=Adam(learning_rate=0.0005) ) main_loop = MainLoop( model=Model(cost) , data_stream=train_stream , algorithm=algorithm , extensions=[ TrainingDataMonitoring( [cost, error_rate] , prefix='train' , after_epoch=True) , DataStreamMonitoring( [cost, 
error_rate] , test_stream, prefix='test') , ExperimentSaver(dest_directory='...', src_directory='.') , Printing() , ProgressBar() ] ) main_loop.run()
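# Note on the "Numerically Stable Softmax" comment above: Softmax().categorical_cross_entropy
# fuses the softmax with the cross-entropy, so the log-sum-exp is taken directly on the
# pre-softmax activations instead of on already-exponentiated probabilities. A minimal,
# self-contained sketch of the two formulations (illustrative names, assuming blocks and
# theano are installed; this is not the training script above):
import numpy
import theano
from theano import tensor
from blocks.bricks import Softmax
from blocks.bricks.cost import CategoricalCrossEntropy

logits = tensor.matrix('logits')      # pre-softmax activations, shape (batch, classes)
targets = tensor.ivector('targets')   # integer class labels, shape (batch,)

# Fused form: cross-entropy computed from the pre-activations (numerically stable).
stable_cost = Softmax().categorical_cross_entropy(targets, logits).mean()

# Two-step form: explicit softmax, then cross-entropy on the probabilities.
probs = Softmax().apply(logits)
naive_cost = CategoricalCrossEntropy().apply(targets, probs)

f = theano.function([logits, targets], [stable_cost, naive_cost])
print(f(numpy.random.randn(4, 10).astype(theano.config.floatX),
        numpy.array([0, 1, 2, 3], dtype='int32')))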
################### #### Softmax ################### from blocks.bricks import Softmax from blocks.bricks.cost import MisclassificationRate W2 = theano.shared(numpy.random.normal(size=(n_out, num_protos)).astype('float32')) b = theano.shared(numpy.zeros((num_protos,)).astype('float32')) y = tensor.ivector('y') h = tensor.dot(h3, W2) + b h = tensor.switch(h < 0, -h , h) sm = Softmax() pred = sm.apply(h) misclass = MisclassificationRate().apply(y, pred) c = sm.categorical_cross_entropy(y, h).mean() s_params = [W2, b] s_grad = theano.grad(c, s_params) s_updates = [p - numpy.float32(0.05)*g for p, g in zip(s_params, s_grad)] s_f = theano.function([h3, y], [c, misclass], updates=zip(s_params, s_updates)) s_pred = theano.function([h3], pred) for j in range(200): for i in range(n_batches): if i == 0: print s_f(data[i*batch_size:(i+1)*batch_size, :], labels[i*batch_size:(i+1)*batch_size]) else:
# Fully connected layers features = Flattener().apply(convnet.apply(x)) mlp = MLP( activations=[Rectifier(), None], dims=[output_dim, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0) ) mlp.initialize() y_hat = mlp.apply(features) # numerically stable softmax cost = Softmax().categorical_cross_entropy(y.flatten(), y_hat) cost.name = 'nll' error_rate = MisclassificationRate().apply(y.flatten(), y_hat) #cost = MisclassificationRate().apply(y, y_hat) #cost.name = 'error_rate' cg = ComputationGraph(cost) #pdb.set_trace() weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables) l2_regularization = 0.005 * sum((W**2).sum() for W in weights) cost_l2 = cost + l2_regularization cost_l2.name = 'cost_with_regularization' # Print sizes to check
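# The weight-decay term above is built by filtering the computation graph for parameters
# carrying the WEIGHT / FILTER roles, so biases stay unregularized. A minimal sketch of the
# same pattern on a single Linear brick (illustrative names, not the convnet above):
from theano import tensor
from blocks.bricks import Linear
from blocks.graph import ComputationGraph
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('x')
linear = Linear(input_dim=5, output_dim=3,
                weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
linear.initialize()
out = linear.apply(x).sum()

# Pull every parameter tagged with the WEIGHT role out of the graph...
cg = ComputationGraph(out)
weights = VariableFilter(roles=[WEIGHT])(cg.variables)

# ...and add an L2 penalty over those weights only.
l2_regularization = 0.005 * sum((W ** 2).sum() for W in weights)
cost_l2 = out + l2_regularization
cost_l2.name = 'cost_with_regularization'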
pre_rnn = x_to_h1.apply(x) if is_LSTM: rnn = DropLSTM(dim=h_dim, model_type=model_type, update_prob=update_prob, name="rnn") h1, c1 = rnn.apply(pre_rnn, drops, is_for_test) else: rnn = DropGRU(dim=h_dim, model_type=model_type, update_prob=update_prob, name="rnn") h1, sd = rnn.apply(pre_rnn[:, :, :h_dim], pre_rnn[:, :, h_dim:], drops, is_for_test) h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=y_dim) pre_softmax = h1_to_o.apply(h1) softmax = Softmax() shape = pre_softmax.shape softmax_out = softmax.apply(pre_softmax.reshape((-1, y_dim))) softmax_out = softmax_out.reshape(shape) softmax_out.name = 'softmax_out' # comparing only last time-step cost = CategoricalCrossEntropy().apply(y, softmax_out[-1]) cost.name = 'CrossEntropy' error_rate = MisclassificationRate().apply(y, softmax_out[-1]) error_rate.name = 'error_rate' # Initialization for brick in (x_to_h1, h1_to_o, rnn): brick.weights_init = Glorot() brick.biases_init = Constant(0)
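# The reshape around the softmax above is the standard trick for a per-timestep softmax:
# Softmax expects a 2D (examples, classes) matrix, so the (time, batch, y_dim) tensor is
# collapsed, the brick applied, and the original shape restored. A minimal sketch with
# illustrative names:
from theano import tensor
from blocks.bricks import Softmax

y_dim = 10                                    # illustrative number of classes
pre_softmax = tensor.tensor3('pre_softmax')   # (time, batch, y_dim)

shape = pre_softmax.shape
softmax_out = Softmax().apply(pre_softmax.reshape((-1, y_dim)))
softmax_out = softmax_out.reshape(shape)      # back to (time, batch, y_dim)
softmax_out.name = 'softmax_out'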
def build_model_soft(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence( [lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] # Build the MLP dims = [2 * state_dim] activations = [] for i in range(args.mlp_layers): activations.append(Rectifier()) dims.append(state_dim) # Activation of the last layer of the MLP if args.mlp_activation == "logistic": activations.append(Logistic()) elif args.mlp_activation == "rectifier": activations.append(Rectifier()) elif args.mlp_activation == "hard_logistic": activations.append(HardLogistic()) else: assert False # Output of MLP has dimension 1 dims.append(1) for i in range(layers - 1): mlp = MLP(activations=activations, dims=dims, weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( SoftGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear( input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared( numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have: # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...] # Extract gate_values gate_values = h[2::2] new_h = [h[0]] new_h.extend(h[1::2]) h = new_h # Now we have: # h = [state, state_1, state_2, ...] 
# gate_values = [gate_value_1, gate_value_2, gate_value_3] for i, gate_value in enumerate(gate_values): gate_value.name = "gate_value_" + str(i) # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates, gate_values
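# The shared state0_%d variables and the returned updates above implement truncated BPTT with
# a carried-over hidden state: each batch starts from a shared variable that is overwritten
# with the last hidden state of the previous batch. A minimal sketch of that pattern with a
# single SimpleRecurrent (illustrative names and sizes, not the stacked model above):
import numpy
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import SimpleRecurrent
from blocks.initialization import IsotropicGaussian

batch_size, state_dim = 4, 8
inputs = tensor.tensor3('inputs')   # (time, batch, state_dim)

rnn = SimpleRecurrent(dim=state_dim, activation=Tanh(),
                      weights_init=IsotropicGaussian(0.1))
rnn.initialize()

# Shared variable holding the state carried over from the previous batch.
init_state = theano.shared(
    numpy.zeros((batch_size, state_dim)).astype(theano.config.floatX),
    name='state0')

states = rnn.apply(inputs=inputs, states=init_state)
cost = states.sum()   # stand-in for a real cost

# The (shared, new_value) pair carries the last state into the next call.
step = theano.function([inputs], cost, updates=[(init_state, states[-1, :, :])])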
def test_communication(path_vae_mnist, path_maxout_mnist): # load models vae_mnist = load(path_vae_mnist) # get params : to be removed from the computation graph # write an object maxout classifier = Maxout() # get params : to be removed from the computation graph # vae whose prior is a zero mean unit variance normal distribution activation = Rectifier() full_weights_init = Orthogonal() weights_init = full_weights_init # SVHN in grayscale layers = [32*32, 200, 200, 200, 50] encoder_layers = layers[:-1] encoder_mlp = MLP([activation] * (len(encoder_layers)-1), encoder_layers, name="MLP_SVHN_encode", biases_init=Constant(0.), weights_init=weights_init) enc_dim = encoder_layers[-1] z_dim = layers[-1] sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, biases_init=Constant(0.), weights_init=full_weights_init) decoder_layers = layers[:] ## includes z_dim as first layer decoder_layers.reverse() decoder_mlp = MLP([activation] * (len(decoder_layers)-2) + [Rectifier()], decoder_layers, name="MLP_SVHN_decode", biases_init=Constant(0.), weights_init=weights_init) vae_svhn = VAEModel(encoder_mlp, sampler, decoder_mlp) vae_svhn.initialize() # do the connection x = T.tensor4('x') # SVHN samples preprocessed with local contrast normalization x_ = (T.sum(x, axis=1)).flatten(ndim=2) y = T.imatrix('y') batch_size = 512 svhn_z, _ = vae_svhn.sampler.sample(vae_svhn.encoder_mlp.apply(x_)) mnist_decode = vae_mnist.decoder_mlp.apply(svhn_z) # reshape shape = mnist_decode.shape mnist_decode = mnist_decode.reshape((shape[0], 1, 28, 28)) prediction = classifier.apply(mnist_decode) y_hat = Softmax().apply(prediction) x_recons, kl_terms = vae_svhn.reconstruct(x_) recons_term = BinaryCrossEntropy().apply(x_, T.clip(x_recons, 1e-4, 1 - 1e-4)) recons_term.name = "recons_term" cost_A = recons_term + kl_terms.mean() cost_A.name = "cost_A" cost_B = Softmax().categorical_cross_entropy(y.flatten(), prediction).mean() cost_B.name = 'cost_B' cost = cost_B cost.name = "cost" cg = ComputationGraph(cost) # probably discard some of the parameters parameters = cg.parameters params = [] for t in parameters: if not re.match(".*mnist", t.name): params.append(t) """ f = theano.function([x], cost_A) value_x = np.random.ranf((1, 3, 32, 32)).astype("float32") print f(value_x) return """ error_brick = MisclassificationRate() error_rate = error_brick.apply(y.flatten(), y_hat) error_rate.name = "error_rate" # training here step_rule = RMSProp(0.001,0.99) dataset_hdf5_file="/Tmp/ducoffem/SVHN/" train_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"), which_set='train') test_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"), which_set='valid') data_stream = DataStream.default_stream( train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)) data_stream_test = DataStream.default_stream( test_set, iteration_scheme=SequentialScheme(2000, batch_size)) algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule) monitor_train = TrainingDataMonitoring( variables=[cost], prefix="train", every_n_batches=10) monitor_valid = DataStreamMonitoring( variables=[cost, error_rate], data_stream=data_stream_test, prefix="valid", every_n_batches=10) # drawing_samples = ImagesSamplesSave("../data_svhn", vae, (3, 32, 32), every_n_epochs=1) extensions = [ monitor_train, monitor_valid, FinishAfter(after_n_batches=10000), Printing(every_n_batches=10) ] main_loop = MainLoop(data_stream=data_stream, algorithm=algorithm, model = Model(cost), extensions=extensions) main_loop.run()
rnn = Bidirectional( SimpleRecurrent(dim=hidden_dim, activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), ), ) ### Will need to reshape the rnn outputs to produce suitable input here... gather = Linear(name='hidden_to_output', input_dim=hidden_dim*2, output_dim=labels_size, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0) ) p_labels = Softmax() ## Let's initialize the variables lookup.allocate() #print("lookup.parameters=", lookup.parameters) # ('lookup.parameters=', [W]) #lookup.weights_init = FUNCTION #lookup.initialize() #lookup.params[0].set_value( np.random.normal( scale = 0.1, size=(vocab_size, embedding_dim) ).astype(np.float32) ) #lookup.params[0].set_value( embedding ) # See : https://github.com/mila-udem/blocks/blob/master/tests/bricks/test_lookup.py #lookup.W.set_value(numpy.arange(15).reshape(5, 3).astype(theano.config.floatX))
def training_model_mnist(learning_rate, momentum, iteration, batch_size, epoch_end, iter_batch): x = T.tensor4('features') y = T.imatrix('targets') classifier = build_model_mnist() predict = classifier.apply(x) y_hat = Softmax().apply(predict) cost = Softmax().categorical_cross_entropy(y.flatten(), predict) cost.name = "cost" cg = ComputationGraph(cost) error_brick = MisclassificationRate() error_rate = error_brick.apply(y.flatten(), y_hat) error_rate.name = "error" train_set = MNIST(('train', )) test_set = MNIST(("test",)) if iteration =="slice": data_stream = DataStream.default_stream( train_set, iteration_scheme=SequentialScheme_slice(train_set.num_examples, batch_size)) data_stream_test = DataStream.default_stream( test_set, iteration_scheme=SequentialScheme_slice(test_set.num_examples, batch_size)) else: data_stream = DataStream.default_stream( train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)) data_stream_test = DataStream.default_stream( test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)) step_rule = Momentum(learning_rate=learning_rate, momentum=momentum) start = time.clock() time_spent = shared_floatx(np.float32(0.), name="time_spent") time_extension = Time_reference(start, time_spent, every_n_batches=1) algorithm = GradientDescent(cost=cost, params=cg.parameters, step_rule=step_rule) monitor_train = TrainingDataMonitoring( variables=[cost], prefix="train", every_n_epochs=iter_batch) monitor_valid = DataStreamMonitoring( variables=[cost, error_rate, time_spent], data_stream=data_stream_test, prefix="valid", every_n_epochs=iter_batch) # add a monitor variable about the time extensions = [ monitor_train, monitor_valid, FinishAfter(after_n_epochs=epoch_end), Printing(every_n_epochs=iter_batch), time_extension ] main_loop = MainLoop(data_stream=data_stream, algorithm=algorithm, model = Model(cost), extensions=extensions) main_loop.run()
class CCHLSTM(BaseRecurrent, Initializable): def __init__(self, io_dim, hidden_dims, cond_cert, activation=None, **kwargs): super(CCHLSTM, self).__init__(**kwargs) self.cond_cert = cond_cert self.io_dim = io_dim self.hidden_dims = hidden_dims self.children = [] self.layers = [] self.softmax = Softmax() self.children.append(self.softmax) for i, d in enumerate(hidden_dims): i0 = LookupTable(length=io_dim, dim=4*d, name='i0-%d'%i) self.children.append(i0) if i > 0: i1 = Linear(input_dim=hidden_dims[i-1], output_dim=4*d, name='i1-%d'%i) self.children.append(i1) else: i1 = None lstm = LSTM(dim=d, activation=activation, name='LSTM-%d'%i) self.children.append(lstm) o = Linear(input_dim=d, output_dim=io_dim, name='o-%d'%i) self.children.append(o) self.layers.append((i0, i1, lstm, o)) @recurrent(contexts=[]) def apply(self, inputs, **kwargs): l0i, _, l0l, l0o = self.layers[0] l0iv = l0i.apply(inputs) new_states0, new_cells0 = l0l.apply(states=kwargs['states0'], cells=kwargs['cells0'], inputs=l0iv, iterate=False) l0ov = l0o.apply(new_states0) pos = l0ov ps = new_states0 passnext = tensor.ones((inputs.shape[0],)) out_sc = [new_states0, new_cells0, passnext] for i, (cch, (i0, i1, l, o)) in enumerate(zip(self.cond_cert, self.layers[1:])): pop = self.softmax.apply(pos) best = pop.max(axis=1) passnext = passnext * tensor.le(best, cch) * kwargs['pass%d'%i] i0v = i0.apply(inputs) i1v = i1.apply(ps) prev_states = kwargs['states%d'%(i+1)] prev_cells = kwargs['cells%d'%(i+1)] new_states, new_cells = l.apply(inputs=i0v + i1v, states=prev_states, cells=prev_cells, iterate=False) new_states = tensor.switch(passnext[:, None], new_states, prev_states) new_cells = tensor.switch(passnext[:, None], new_cells, prev_cells) out_sc += [new_states, new_cells, passnext] ov = o.apply(new_states) pos = tensor.switch(passnext[:, None], pos + ov, pos) ps = new_states return [pos] + out_sc def get_dim(self, name): dims = {'pred': self.io_dim} for i, d in enumerate(self.hidden_dims): dims['states%d'%i] = dims['cells%d'%i] = d if name in dims: return dims[name] return super(CCHLSTM, self).get_dim(name) @apply.property('sequences') def apply_sequences(self): return ['inputs'] + ['pass%d'%i for i in range(len(self.hidden_dims)-1)] @apply.property('states') def apply_states(self): ret = [] for i in range(len(self.hidden_dims)): ret += ['states%d'%i, 'cells%d'%i] return ret @apply.property('outputs') def apply_outputs(self): ret = ['pred'] for i in range(len(self.hidden_dims)): ret += ['states%d'%i, 'cells%d'%i, 'active%d'%i] return ret
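# The per-example gating above relies on tensor.switch with a broadcast mask: passnext has
# shape (batch,), so passnext[:, None] broadcasts against the (batch, dim) state matrices and
# the old state is kept wherever an example did not pass to this layer. A minimal sketch:
import numpy
import theano
from theano import tensor

new_states = tensor.matrix('new_states')     # (batch, dim)
prev_states = tensor.matrix('prev_states')   # (batch, dim)
passnext = tensor.vector('passnext')         # (batch,); 1 = update, 0 = keep

mixed = tensor.switch(passnext[:, None], new_states, prev_states)

f = theano.function([new_states, prev_states, passnext], mixed)
print(f(numpy.ones((2, 3), dtype=theano.config.floatX),
        numpy.zeros((2, 3), dtype=theano.config.floatX),
        numpy.array([1, 0], dtype=theano.config.floatX)))
# first row comes from new_states, second row from prev_states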
def __init__(self, config): inp = tensor.imatrix('bytes') embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX), name='embedding_matrix') in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim)) in_repr.name = 'in_repr' bricks = [] states = [] # Construct predictive GRU hierarchy hidden = [] costs = [] next_target = in_repr.dimshuffle(1, 0, 2) for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims, config.cost_factors, config.hidden_q)): init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX), name='st0_%d'%i) linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim, name="lstm_in_%d"%i) lstm = GatedRecurrent(dim=hdim, activation=config.activation_function, name="lstm_rec_%d"%i) linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i) tanh = Tanh('lstm_out_tanh_%d'%i) bricks += [linear, lstm, linear2, tanh] if i > 0: linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim, name='lstm_in2_%d'%i) bricks += [linear1] next_target = tensor.cast(next_target, dtype=theano.config.floatX) inter = linear.apply(theano.gradient.disconnected_grad(next_target)) if i > 0: inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:])) new_hidden = lstm.apply(inputs=inter[:,:,:hdim], gate_inputs=inter[:,:,hdim:], states=init_state) states.append((init_state, new_hidden[-1, :, :])) hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)] pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:])) costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()] costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()] diff = next_target - pred next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5) # Construct output from hidden states hidden = [s.dimshuffle(1, 0, 2) for s in hidden] out_parts = [] out_dims = config.out_hidden + [config.io_dim] for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)): pred_linear = Linear(input_dim=dim, output_dim=out_dims[0], name='pred_linear_%d'%i) bricks.append(pred_linear) lin = theano.gradient.disconnected_grad(state) out_parts.append(pred_linear.apply(lin)) # Do prediction and calculate cost out = sum(out_parts) if len(out_dims) > 1: out = config.out_hidden_act[0](name='out_act0').apply(out) mlp = MLP(dims=out_dims, activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])] +[Identity()], name='out_mlp') bricks.append(mlp) out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1)) ).reshape((inp.shape[0],inp.shape[1]+1,-1)) pred = out.argmax(axis=2) cost = Softmax().categorical_cross_entropy(inp.flatten(), out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1], config.io_dim))).mean() error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean() sgd_cost = cost + sum(costs) # Initialize all bricks for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize() # apply noise cg = ComputationGraph([sgd_cost, cost, error_rate]+costs) if config.weight_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.weight_noise) sgd_cost = cg.outputs[0] cost = cg.outputs[1] error_rate = cg.outputs[2] costs = cg.outputs[3:] # put stuff into self that is usefull for training or extensions self.sgd_cost = sgd_cost sgd_cost.name = 'sgd_cost' for i in range(len(costs)): costs[i].name = 'pred_cost_%d'%i cost.name = 'cost' error_rate.name = 'error_rate' self.monitor_vars = 
[costs, [cost], [error_rate]] self.out = out[:,1:,:] self.pred = pred[:,1:] self.states = states
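# theano.gradient.disconnected_grad above cuts the backward pass at the layer boundaries, so
# each level of the hierarchy is trained only through its own prediction cost. A minimal
# sketch of the operator's effect (illustrative, not the model above):
import numpy
import theano
from theano import tensor
from theano.gradient import disconnected_grad

x = tensor.vector('x')
connected = (x ** 2).sum()
blocked = (disconnected_grad(x) ** 2).sum()

g_connected = theano.grad(connected, x)                              # 2 * x
g_blocked = theano.grad(blocked, x, disconnected_inputs='ignore')    # zeros

f = theano.function([x], [g_connected, g_blocked])
print(f(numpy.ones(3, dtype=theano.config.floatX)))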
def __init__(self): inp = tensor.lmatrix('bytes') # Make state vars state_vars = {} for i, d in enumerate(hidden_dims): state_vars['states%d'%i] = theano.shared(numpy.zeros((num_seqs, d)) .astype(theano.config.floatX), name='states%d'%i) state_vars['cells%d'%i] = theano.shared(numpy.zeros((num_seqs, d)) .astype(theano.config.floatX), name='cells%d'%i) # Construct brick cchlstm = CCHLSTM(io_dim=io_dim, hidden_dims=hidden_dims, cond_cert=cond_cert, activation=activation_function) # Random pass passdict = {} for i, p in enumerate(block_prob): passdict['pass%d'%i] = rng.binomial(size=(inp.shape[1], inp.shape[0]), p=1-p) # Apply it outs = cchlstm.apply(inputs=inp.dimshuffle(1, 0), **dict(state_vars.items() + passdict.items())) states = [] active_prop = [] for i in range(len(hidden_dims)): states.append((state_vars['states%d'%i], outs[3*i+1][-1, :, :])) states.append((state_vars['cells%d'%i], outs[3*i+2][-1, :, :])) active_prop.append(outs[3*i+3].mean()) active_prop[-1].name = 'active_prop_%d'%i out = outs[0].dimshuffle(1, 0, 2) # Do prediction and calculate cost pred = out.argmax(axis=2) cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(), out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1), io_dim))) error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean() # Initialize all bricks for brick in [cchlstm]: brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() # Apply noise and dropoutvars cg = ComputationGraph([cost, error_rate]) if w_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, w_noise_std) [cost_reg, error_rate_reg] = cg.outputs self.sgd_cost = cost_reg self.monitor_vars = [[cost, cost_reg], [error_rate, error_rate_reg], active_prop] cost.name = 'cost' cost_reg.name = 'cost_reg' error_rate.name = 'error_rate' error_rate_reg.name = 'error_rate_reg' self.out = out self.pred = pred self.states = states
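# rng and block_prob are defined outside this snippet; each pass%d input is a Bernoulli mask
# over (time, batch) that decides whether an upper layer runs for a given step. A minimal
# sketch of how such masks can be drawn with a Theano random stream (MRG_RandomStreams is an
# assumption here, not necessarily what the original code used):
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(seed=1234)
block_prob = [0.2, 0.5]   # illustrative per-layer blocking probabilities

masks = [rng.binomial(size=(50, 32), p=1 - p, dtype=theano.config.floatX)
         for p in block_prob]   # one (time, batch) mask per upper layer
f = theano.function([], masks)
print([m.mean() for m in f()])  # roughly 0.8 and 0.5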