Example #1
def __init__(self, dim, **kwargs):
    super(FeedbackRNN, self).__init__(**kwargs)
    self.dim = dim
    self.first_recurrent_layer = SimpleRecurrent(
        dim=self.dim,
        activation=Identity(),
        name='first_recurrent_layer',
        weights_init=initialization.Identity())
    self.second_recurrent_layer = SimpleRecurrent(
        dim=self.dim,
        activation=Identity(),
        name='second_recurrent_layer',
        weights_init=initialization.Identity())
    self.children = [
        self.first_recurrent_layer, self.second_recurrent_layer
    ]
Example #2
def example():
    """Simple recurrent example. Taken from:
    https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb
    """
    x = tensor.tensor3('x')

    rnn = SimpleRecurrent(dim=3,
                          activation=Identity(),
                          weights_init=initialization.Identity())
    rnn.initialize()
    h = rnn.apply(x)

    f = theano.function([x], h)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    doubler = Linear(input_dim=3,
                     output_dim=3,
                     weights_init=initialization.Identity(2),
                     biases_init=initialization.Constant(0))
    doubler.initialize()
    h_doubler = rnn.apply(doubler.apply(x))

    f = theano.function([x], h_doubler)
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX)))

    # Initial state
    h0 = tensor.matrix('h0')
    h = rnn.apply(inputs=x, states=h0)

    f = theano.function([x, h0], h)
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
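Because the weights and the activation are both the identity and the initial state defaults to zero, the recurrence reduces to h_t = h_{t-1} + x_t, a running sum of the inputs. Up to float formatting, the three prints above should therefore show:

# first print (running sum of ones):  [[[1. 1. 1.]] [[2. 2. 2.]] [[3. 3. 3.]]]
# with the doubling Linear in front:  [[[2. 2. 2.]] [[4. 4. 4.]] [[6. 6. 6.]]]
# with initial state h0 = ones:       [[[2. 2. 2.]] [[3. 3. 3.]] [[4. 4. 4.]]]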
Example #3
    def __init__(self, dim, depth, **kwargs):
        super(FeedbackRNNStack, self).__init__(**kwargs)
        self.dim = dim
        self.depth = depth
        self.children = []
        FeedbackRNNStack.depth = depth

        for i in range(depth):
            self.children.append(
                SimpleRecurrent(dim=self.dim,
                                activation=Identity(),
                                name=str(i) + 'th_recurrent_layer',
                                weights_init=initialization.Identity()))
Example #4
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)
        ]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))), dtype=theano.config.floatX)
        self.x_val = (numpy.ones(
            (24, 4, 3), dtype=theano.config.floatX) * self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        self.mask_val[12:24, 3] = 0
Example #5
class CustomLSTMWeights(NdarrayInitialization):
    # Identity in the diagonal blocks and IsotropicGaussian everywhere else
    def __init__(self, std=1, mean=0):
        self.gaussian_init = IsotropicGaussian(std=std, mean=mean)
        self.identity = Identity()

    def generate(self, rng, shape):
        if len(shape) != 2:
            raise ValueError("expected a 2-D shape, got %r" % (shape,))
        assert shape[0] == shape[1]
        size = shape[0] // 4  # side length of each of the 4x4 sub-blocks
        assert size * 4 == shape[0]

        rows = []
        for i in range(4):
            row = []
            for j in range(4):
                if i == j:
                    square = self.identity.generate(rng, (size, size))
                else:
                    square = self.gaussian_init.generate(rng, (size, size))
                row.append(square)
            rows.append(numpy.hstack(row))
        return numpy.vstack(rows).astype(theano.config.floatX)
Example #6
class CustomLSTMWeights(NdarrayInitialization):
    # Identity in the diagonal blocks and IsotropicGaussian everywhere else
    def __init__(self, std=1, mean=0):
        self.gaussian_init = IsotropicGaussian(std=std, mean=mean)
        self.identity = Identity()

    def generate(self, rng, shape):
        if len(shape) != 2:
            raise ValueError("expected a 2-D shape, got %r" % (shape,))
        assert shape[0] == shape[1]
        size = shape[0] // 4  # side length of each of the 4x4 sub-blocks
        assert size * 4 == shape[0]

        rows = []
        for i in range(4):
            row = []
            for j in range(4):
                if i == j:
                    square = self.identity.generate(rng, (size, size))
                else:
                    square = self.gaussian_init.generate(rng, (size, size))
                row.append(square)
            rows.append(numpy.hstack(row))
        return numpy.vstack(rows).astype(theano.config.floatX)
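A quick smoke test for the initializer above; the 12x12 shape (four 3x3 blocks per axis) is an illustrative assumption:

rng = numpy.random.RandomState(1234)
weights = CustomLSTMWeights(std=0.01).generate(rng, (12, 12))
assert weights.shape == (12, 12)
# the four diagonal 3x3 blocks are identity matrices,
# everything off the block diagonal is small Gaussian noise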
Example #7
def __init__(self, std=1, mean=0):
    self.gaussian_init = IsotropicGaussian(std=std, mean=mean)
    self.identity = Identity()
Example #8
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None):

    config['the_task'] = the_task
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        # enc_embed is the dimension of the word embedding matrix in the
        # encoder; enc_nhids is the number of hidden units in the encoder GRU
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
   
    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value']) 
    if config['identity_init']:
        encoder.weights_init = decoder.weights_init = Identity()
    else:
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.rng = decoder.rng = my_rng
    
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng)

    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {!s:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))
    


    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {!s:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))


    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)    

    # Set extensions
    logger.info("Initializing extensions")
    if 'track2' in config['saveto']:
        # fewer epochs for track 2, because there is more data
        finish_after = config['finish_after'] / 2
    else:
        finish_after = config['finish_after']
    extensions = [
        FinishAfter(after_n_epochs=finish_after),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    
    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    #every_n_batches=1,
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=8))
                    #src_vocab_size=config['src_vocab_size']))
    
    # Add early stopping based on bleu
    if config['val_set'] is not None:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              after_training=True,
                              #after_epoch=True))
                              every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")
    
    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))
       
    
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )
    
    # Train!
    main_loop.run()
Example #9
softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim)))
softmax_out = softmax_out.reshape(shape)
softmax_out.name = 'softmax_out'

# comparing only last time-step
cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1])
cost.name = 'CrossEntropy'
error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1])
error_rate.name = 'error_rate'

# Initialization
for brick in (x_to_h1, h1_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()
rnn.weights_init = Identity()
rnn.biases_init = Constant(0)
rnn.initialize()

print 'Building training process...'
algorithm = GradientDescent(cost=cost,
                            parameters=ComputationGraph(cost).parameters,
                            step_rule=learning_algorithm(
                                learning_rate=1e-6,
                                momentum=0.0,
                                clipping_threshold=1.0,
                                algorithm='adam'))

train_stream, valid_stream = MNIST(batch_size=batch_size)

monitor_train_cost = TrainingDataMonitoring([cost, error_rate],
Example #10
def test_identity():
    assert str(Identity(2.0)).endswith(' mult=2.0>')
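For reference, mult simply scales the identity matrix that Identity generates; a small sketch, assuming numpy and blocks.initialization.Identity are imported:

rng = numpy.random.RandomState(0)
print(Identity(2.0).generate(rng, (3, 3)))
# [[ 2.  0.  0.]
#  [ 0.  2.  0.]
#  [ 0.  0.  2.]]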
Example #11
    print "EVALled a thing"
    res = out.shape.eval({x: np.ones((6, 9, 19), dtype=floatX)})
    print res

    ### Identity testing
    from blocks.initialization import Identity, IsotropicGaussian
    from blocks import bricks
    from blocks.bricks import Sigmoid

    dim = 2
    floatX = theano.config.floatX
    x = tensor.tensor3('input')
    gru = GatedRecurrentFull(
        hidden_dim=dim,
        state_to_state_init=Identity(1.),
        #state_to_reset_init=Identity(1.),
        state_to_reset_init=IsotropicGaussian(0.2),
        state_to_update_init=Identity(1.0),
        activation=bricks.Identity(1.0),
        gate_activation=Sigmoid(),
        input_to_state_transform=Linear(
            input_dim=dim,
            output_dim=dim,
            weights_init=Identity(1.0),
            #weights_init=IsotropicGaussian(0.02),
            biases_init=Constant(0.0)),
        input_to_update_transform=Linear(
            input_dim=dim,
            output_dim=dim,
            #weights_init=Constant(0.0),
Example #12
def initialize(self):
    self.model.initialize()
    # re-initialize the recurrent transition matrix to the identity
    Identity().initialize(self.rnn.W_state, self.rnn.rng)
Example #13
def testing_init(brick):
    brick.weights_init = Identity()
    brick.biases_init = Constant(0)
    brick.initialize()
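A sketch of using this helper to make a brick deterministic in a unit test; Linear from blocks.bricks is assumed to be imported:

linear = Linear(input_dim=3, output_dim=3)
testing_init(linear)  # weights become the identity, biases zero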
Example #14
def __init__(self, std=1, mean=0):
    self.gaussian_init = IsotropicGaussian(std=std, mean=mean)
    self.identity = Identity()
Example #15
    def set_up(self, config=None, make_prunable=False):
        """Loads and initializes all the theano variables for the
        training model and the decoding model.
        
        Args:
            config (dict): NMT configuration
        """
        if config:
            self.config = config
        else:
            config = self.config
        # Create Theano variables
        logging.debug('Creating theano variables')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence_mask = tensor.matrix('target_mask')

        # Construct model (fs439: Add NoLookup options)
        if config['dec_layers'] != 1:
            logging.fatal("Only dec_layers=1 supported.")
        logging.debug('Building RNN encoder-decoder')
        if config['src_sparse_feat_map']:
            if config['enc_layers'] != 1:
                logging.fatal("Only enc_layers=1 supported for sparse "
                              "source features.")
            source_sentence = tensor.tensor3('source')
            self.sampling_input = tensor.tensor3('input')
            encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids'])
        else:
            source_sentence = tensor.lmatrix('source')
            self.sampling_input = tensor.lmatrix('input')
            if config['enc_layers'] > 1 and not config['enc_share_weights']:
                encoder = DeepBidirectionalEncoder(
                    config['src_vocab_size'], config['enc_embed'],
                    config['enc_layers'], config['enc_skip_connections'],
                    config['enc_nhids'])
            else:
                encoder = BidirectionalEncoder(config['src_vocab_size'],
                                               config['enc_embed'],
                                               config['enc_layers'],
                                               config['enc_skip_connections'],
                                               config['enc_nhids'])
        if config['trg_sparse_feat_map']:
            target_sentence = tensor.tensor3('target')
            decoder = NoLookupDecoder(
                config['trg_vocab_size'], config['dec_embed'],
                config['dec_nhids'], config['att_nhids'],
                config['maxout_nhids'], config['enc_nhids'] * 2,
                config['attention'], config['dec_attention_sources'],
                config['dec_readout_sources'], config['memory'],
                config['memory_size'], config['seq_len'], config['dec_init'])
        else:
            target_sentence = tensor.lmatrix('target')
            decoder = Decoder(config['trg_vocab_size'],
                              config['dec_embed'],
                              config['dec_nhids'],
                              config['att_nhids'],
                              config['maxout_nhids'],
                              config['enc_nhids'] * 2,
                              config['attention'],
                              config['dec_attention_sources'],
                              config['dec_readout_sources'],
                              config['memory'],
                              config['memory_size'],
                              config['seq_len'],
                              config['dec_init'],
                              make_prunable=make_prunable)
        if config['annotations'] != 'direct':
            annotators = []
            add_direct = False
            for name in config['annotations'].split(','):
                if name == 'direct':
                    add_direct = True
                elif name == 'hierarchical':
                    annotators.append(HierarchicalAnnotator(encoder))
                else:
                    logging.fatal("Annotation strategy %s unknown" % name)
            encoder = EncoderWithAnnotators(encoder, annotators, add_direct)
        annotations, annotations_mask = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        self.cost = decoder.cost(annotations, annotations_mask,
                                 target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        self.cg = ComputationGraph(self.cost)

        # Initialize model
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = Identity()
        #        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        #            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        try:
            #            encoder.bidir.prototype.weights_init = Orthogonal()
            encoder.bidir.prototype.weights_init = Identity()
        except AttributeError:
            pass  # It's fine, no bidirectional encoder
        decoder.transition.weights_init = Identity()
        #        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in self.cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            self.cg = apply_dropout(self.cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            enc_params = []
            if encoder.lookup:
                enc_params += Selector(
                    encoder.lookup).get_parameters().values()
            enc_params += Selector(encoder.fwd_fork).get_parameters().values()
            enc_params += Selector(encoder.back_fork).get_parameters().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_parameters().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_parameters().values()
            self.cg = apply_noise(self.cg, enc_params + dec_params,
                                  config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in self.cg.parameters]
        logging.debug("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.debug('    {!s:15}: {}'.format(shape, count))
        logging.debug("Total number of CG parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.debug("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.debug('    {!s:15}: {}'.format(value.get_value().shape,
                                                   name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")
        self.training_model = Model(self.cost)

        logging.info("Building sampling model")
        src_shape = (self.sampling_input.shape[-2],
                     self.sampling_input.shape[-1])  # batch_size x sen_length
        sampling_representation, _ = encoder.apply(self.sampling_input,
                                                   tensor.ones(src_shape))
        generated = decoder.generate(src_shape, sampling_representation)
        self.search_model = Model(generated)
        generated_outputs = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        self.samples = generated_outputs[1]
        self.encoder = encoder
        self.decoder = decoder
Example #16
def main_run(_config, _log):
    from collections import namedtuple
    c = namedtuple("Config", _config.keys())(*_config.values())

    _log.info("Running with" + str(_config))

    import theano
    from theano import tensor as T
    import numpy as np

    from dataset import IMDBText, GloveTransformer

    from blocks.initialization import Uniform, Constant, IsotropicGaussian, NdarrayInitialization, Identity, Orthogonal
    from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent
    from blocks.bricks.parallel import Fork

    from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier
    from blocks import bricks

    from blocks.extensions import Printing, Timing
    from blocks.extensions.monitoring import (DataStreamMonitoring,
                                              TrainingDataMonitoring)

    from blocks.extensions.plot import Plot
    from plot import PlotHistogram

    from blocks.algorithms import GradientDescent, Adam, Scale, StepClipping, CompositeRule, AdaDelta
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.main_loop import MainLoop
    from blocks.model import Model

    from cuboid.algorithms import AdaM, NAG
    from cuboid.extensions import EpochProgress

    from fuel.streams import DataStream, ServerDataStream
    from fuel.transformers import Padding

    from fuel.schemes import ShuffledScheme
    from Conv1D import Conv1D, MaxPooling1D
    from schemes import BatchwiseShuffledScheme
    from bricks import WeightedSigmoid, GatedRecurrentFull

    from multiprocessing import Process
    import fuel
    import logging
    from initialization import SumInitialization

    from transformers import DropSources
    global train_p
    global test_p

    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #x = x+m.mean()*0

    dropout_variables = []
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    gloveMapping = Linear(
        input_dim=embedding_size,
        output_dim=c.rnn_input_dim,
        weights_init=Orthogonal(),
        #weights_init = IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="gloveMapping")
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="gloveRec").apply(o)
    dropout_variables.append(o)

    summed_mapped_glove = o.sum(axis=1)  # take out the sequence
    glove_out = Linear(input_dim=c.rnn_input_dim,
                       output_dim=1,
                       weights_init=IsotropicGaussian(c.wstd),
                       biases_init=Constant(0.0),
                       name="mapping_to_output")
    glove_out.initialize()
    deeply_sup_0 = glove_out.apply(summed_mapped_glove)
    deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0)

    input_dim = c.rnn_input_dim
    hidden_dim = c.rnn_dim

    gru = GatedRecurrentFull(
        hidden_dim=hidden_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=SumInitialization(
            [Identity(1.0), IsotropicGaussian(c.wstd)]),
        state_to_reset_init=IsotropicGaussian(c.wstd),
        state_to_update_init=IsotropicGaussian(c.wstd),
        input_to_state_transform=Linear(input_dim=input_dim,
                                        output_dim=hidden_dim,
                                        weights_init=IsotropicGaussian(c.wstd),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-2.0)),
            biases_init=Constant(-1.0)),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-3.0))
            biases_init=Constant(-2.0)))
    gru.initialize()
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]

    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)
    dropout_variables.append(o)

    score_layer = Linear(input_dim=hidden_dim,
                         output_dim=1,
                         weights_init=IsotropicGaussian(std=c.wstd),
                         biases_init=Constant(0.),
                         name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    #probs = deeply_sup_probs
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    #cost_deeply_sup0 = - (y * T.log(deeply_sup_probs) + (1-y) * T.log(1 - deeply_sup_probs)).mean()
    # cost += cost_deeply_sup0 * c.deeply_factor

    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #})
    #print rnn_out.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            Adam(learning_rate=0.002, beta1=0.1, beta2=0.001),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'cuda0_train': 5557,
        'cuda0_test': 5558,
        'opencl0:0_train': 5557,
        'opencl0:0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    #batch_size = 16
    #batch_size = 32
    batch_size = 40

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train,
                                         batch_size=batch_size)

        stream = DataStream(dataset=dataset, iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            #mask_sources=('features',)
            mask_sources=('features', ))

        padded = DropSources(padded, ['features_mask'])

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    #train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    #test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    train_stream = ServerDataStream(('features', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'targets'), port=test_port)

    print "setting up model"
    #ipdb.set_trace()

    n_examples = 25000
    print "Batches per epoch", n_examples // (batch_size + 1)
    batches_extensions = 100
    monitor_rate = 50
    #======
    model = Model(cg.outputs[0])
    extensions = []
    extensions.append(
        EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(
        TrainingDataMonitoring(
            [cost, misclassification],
            prefix='train',
            every_n_batches=monitor_rate,
        ))

    extensions.append(
        DataStreamMonitoring([cost, misclassification],
                             data_stream=test_stream,
                             prefix='test',
                             after_epoch=True,
                             before_first_epoch=False))

    extensions.append(Timing())
    extensions.append(Printing())

    #extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    #extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))

    #extensions.append(PlotHistogram(
    #channels=['train_state_to_state'],
    #bins=50,
    #every_n_batches=30))

    extensions.append(
        Plot(theano.config.device + "_result",
             channels=[['train_cost'], ['train_misclassification']],
             every_n_batches=monitor_rate))

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()