# ---- Example 1 ----
def main():
    """Build and compile the raw-GloVe GRU sentiment graph for debugging.

    Feeds (vaguely normalized) 300-d sequence features directly into a
    fully-parameterised GRU, scores the last hidden state with a single
    logistic unit, compiles the binary cross-entropy cost with Theano,
    then stops in ipdb and dumps an optimized-graph plot.  Python 2
    code; relies on module-level imports (theano ``T``, blocks bricks,
    cuboid ``AdaM``, ``ipdb``) that are outside this chunk.
    """
    # Symbolic inputs -- presumably x is (batch, time, 300) floats and y
    # holds 0/1 integer targets (see the commented shape evals below);
    # confirm against the data stream.
    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #x = x+m.mean()*0

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"
    wstd = 0.02  # std-dev for the score layer's Gaussian weight init

    #vaguely normalize
    x = x / 3.0 - .5

    #gloveMapping = Linear(
            #input_dim = embedding_size,
            #output_dim = 128,
            #weights_init = Orthogonal(),
            #biases_init = Constant(0.0),
            #name="gloveMapping"
            #)
    #gloveMapping.initialize()
    #o = gloveMapping.apply(x)
    #o = Rectifier(name="gloveRec").apply(o)
    # No input projection: raw GloVe embeddings go straight into the RNN.
    o = x
    input_dim = 300

    # GRU with every input/state transform wired up explicitly; all
    # weights drawn from N(0, 0.02), biases zero.
    gru = GatedRecurrentFull(
            hidden_dim = input_dim,
            activation=Tanh(),
            #activation=bricks.Identity(),
            gate_activation=Sigmoid(),
            state_to_state_init=IsotropicGaussian(0.02),
            state_to_reset_init=IsotropicGaussian(0.02),
            state_to_update_init=IsotropicGaussian(0.02),
            input_to_state_transform = Linear(
                input_dim=input_dim,
                output_dim=input_dim,
                weights_init=IsotropicGaussian(0.02),
                biases_init=Constant(0.0)),
            input_to_update_transform = Linear(
                input_dim=input_dim,
                output_dim=input_dim,
                weights_init=IsotropicGaussian(0.02),
                biases_init=Constant(0.0)),
            input_to_reset_transform = Linear(
                input_dim=input_dim,
                output_dim=input_dim,
                weights_init=IsotropicGaussian(0.02),
                biases_init=Constant(0.0))
            )
    gru.initialize()
    # Recurrent bricks expect time-major input: (time, batch, dim).
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    # Name the recurrent weight matrix so it can be monitored by name.
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]  # last time step's hidden state, (batch, dim)

    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
        #x: np.ones((3, 101, 300), dtype=theano.config.floatX), 
        #m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)

    # Map the final hidden state to one logit, then squash to a
    # probability.
    score_layer = Linear(
            input_dim = 300,
            output_dim = 1,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.),
            use_bias=True,
            name="linear_score")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    # Binary cross-entropy, averaged over the batch.
    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    # Fraction of examples on the wrong side of the 0.5 threshold.
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
            #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
                #})
    #print rnn_out.shape.eval(
            #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()


    # =================

    cg = ComputationGraph([cost])
    #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    # Suffix each parameter name with its first annotation's name --
    # presumably the owning brick; verify against blocks' tagging.
    for p in params:
        p.name += "___" + p.tag.annotations[0].name

    algorithm = GradientDescent(
            cost = cg.outputs[0],
            params=params,
            step_rule = CompositeRule([
                StepClipping(threshold=4),
                AdaM(),
                #NAG(lr=0.1, momentum=0.9),
                #AdaDelta(),
                ])

            )

    #algorithm.initialize()
    print params
    # Compile the cost only (no updates) and break for interactive
    # inspection -- this main() is a debugging harness, not a trainer.
    f = theano.function([x, y], algorithm.cost)
    ipdb.set_trace()

    print "making plots"
    #theano.printing.pydotprint(algorithm.cost, outfile='unopt.png')
    theano.printing.pydotprint(f , outfile='opt.png', scan_graphs=True)
# ---- Example 2 ----
def main_run(_config, _log):
    from collections import namedtuple

    c = namedtuple("Config", _config.keys())(*_config.values())

    _log.info("Running with" + str(_config))

    import theano
    from theano import tensor as T
    import numpy as np

    from dataset import IMDBText, GloveTransformer

    from blocks.initialization import Uniform, Constant, IsotropicGaussian, NdarrayInitialization, Identity, Orthogonal
    from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent
    from blocks.bricks.parallel import Fork

    from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier
    from blocks import bricks

    from blocks.extensions import Printing, Timing
    from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring

    from blocks.extensions.plot import Plot
    from plot import PlotHistogram

    from blocks.algorithms import GradientDescent, Adam, Scale, StepClipping, CompositeRule, AdaDelta
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.main_loop import MainLoop
    from blocks.model import Model

    from cuboid.algorithms import AdaM, NAG
    from cuboid.extensions import EpochProgress

    from fuel.streams import DataStream, ServerDataStream
    from fuel.transformers import Padding

    from fuel.schemes import ShuffledScheme
    from Conv1D import Conv1D, MaxPooling1D
    from schemes import BatchwiseShuffledScheme
    from bricks import WeightedSigmoid, GatedRecurrentFull

    from multiprocessing import Process
    import fuel
    import logging
    from initialization import SumInitialization

    from transformers import DropSources

    global train_p
    global test_p

    x = T.tensor3("features")
    # m = T.matrix('features_mask')
    y = T.imatrix("targets")

    # x = x+m.mean()*0

    dropout_variables = []
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    # embedding_size = 50
    # glove_version = "vectors.6B.50d.txt"

    gloveMapping = Linear(
        input_dim=embedding_size,
        output_dim=c.rnn_input_dim,
        weights_init=Orthogonal(),
        # weights_init = IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="gloveMapping",
    )
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="gloveRec").apply(o)
    dropout_variables.append(o)

    summed_mapped_glove = o.sum(axis=1)  # take out the sequence
    glove_out = Linear(
        input_dim=c.rnn_input_dim,
        output_dim=1.0,
        weights_init=IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="mapping_to_output",
    )
    glove_out.initialize()
    deeply_sup_0 = glove_out.apply(summed_mapped_glove)
    deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0)

    input_dim = c.rnn_input_dim
    hidden_dim = c.rnn_dim

    gru = GatedRecurrentFull(
        hidden_dim=hidden_dim,
        activation=Tanh(),
        # activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=SumInitialization([Identity(1.0), IsotropicGaussian(c.wstd)]),
        state_to_reset_init=IsotropicGaussian(c.wstd),
        state_to_update_init=IsotropicGaussian(c.wstd),
        input_to_state_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            biases_init=Constant(0.0),
        ),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            # biases_init=Constant(-2.0)),
            biases_init=Constant(-1.0),
        ),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            # biases_init=Constant(-3.0))
            biases_init=Constant(-2.0),
        ),
    )
    gru.initialize()
    rnn_in = o.dimshuffle(1, 0, 2)
    # rnn_in = o
    # rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    # o = rnn_out[-1, :, :]
    o = rnn_out[-1]

    # o = rnn_out[:, -1, :]
    # o = rnn_out.mean(axis=1)

    # print rnn_last_out.eval({
    # x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    # m: np.ones((3, 101), dtype=theano.config.floatX)})
    # raw_input()
    # o = rnn_out.mean(axis=1)
    dropout_variables.append(o)

    score_layer = Linear(
        input_dim=hidden_dim,
        output_dim=1,
        weights_init=IsotropicGaussian(std=c.wstd),
        biases_init=Constant(0.0),
        name="linear2",
    )
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    # probs = deeply_sup_probs
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    # cost_deeply_sup0 = - (y * T.log(deeply_sup_probs) + (1-y) * T.log(1 - deeply_sup_probs)).mean()
    # cost += cost_deeply_sup0 * c.deeply_factor

    cost.name = "cost"
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = "misclassification"

    # print rnn_in.shape.eval(
    # {x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    # })
    # print rnn_out.shape.eval(
    # {x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    # m : np.ones((45, 111), dtype=theano.config.floatX)})
    # print (m).sum(axis=1).shape.eval({
    # m : np.ones((45, 111), dtype=theano.config.floatX)})
    # print (m).shape.eval({
    # m : np.ones((45, 111), dtype=theano.config.floatX)})
    # raw_input()

    # =================

    cg = ComputationGraph([cost])
    cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule(
            [
                StepClipping(threshold=4),
                Adam(learning_rate=0.002, beta1=0.1, beta2=0.001),
                # NAG(lr=0.1, momentum=0.9),
                # AdaDelta(),
            ]
        ),
    )

    # ========
    print "setting up data"
    ports = {
        "gpu0_train": 5557,
        "gpu0_test": 5558,
        "cuda0_train": 5557,
        "cuda0_test": 5558,
        "opencl0:0_train": 5557,
        "opencl0:0_test": 5558,
        "gpu1_train": 5559,
        "gpu1_test": 5560,
    }

    # batch_size = 16
    # batch_size = 32
    batch_size = 40

    def start_server(port, which_set):
        fuel.server.logger.setLevel("WARN")
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        # scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)

        stream = DataStream(dataset=dataset, iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            # mask_sources=('features',)
            mask_sources=("features",),
        )

        padded = DropSources(padded, ["features_mask"])

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + "_train"]
    train_p = Process(target=start_server, args=(train_port, "train"))
    train_p.start()

    test_port = ports[theano.config.device + "_test"]
    test_p = Process(target=start_server, args=(test_port, "test"))
    test_p.start()

    # train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    # test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    train_stream = ServerDataStream(("features", "targets"), port=train_port)
    test_stream = ServerDataStream(("features", "targets"), port=test_port)

    print "setting up model"
    # ipdb.set_trace()

    n_examples = 25000
    print "Batches per epoch", n_examples // (batch_size + 1)
    batches_extensions = 100
    monitor_rate = 50
    # ======
    model = Model(cg.outputs[0])
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring([cost, misclassification], prefix="train", every_n_batches=monitor_rate))

    extensions.append(
        DataStreamMonitoring(
            [cost, misclassification],
            data_stream=test_stream,
            prefix="test",
            after_epoch=True,
            before_first_epoch=False,
        )
    )

    extensions.append(Timing())
    extensions.append(Printing())

    # extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    # extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))

    # extensions.append(PlotHistogram(
    # channels=['train_state_to_state'],
    # bins=50,
    # every_n_batches=30))

    extensions.append(
        Plot(
            theano.config.device + "_result",
            channels=[["train_cost"], ["train_misclassification"]],
            every_n_batches=monitor_rate,
        )
    )

    main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions)
    main_loop.run()
# ---- Example 3 ----
def main():
    """Build and compile the raw-GloVe GRU sentiment graph for debugging.

    Feeds (vaguely normalized) 300-d sequence features directly into a
    fully-parameterised GRU, scores the last hidden state with a single
    logistic unit, compiles the binary cross-entropy cost with Theano,
    then stops in ipdb and dumps an optimized-graph plot.  Python 2
    code; relies on module-level imports (theano `T`, blocks bricks,
    cuboid `AdaM`, `ipdb`) that are outside this chunk.
    """
    # Symbolic inputs -- presumably x is (batch, time, 300) floats and y
    # holds 0/1 integer targets (see the commented shape evals below);
    # confirm against the data stream.
    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #x = x+m.mean()*0

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"
    wstd = 0.02  # std-dev for the score layer's Gaussian weight init

    #vaguely normalize
    x = x / 3.0 - .5

    #gloveMapping = Linear(
    #input_dim = embedding_size,
    #output_dim = 128,
    #weights_init = Orthogonal(),
    #biases_init = Constant(0.0),
    #name="gloveMapping"
    #)
    #gloveMapping.initialize()
    #o = gloveMapping.apply(x)
    #o = Rectifier(name="gloveRec").apply(o)
    # No input projection: raw GloVe embeddings go straight into the RNN.
    o = x
    input_dim = 300

    # GRU with every input/state transform wired up explicitly; all
    # weights drawn from N(0, 0.02), biases zero.
    gru = GatedRecurrentFull(
        hidden_dim=input_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=IsotropicGaussian(0.02),
        state_to_reset_init=IsotropicGaussian(0.02),
        state_to_update_init=IsotropicGaussian(0.02),
        input_to_state_transform=Linear(input_dim=input_dim,
                                        output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(input_dim=input_dim,
                                         output_dim=input_dim,
                                         weights_init=IsotropicGaussian(0.02),
                                         biases_init=Constant(0.0)),
        input_to_reset_transform=Linear(input_dim=input_dim,
                                        output_dim=input_dim,
                                        weights_init=IsotropicGaussian(0.02),
                                        biases_init=Constant(0.0)))
    gru.initialize()
    # Recurrent bricks expect time-major input: (time, batch, dim).
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    # Name the recurrent weight matrix so it can be monitored by name.
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]  # last time step's hidden state, (batch, dim)

    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)

    # Map the final hidden state to one logit, then squash to a
    # probability.
    score_layer = Linear(input_dim=300,
                         output_dim=1,
                         weights_init=IsotropicGaussian(std=wstd),
                         biases_init=Constant(0.),
                         use_bias=True,
                         name="linear_score")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    # Binary cross-entropy, averaged over the batch.
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    # Fraction of examples on the wrong side of the 0.5 threshold.
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #})
    #print rnn_out.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    #cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters
    # Suffix each parameter name with its first annotation's name --
    # presumably the owning brick; verify against blocks' tagging.
    for p in params:
        p.name += "___" + p.tag.annotations[0].name

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    #algorithm.initialize()
    print params
    # Compile the cost only (no updates) and break for interactive
    # inspection -- this main() is a debugging harness, not a trainer.
    f = theano.function([x, y], algorithm.cost)
    ipdb.set_trace()

    print "making plots"
    #theano.printing.pydotprint(algorithm.cost, outfile='unopt.png')
    theano.printing.pydotprint(f, outfile='opt.png', scan_graphs=True)
# ---- Example 4 ----
def main_run(_config, _log):
    from collections import namedtuple
    c = namedtuple("Config", _config.keys())(*_config.values())

    _log.info("Running with" + str(_config))

    import theano
    from theano import tensor as T
    import numpy as np

    from dataset import IMDBText, GloveTransformer

    from blocks.initialization import Uniform, Constant, IsotropicGaussian, NdarrayInitialization, Identity, Orthogonal
    from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent
    from blocks.bricks.parallel import Fork

    from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier
    from blocks import bricks

    from blocks.extensions import Printing, Timing
    from blocks.extensions.monitoring import (DataStreamMonitoring,
                                              TrainingDataMonitoring)

    from blocks.extensions.plot import Plot
    from plot import PlotHistogram

    from blocks.algorithms import GradientDescent, Adam, Scale, StepClipping, CompositeRule, AdaDelta
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.main_loop import MainLoop
    from blocks.model import Model

    from cuboid.algorithms import AdaM, NAG
    from cuboid.extensions import EpochProgress

    from fuel.streams import DataStream, ServerDataStream
    from fuel.transformers import Padding

    from fuel.schemes import ShuffledScheme
    from Conv1D import Conv1D, MaxPooling1D
    from schemes import BatchwiseShuffledScheme
    from bricks import WeightedSigmoid, GatedRecurrentFull

    from multiprocessing import Process
    import fuel
    import logging
    from initialization import SumInitialization

    from transformers import DropSources
    global train_p
    global test_p

    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #x = x+m.mean()*0

    dropout_variables = []
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    gloveMapping = Linear(
        input_dim=embedding_size,
        output_dim=c.rnn_input_dim,
        weights_init=Orthogonal(),
        #weights_init = IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="gloveMapping")
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="gloveRec").apply(o)
    dropout_variables.append(o)

    summed_mapped_glove = o.sum(axis=1)  # take out the sequence
    glove_out = Linear(input_dim=c.rnn_input_dim,
                       output_dim=1.0,
                       weights_init=IsotropicGaussian(c.wstd),
                       biases_init=Constant(0.0),
                       name="mapping_to_output")
    glove_out.initialize()
    deeply_sup_0 = glove_out.apply(summed_mapped_glove)
    deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0)

    input_dim = c.rnn_input_dim
    hidden_dim = c.rnn_dim

    gru = GatedRecurrentFull(
        hidden_dim=hidden_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=SumInitialization(
            [Identity(1.0), IsotropicGaussian(c.wstd)]),
        state_to_reset_init=IsotropicGaussian(c.wstd),
        state_to_update_init=IsotropicGaussian(c.wstd),
        input_to_state_transform=Linear(input_dim=input_dim,
                                        output_dim=hidden_dim,
                                        weights_init=IsotropicGaussian(c.wstd),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-2.0)),
            biases_init=Constant(-1.0)),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-3.0))
            biases_init=Constant(-2.0)))
    gru.initialize()
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]

    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)
    dropout_variables.append(o)

    score_layer = Linear(input_dim=hidden_dim,
                         output_dim=1,
                         weights_init=IsotropicGaussian(std=c.wstd),
                         biases_init=Constant(0.),
                         name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    #probs = deeply_sup_probs
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    #cost_deeply_sup0 = - (y * T.log(deeply_sup_probs) + (1-y) * T.log(1 - deeply_sup_probs)).mean()
    # cost += cost_deeply_sup0 * c.deeply_factor

    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #})
    #print rnn_out.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            Adam(learning_rate=0.002, beta1=0.1, beta2=0.001),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'cuda0_train': 5557,
        'cuda0_test': 5558,
        'opencl0:0_train': 5557,
        'opencl0:0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    #batch_size = 16
    #batch_size = 32
    batch_size = 40

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train,
                                         batch_size=batch_size)

        stream = DataStream(dataset=dataset, iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            #mask_sources=('features',)
            mask_sources=('features', ))

        padded = DropSources(padded, ['features_mask'])

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    #train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    #test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    train_stream = ServerDataStream(('features', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'targets'), port=test_port)

    print "setting up model"
    #ipdb.set_trace()

    n_examples = 25000
    print "Batches per epoch", n_examples // (batch_size + 1)
    batches_extensions = 100
    monitor_rate = 50
    #======
    model = Model(cg.outputs[0])
    extensions = []
    extensions.append(
        EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(
        TrainingDataMonitoring(
            [cost, misclassification],
            prefix='train',
            every_n_batches=monitor_rate,
        ))

    extensions.append(
        DataStreamMonitoring([cost, misclassification],
                             data_stream=test_stream,
                             prefix='test',
                             after_epoch=True,
                             before_first_epoch=False))

    extensions.append(Timing())
    extensions.append(Printing())

    #extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    #extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))

    #extensions.append(PlotHistogram(
    #channels=['train_state_to_state'],
    #bins=50,
    #every_n_batches=30))

    extensions.append(
        Plot(theano.config.device + "_result",
             channels=[['train_cost'], ['train_misclassification']],
             every_n_batches=monitor_rate))

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()