Example 1
def main(save_to, num_epochs, bokeh=False):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             DataStream(mnist_test,
                                        iteration_scheme=SequentialScheme(
                                            mnist_test.num_examples, 500)),
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if bokeh:
        extensions.append(
            Plot('MNIST example',
                 channels=[[
                     'test_final_cost',
                     'test_misclassificationrate_apply_error_rate'
                 ], ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         DataStream(mnist_train,
                                    iteration_scheme=SequentialScheme(
                                        mnist_train.num_examples, 50)),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
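
A minimal command-line entry point for the function above could look like the sketch below. The argparse wiring is an assumption added for illustration and is not part of the original example; only the three parameters of main() come from the code above.

# Hypothetical entry point for Example 1 (illustration only, not in the original).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train an MLP on MNIST with Blocks")
    parser.add_argument("save_to", nargs="?", default="mnist.pkl",
                        help="destination path for the Checkpoint extension")
    parser.add_argument("--num-epochs", type=int, default=2,
                        help="number of epochs before FinishAfter stops training")
    parser.add_argument("--bokeh", action="store_true",
                        help="enable the live Bokeh Plot extension")
    args = parser.parse_args()
    main(args.save_to, args.num_epochs, bokeh=args.bokeh)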
Example 2
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name,
         max_length, bokeh, GRU, dropout, depth, max_grad, step_method,
         epsilon, sample):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [
        name for name in transition.apply.sequences if 'mask' not in name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    model_size = 0
    for v in params.values():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # Alternatively, load only the parameter values (without the training state):
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # All monitored data is ready at this point, so print it now;
        # the extensions below may take longer, and we want to see the
        # results as soon as possible.
        Printing(every_n_batches=10),
        # Dump at two different intervals so that if one dump breaks
        # (e.g. contains NaNs) we can hopefully recover a model from a
        # few batches earlier in the other.
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
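
Example 2 relies on two helpers, _transpose and _is_nan, that are defined elsewhere in that project. A minimal sketch of what they plausibly look like, inferred from how they are used above (batch transposition for the Mapping transformer, and a NaN stopping condition for FinishAfter), is given below; treat both bodies as assumptions rather than the original definitions.

# Hypothetical helpers assumed by Example 2 (sketch only).
import math
import numpy as np

def _transpose(data):
    # Fuel yields batches as (batch, steps, 3); the recurrent transition expects
    # (steps, batch, 3), so swap the first two axes of every source in the batch.
    return tuple(np.swapaxes(array, 0, 1) for array in data)

def _is_nan(log):
    # FinishAfter condition: stop training once the monitored gradient norm
    # turns into NaN (attribute access as in Example 8; the exact field name
    # depends on the Blocks version and the monitoring prefix).
    return math.isnan(log.current_row.total_gradient_norm)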
Example 3
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True):
    lr = theano.shared(float32(0.1))

    probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask)
    cost = NegativeLogLikelihood().apply(y, probs, y_mask)

    error_rate = MismulitclassificationRate().apply(y, probs, y_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'final_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost,
                                params=cg.parameters)
    extensions = [
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.7 * v)
                               if n % 700 == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))

    if bokeh:
        extensions.append(
            Plot(
                'Pretrain RNN',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_rbm_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
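
Examples 3 and 4 wrap their learning-rate and CD-k shared variables with a float32 helper that is not shown; presumably it just casts to a NumPy float32 scalar so that the shared variable matches Theano's floatX. A one-line sketch, under that assumption:

# Hypothetical helper assumed by Examples 3 and 4 (not in the original source).
import numpy as np

def float32(value):
    # Learning-rate shared variables must be float32 when floatX is float32.
    return np.float32(value)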
Example 4
def train_rnnrbm(train,
                 rnnrbm,
                 epochs=1000,
                 test=None,
                 bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])  # Scale(0.01)
    gradients = dict(
        equizip(cg.parameters,
                T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule,
                                gradients=gradients,
                                cost=cost,
                                params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot(
                'Training RNN-RBM',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_final_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
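
Example 4 also consults a global rnnrbm_cdk dictionary (not shown) that maps a batch number to a new number of contrastive-divergence steps: the first SharedVariableModifier raises cdk whenever the current batch index is a key of that dictionary. A purely illustrative schedule, under that assumption, might be:

# Hypothetical CD-k schedule assumed by Example 4 (illustration only):
# after the given number of batches, increase the number of Gibbs steps.
rnnrbm_cdk = {
    1000: 15,
    5000: 20,
    20000: 25,
}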
Example 5
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    o = x.sum(axis=1) + m.mean() * 0

    score_layer = Linear(input_dim=300,
                         output_dim=1,
                         weights_init=IsotropicGaussian(std=0.02),
                         biases_init=Constant(0.),
                         name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(cost=cg.outputs[0],
                                params=params,
                                step_rule=CompositeRule([
                                    StepClipping(threshold=4),
                                    AdaM(),
                                ]))

    # ========
    print "setting up data"
    ports = {
            'gpu0_train' : 5557,
            'gpu0_test' : 5558,
            'gpu1_train' : 5559,
            'gpu1_test' : 5560,
            }

    #batch_size = 16
    batch_size = 16
    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size)

        stream = DataStream(
                dataset=dataset,
                iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
                data_stream=glove,
                mask_sources=('features',)
                )

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    print "setting up model"

    n_examples = 25000
    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [
            cost,
            misclassification,
            ],
        prefix='train',
        after_epoch=True
        ))

    #extensions.append(DataStreamMonitoring(
        #[cost, misclassification],
        #data_stream=test_stream,
        #prefix='test',
        #after_epoch=True
        #))
    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(Plot(
        theano.config.device+"_result",
        channels=[['train_cost']],
        after_epoch=True
        ))


    main_loop = MainLoop(
            model=model,
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
Example 6
    def start(self):
        x = T.matrix('features', config.floatX)
        y = T.imatrix('targets')

        self.x = x

        DIMS = [108 * 5, 1000, 1000, 1000, 1000, 1943]
        NUMS = [1, 1, 1, 1, 1, 1]
        FUNCS = [
            Rectifier,
            Rectifier,
            Rectifier,
            Rectifier,
            # Rectifier,
            # Maxout(num_pieces=5),
            # Maxout(num_pieces=5),
            # Maxout(num_pieces=5),
            # SimpleRecurrent,
            # SimpleRecurrent,
            # SimpleRecurrent,
            Softmax,
        ]

        def lllistool(i, inp, func):
            l = Linear(input_dim=DIMS[i],
                       output_dim=DIMS[i + 1] * NUMS[i + 1],
                       weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                       biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)),
                       name='Lin{}'.format(i))
            l.initialize()
            func.name = 'Fun{}'.format(i)
            if func == SimpleRecurrent:
                gong = func(dim=DIMS[i + 1],
                            activation=Rectifier(),
                            weights_init=IsotropicGaussian(
                                std=(DIMS[i] + DIMS[i + 1])**(-0.5)))
            else:
                gong = func()
            ret = gong.apply(l.apply(inp))
            return ret

        oup = x
        for i in range(len(DIMS) - 1):
            oup = lllistool(i, oup, FUNCS[i])
        y_hat = oup

        self.y_hat_prob = y_hat

        cost = CategoricalCrossEntropy().apply(y.flatten(),
                                               y_hat).astype(config.floatX)

        cg = ComputationGraph(cost)
        orig_cg = cg
        ips = VariableFilter(roles=[INPUT])(cg.variables)
        ops = VariableFilter(roles=[OUTPUT])(cg.variables)
        cg = apply_dropout(cg, ips[0:2:1], 0.2)
        cg = apply_dropout(cg, ips[2:-2:1], 0.5)
        cost = cg.outputs[0]

        cost.name = 'cost'

        # mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
        mps = theano.shared(np.array([ph2id(state239(t))
                                      for t in range(1943)]))
        z_hat = T.argmax(y_hat, axis=1)

        y39, _ = scan(fn=lambda t: mps[t],
                      outputs_info=None,
                      sequences=[y.flatten()])
        y_hat39, _ = scan(fn=lambda t: mps[t],
                          outputs_info=None,
                          sequences=[z_hat])

        self.y_hat39 = y_hat39

        lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(
            config.floatX)
        lost01.name = '0/1 loss'
        lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(
            config.floatX)
        #lost23 = MisclassificationRate().apply(y39, y_hat39).astype(config.floatX)
        lost23.name = '2/3 loss'

        Ws = VariableFilter(roles=[WEIGHT])(cg.variables)
        norms = sum(w.norm(2) for w in Ws)
        norms.name = 'norms'
        path = pjoin(PATH['fuel'], pfx + '_train.hdf5')
        data = H5PYDataset(path,
                           which_set='train',
                           load_in_memory=True,
                           subset=slice(0, 100000))
        # data = H5PYDataset(path, which_set='train', load_in_memory=True)
        data_v = H5PYDataset(pjoin(PATH['fuel'], pfx + '_validate.hdf5'),
                             which_set='validate',
                             load_in_memory=True)
        num = data.num_examples
        data_stream = DataStream(data,
                                 iteration_scheme=ShuffledScheme(
                                     num, batch_size=128))
        data_stream_v = DataStream(data_v,
                                   iteration_scheme=SequentialScheme(
                                       data_v.num_examples, batch_size=128))
        algo = GradientDescent(cost=cost,
                               params=cg.parameters,
                               step_rule=CompositeRule([Momentum(0.002, 0.9)]))
        monitor = DataStreamMonitoring(variables=[cost, lost01, norms],
                                       data_stream=data_stream)
        monitor_v = DataStreamMonitoring(variables=[lost23],
                                         data_stream=data_stream_v)
        plt = Plot('AlpAlpAlp',
                   channels=[['0/1 loss', '2/3 loss']],
                   after_epoch=True)
        main_loop = MainLoop(data_stream=data_stream,
                             algorithm=algo,
                             extensions=[
                                 monitor, monitor_v,
                                 FinishAfter(after_n_epochs=2000),
                                 Printing(), plt
                             ])

        main_loop.run()
Example 7
    def start(self):
        xx = T.matrix('features', config.floatX)
        yy = T.imatrix('targets')
        zm = BNUM*(xx.shape[0]//BNUM)
        x = xx[:zm].reshape((BNUM, zm//BNUM, xx.shape[1])).dimshuffle(1, 0, 2)
        y = yy[:zm].reshape((BNUM, zm//BNUM)).dimshuffle(1, 0)
        # x = xx[:zm].reshape((zm//16, 16, xx.shape[1]))
        # y = yy[:zm].reshape((zm//16, 16))

        DIMS = [108*5, 200, 200, 200, LABEL]
        NUMS = [1, 1, 1, 1, 1]
        # DIMS = [108*5, 48]
        # NUMS = [1, 1]
        FUNCS = [
            Rectifier, 
            Rectifier, 
            Rectifier, 
            # Rectifier, 
            # Rectifier, 
            # Maxout(num_pieces=5),
            # Maxout(num_pieces=5),
            # Maxout(num_pieces=5),
            # SimpleRecurrent,
            # SimpleRecurrent,
            # SimpleRecurrent,
            # LSTM,
            # LSTM,
            # LSTM,

            # SequenceGenerator,

            # Softmax,
            None,
        ]

        def lllistool(i, inp, func):
            if func == LSTM:
                NUMS[i+1] *= 4
            sdim = DIMS[i]
            if func == SimpleRecurrent or func == LSTM:
                sdim = DIMS[i] + DIMS[i+1]
            l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1], 
                       weights_init=IsotropicGaussian(std=sdim**(-0.5)), 
                       biases_init=IsotropicGaussian(std=sdim**(-0.5)),
                       name='Lin{}'.format(i))
            l.initialize()
            if func == SimpleRecurrent:
                gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
                gong.initialize()
                ret = gong.apply(l.apply(inp))
            elif func == LSTM:
                gong = func(dim=DIMS[i+1], activation=Tanh(), weights_init=IsotropicGaussian(std=sdim**(-0.5)))
                gong.initialize()
                print(inp)
                ret, _ = gong.apply(
                    l.apply(inp), 
                    T.zeros((inp.shape[1], DIMS[i+1])),
                    T.zeros((inp.shape[1], DIMS[i+1])),
                )
            elif func == SequenceGenerator:
                gong = func(
                    readout=None, 
                    transition=SimpleRecurrent(dim=100, activation=Rectifier(), weights_init=IsotropicGaussian(std=0.1)))
                ret = None
            elif func is None:
                ret = l.apply(inp)
            else:
                gong = func()
                ret = gong.apply(l.apply(inp))
            return ret

        oup = x
        for i in range(len(DIMS)-1):
            oup = lllistool(i, oup, FUNCS[i])
        y_hat = oup

        y_rsp = y.reshape((y.shape[0]*y.shape[1],))
        y_dsf_rsp = y.dimshuffle(1, 0).reshape((y.shape[0]*y.shape[1],))
        yh_rsp = y_hat.reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
        yh_dsf_rsp = y_hat.dimshuffle(1, 0, 2).reshape((y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
        sfmx = Softmax().apply(yh_rsp)

        # cost = CategoricalCrossEntropy().apply(y, y_hat).astype(config.floatX)

        # j, wlh = Yimumu(y_hat, y)
        # cost = CategoricalCrossEntropy().apply(y_rsp, sfmx) + j
        cost = CategoricalCrossEntropy().apply(y_rsp, sfmx)
        # cost_p = cost_p.astype(config.floatX)
        # cost = CTC_cost(y, y_hat)
        cost = cost.astype(config.floatX)

        cg = ComputationGraph(cost)
        # cg_p = ComputationGraph(cost_p)
        orig_cg = cg
        ips = VariableFilter(roles=[INPUT])(cg.variables)
        ops = VariableFilter(roles=[OUTPUT])(cg.variables)
        # print(ips, ops)
        # cg = apply_dropout(cg, ips[0:2:1], 0.2)
        # cg = apply_dropout(cg, ips[2:-2:1], 0.5)
        # cost = cg.outputs[0].astype(config.floatX)

        cost.name = 'cost'

        mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)]))
        # yh_dsf_rsp = theano.printing.Print('YapYapYap')(yh_dsf_rsp)
        # z_hat = T.argmax(yh_dsf_rsp[:,:-1], axis=1)
        z_hat = T.argmax(yh_dsf_rsp, axis=1)
        # z_hat = theano.printing.Print('Yap')(z_hat)
        # z_hat = Yimumu_Decode()(y_hat, wlh)
        z_hat_hat = CTC_Decode()(y_hat)

        y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y_dsf_rsp])
        y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat])
        y_hat_hat39 = y_hat39
        # y_hat_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat_hat])
        # trm = TrimOp()(y_hat_hat39)
        # trm = trm[1:1+trm[0]]
        # trm = theano.printing.Print('Trm')(trm)

        lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
        lost01.name = '0/1 loss'
        lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX)
        lost23.name = '2/3 loss'

        edit01 = EditDistance()(y39, y_hat_hat39).astype(config.floatX) #+ T.sum(trm) * 1E-10
        # edit01 = edit01.astype(config.floatX)
        edit01.name = '0/1 edit'
        edit23 = EditDistance()(y39, y_hat_hat39).astype(config.floatX)
        edit23.name = '2/3 edit'


        Ws = cg.parameters
        # Ws = Ws + [wlh]
        print(list(Ws))
        norms = sum(w.norm(2) for w in Ws)
        norms = norms.astype(config.floatX)
        norms.name = 'norms'
        path = pjoin(PATH['fuel'], 'train_train.hdf5')
        data = H5PYDataset(path, which_set='train', load_in_memory=True, subset=slice(0, 100000))
        # data = H5PYDataset(path, which_set='train', load_in_memory=True)
        data_v = H5PYDataset(pjoin(PATH['fuel'], 'train_validate.hdf5'), which_set='validate', load_in_memory=True)
        num = data.num_examples
        data_stream = DataStream(data, iteration_scheme=ShuffledScheme(
                        num, batch_size=SLEN*BNUM))
        data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme(
                        data_v.num_examples, batch_size=SLEN*BNUM))
        algo = GradientDescent(cost=cost, params=Ws, step_rule=CompositeRule([
            Momentum(0.005, 0.9)
            # AdaDelta()
        ]))
        # algo_p = GradientDescent(cost=cost_p, params=cg_p.parameters, step_rule=CompositeRule([
            # Momentum(0.01, 0.9)
            # # AdaDelta()
        # ]))
        monitor = DataStreamMonitoring( variables=[cost, lost01, edit01, norms],
                data_stream=data_stream)
        monitor_v = DataStreamMonitoring( variables=[lost23, edit23],
                data_stream=data_stream_v)
        plt = Plot('AlpYap', channels=[['0/1 loss', '2/3 loss'], ['0/1 edit', '2/3 edit']], after_epoch=True)

        # main_loop_p = MainLoop(data_stream = data_stream, 
                # algorithm=algo_p, 
                # extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=10), Printing(), plt])
        # main_loop_p.run()

        main_loop = MainLoop(data_stream = data_stream, 
                algorithm=algo, 
                extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=2000), Printing(), plt])

        main_loop.run()

        pfile = open('zzz.pkl', 'wb')
        pickle.dump(orig_cg, pfile)
        # pickle.dump(wlh, pfile)
        pfile.close()
        
        ################

        test_feat = np.load(pjoin(PATH['numpy'], 'train_test_features.npy')).astype(config.floatX)
        func = theano.function([xx], y_hat.astype(config.floatX))
        test_hat = []
        for i in range(19):
            tmp = func(test_feat[i*10000:(i+1)*10000])
            tmp = tmp.transpose((1, 0, 2)).reshape((tmp.shape[0]*tmp.shape[1], tmp.shape[2]))
            test_hat.append(tmp)
        test_hat = np.concatenate(test_hat, axis=0)
        test_hat = np.concatenate((test_hat, np.zeros((2, LABEL))), axis=0)

        alpha = T.tensor3(config.floatX)
        beta = alpha.argmax(axis=2)
        # beta = alpha[:,:,:-1].argmax(axis=2)
        # beta = Yimumu_Decode()(alpha, wlh)
        # beta = CTC_Decode()(alpha)
        func2 = theano.function([alpha], beta)

        lens = []
        tags = []
        with shelve.open(SHELVE['test']) as f:
            names = f['names']
            for n in names:
                lens.append(len(f[n]))
                for i in range(lens[-1]):
                    tags.append(n+'_'+str(i+1))

        seq = []
        seq2 = []
        nowcnt = 0
        for i in lens:
            nxt = nowcnt + i
            cur_hat = test_hat[nowcnt:nxt].reshape((i, 1, LABEL)).astype(config.floatX)
            nowcnt = nxt
            fc2 = func2(cur_hat).flatten()
            fc3 = []
            fc4 = []
            for j in fc2:
                fc3.append(ph48239(id2ph(j)))
                fc4.append(ph2c(ph48239(id2ph(j))))
            seq.append(fc3)
            seq2.append(''.join(trim(fc4)))

        seq_flat = np.concatenate(seq)
        with open('hw1_outz.txt', 'w') as f:
            f.write('id,prediction\n')
            for t, i in zip(tags, seq_flat):
                f.write(t+','+i+'\n')

        with open('hw2_outz.txt', 'w') as f:
            f.write('id,phone_sequence\n')
            for n, i in zip(names, seq2):
                f.write(n+','+i+'\n')
Example 8
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99],
                                char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([
            name
            for name in encoder.prototype.apply.sequences if name != 'mask'
        ],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension,
                             dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension,
            name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(VariableFilter(application=readout.readout,
                                         name="output")(cg.variables),
                          singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule([
                                        GradientClipping(10.0),
                                        SteepestDescent(0.01)
                                    ]))

        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(
                    observables, prefix="average", every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
Example 9
    extensions=[
        Timing(),
        TrainingDataMonitoring([m.variational_cost],
                               prefix="train",
                               after_epoch=True),
        DataStreamMonitoring([m.variational_cost], test_stream, prefix="test"),
        DataStreamMonitoring([m.cost], test_stream, prefix="test"),
        # FinishAfter(after_n_epochs=10),
        Printing(),
        # Dump("out.pkl"),
        ExperimentSaver("../VAOutCIFAR", "."),
        EpochProgress(dataset.num_examples // 128),
        Plot('cifar10',
             channels=[[
                 'train_variational_cost', 'test_variational_cost', 'test_cost'
             ]])
    ])
main_loop.run()

_func_noisy = theano.function([m.X], m.noisy)
_func_produced = theano.function([m.X], m.produced)

batch = test_stream.get_epoch_iterator().next()[0]
out_noise = _func_noisy(batch)
out_produced = _func_produced(batch)
import cv2
for k in range(10):
    print out_noise.shape
    img = np.reshape(out_noise[k, :], (28, 28))
    img = cv2.resize(img, (500, 500), interpolation=cv2.INTER_NEAREST)
Example 10
    cg = ComputationGraph(cost)

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    print('Parameter shapes')
    for shape, count in Counter(shapes).most_common():
        print('    {!s:15}: {}'.format(shape, count))

    # Set up training algorithm
    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=CompositeRule(
                                    [StepClipping(10),
                                     AdaDelta()]))

    # Train!
    main_loop = MainLoop(model=Model(cost),
                         algorithm=algorithm,
                         data_stream=masked_stream,
                         extensions=[
                             TrainingDataMonitoring([cost],
                                                    after_every_batch=True),
                             Plot('En-Fr',
                                  channels=[['decoder_cost_cost']],
                                  after_every_batch=True),
                             Printing(after_every_batch=True),
                             Checkpoint('model.pkl', every_n_batches=2048)
                         ])
    main_loop.run()
Example 11
        #data_stream=test_stream,
        #prefix='test',
        #after_epoch=True
        #))
    extensions.append(Timing())
    extensions.append(Printing())

    #extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    #extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))
    extensions.append(PlotHistogram(
        channels=['train_state_to_state'],
        bins=50,
        every_n_batches=30))

    extensions.append(Plot(
        theano.config.device+"_result",
        channels=[['train_cost']],
        every_n_batches=30))


    main_loop = MainLoop(
            model=model,
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()


if __name__ == "__main__":
    main()
Example 12
    extensions=[
        Timing(),
        TrainingDataMonitoring([m.variational_cost],
                               prefix="train",
                               after_epoch=True),
        DataStreamMonitoring([m.variational_cost], test_stream, prefix="test"),
        DataStreamMonitoring([m.cost], test_stream, prefix="test"),
        # FinishAfter(after_n_epochs=10),
        Printing(),
        # Dump("out.pkl"),
        ExperimentSaver("../VAOut", "."),
        EpochProgress(dataset.num_examples // 128),
        Plot('mnist2',
             channels=[[
                 'train_variational_cost', 'test_variational_cost', 'test_cost'
             ]])
    ])
main_loop.run()

_func_noisy = theano.function([m.X], m.noisy)
_func_produced = theano.function([m.X], m.produced)

batch = test_stream.get_epoch_iterator().next()[0]
out_noise = _func_noisy(batch)
out_produced = _func_produced(batch)
import cv2
for k in range(10):
    print out_noise.shape
    img = np.reshape(out_noise[k, :], (28, 28))
    img = cv2.resize(img, (500, 500), interpolation=cv2.INTER_NEAREST)
Example 13
def main_run(_config, _log):
    from collections import namedtuple
    c = namedtuple("Config", _config.keys())(*_config.values())

    _log.info("Running with" + str(_config))

    import theano
    from theano import tensor as T
    import numpy as np

    from dataset import IMDBText, GloveTransformer

    from blocks.initialization import Uniform, Constant, IsotropicGaussian, NdarrayInitialization, Identity, Orthogonal
    from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent
    from blocks.bricks.parallel import Fork

    from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier
    from blocks import bricks

    from blocks.extensions import Printing, Timing
    from blocks.extensions.monitoring import (DataStreamMonitoring,
                                              TrainingDataMonitoring)

    from blocks.extensions.plot import Plot
    from plot import PlotHistogram

    from blocks.algorithms import GradientDescent, Adam, Scale, StepClipping, CompositeRule, AdaDelta
    from blocks.graph import ComputationGraph, apply_dropout
    from blocks.main_loop import MainLoop
    from blocks.model import Model

    from cuboid.algorithms import AdaM, NAG
    from cuboid.extensions import EpochProgress

    from fuel.streams import DataStream, ServerDataStream
    from fuel.transformers import Padding

    from fuel.schemes import ShuffledScheme
    from Conv1D import Conv1D, MaxPooling1D
    from schemes import BatchwiseShuffledScheme
    from bricks import WeightedSigmoid, GatedRecurrentFull

    from multiprocessing import Process
    import fuel
    import logging
    from initialization import SumInitialization

    from transformers import DropSources
    global train_p
    global test_p

    x = T.tensor3('features')
    #m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #x = x+m.mean()*0

    dropout_variables = []
    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    #embedding_size = 50
    #glove_version = "vectors.6B.50d.txt"

    gloveMapping = Linear(
        input_dim=embedding_size,
        output_dim=c.rnn_input_dim,
        weights_init=Orthogonal(),
        #weights_init = IsotropicGaussian(c.wstd),
        biases_init=Constant(0.0),
        name="gloveMapping")
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="gloveRec").apply(o)
    dropout_variables.append(o)

    summed_mapped_glove = o.sum(axis=1)  # take out the sequence
    glove_out = Linear(input_dim=c.rnn_input_dim,
                       output_dim=1,
                       weights_init=IsotropicGaussian(c.wstd),
                       biases_init=Constant(0.0),
                       name="mapping_to_output")
    glove_out.initialize()
    deeply_sup_0 = glove_out.apply(summed_mapped_glove)
    deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0)

    input_dim = c.rnn_input_dim
    hidden_dim = c.rnn_dim

    gru = GatedRecurrentFull(
        hidden_dim=hidden_dim,
        activation=Tanh(),
        #activation=bricks.Identity(),
        gate_activation=Sigmoid(),
        state_to_state_init=SumInitialization(
            [Identity(1.0), IsotropicGaussian(c.wstd)]),
        state_to_reset_init=IsotropicGaussian(c.wstd),
        state_to_update_init=IsotropicGaussian(c.wstd),
        input_to_state_transform=Linear(input_dim=input_dim,
                                        output_dim=hidden_dim,
                                        weights_init=IsotropicGaussian(c.wstd),
                                        biases_init=Constant(0.0)),
        input_to_update_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-2.0)),
            biases_init=Constant(-1.0)),
        input_to_reset_transform=Linear(
            input_dim=input_dim,
            output_dim=hidden_dim,
            weights_init=IsotropicGaussian(c.wstd),
            #biases_init=Constant(-3.0))
            biases_init=Constant(-2.0)))
    gru.initialize()
    rnn_in = o.dimshuffle(1, 0, 2)
    #rnn_in = o
    #rnn_out = gru.apply(rnn_in, mask=m.T)
    rnn_out = gru.apply(rnn_in)
    state_to_state = gru.rnn.state_to_state
    state_to_state.name = "state_to_state"
    #o = rnn_out[-1, :, :]
    o = rnn_out[-1]

    #o = rnn_out[:, -1, :]
    #o = rnn_out.mean(axis=1)

    #print rnn_last_out.eval({
    #x: np.ones((3, 101, 300), dtype=theano.config.floatX),
    #m: np.ones((3, 101), dtype=theano.config.floatX)})
    #raw_input()
    #o = rnn_out.mean(axis=1)
    dropout_variables.append(o)

    score_layer = Linear(input_dim=hidden_dim,
                         output_dim=1,
                         weights_init=IsotropicGaussian(std=c.wstd),
                         biases_init=Constant(0.),
                         name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    #probs = deeply_sup_probs
    cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    #cost_deeply_sup0 = - (y * T.log(deeply_sup_probs) + (1-y) * T.log(1 - deeply_sup_probs)).mean()
    # cost += cost_deeply_sup0 * c.deeply_factor

    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print rnn_in.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #})
    #print rnn_out.shape.eval(
    #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
    #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()

    # =================

    cg = ComputationGraph([cost])
    cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5)
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            Adam(learning_rate=0.002, beta1=0.1, beta2=0.001),
            #NAG(lr=0.1, momentum=0.9),
            #AdaDelta(),
        ]))

    # ========
    print "setting up data"
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'cuda0_train': 5557,
        'cuda0_test': 5558,
        'opencl0:0_train': 5557,
        'opencl0:0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    #batch_size = 16
    #batch_size = 32
    batch_size = 40

    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train,
                                         batch_size=batch_size)

        stream = DataStream(dataset=dataset, iteration_scheme=scheme)
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            #mask_sources=('features',)
            mask_sources=('features', ))

        padded = DropSources(padded, ['features_mask'])

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    #train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    #test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    train_stream = ServerDataStream(('features', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'targets'), port=test_port)

    print "setting up model"
    #ipdb.set_trace()

    n_examples = 25000
    print "Batches per epoch", n_examples // (batch_size + 1)
    batches_extensions = 100
    monitor_rate = 50
    #======
    model = Model(cg.outputs[0])
    extensions = []
    extensions.append(
        EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(
        TrainingDataMonitoring(
            [cost, misclassification],
            prefix='train',
            every_n_batches=monitor_rate,
        ))

    extensions.append(
        DataStreamMonitoring([cost, misclassification],
                             data_stream=test_stream,
                             prefix='test',
                             after_epoch=True,
                             before_first_epoch=False))

    extensions.append(Timing())
    extensions.append(Printing())

    #extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    #extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))

    #extensions.append(PlotHistogram(
    #channels=['train_state_to_state'],
    #bins=50,
    #every_n_batches=30))

    extensions.append(
        Plot(theano.config.device + "_result",
             channels=[['train_cost'], ['train_misclassification']],
             every_n_batches=monitor_rate))

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
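
EpochProgress used above is not one of the standard Blocks extensions, so its implementation is not part of this snippet. A minimal sketch of such an extension, assuming it only reports how far training has advanced through the current epoch, could look like this (names and behaviour are an assumption, not the original class):

from blocks.extensions import SimpleExtension

class EpochProgress(SimpleExtension):
    """Hypothetical progress reporter; the real class is not shown."""

    def __init__(self, batch_per_epoch, **kwargs):
        kwargs.setdefault('after_batch', True)
        super(EpochProgress, self).__init__(**kwargs)
        self.batch_per_epoch = batch_per_epoch
        self.batch = 0

    def do(self, which_callback, *args):
        # Count processed batches and print a rough position in the epoch.
        self.batch += 1
        if self.batch % 100 == 0:
            print("batch %d / %d" % (self.batch, self.batch_per_epoch))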
Ejemplo n.º 14
0
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    x = m.mean() + x  # keep the (sometimes unused) mask in the computation graph

    #embedding_size = 300
    #glove_version = "glove.6B.300d.txt"

    embedding_size = 50
    glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    conv1 = Conv1D(filter_length=5, num_filters=128, input_dim=embedding_size,
            weights_init=IsotropicGaussian(std=wstd),
            biases_init=Constant(0.0))
    conv1.initialize()
    o = conv1.apply(x)
    o = Rectifier(name="conv1red").apply(o)
    o = MaxPooling1D(pooling_length=5
            #, step=2
            ).apply(o)

    conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
            weights_init=IsotropicGaussian(std=wstd),
            biases_init=Constant(0.0),
            step=3,
            name="conv2")
    conv2.initialize()
    o = conv2.apply(o)

    o = Rectifier(name="conv2rec").apply(o)
    conv3 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
            weights_init=IsotropicGaussian(std=wstd),
            biases_init=Constant(0.0),
            step=3,
            name="conv3")
    conv3.initialize()
    o = conv3.apply(o)
    o = Rectifier(name="conv3rec").apply(o)

    fork = Fork(weights_init=IsotropicGaussian(0.02),
            biases_init=Constant(0.),
            input_dim=128,
            output_dims=[128]*3,
            output_names=['inputs', 'reset_inputs', 'update_inputs']
            )
    fork.initialize()

    inputs, reset_inputs, update_inputs = fork.apply(o)

    out = o.mean(axis=1)

    #gru = GatedRecurrent(dim=128,
            #weights_init=IsotropicGaussian(0.02),
            #biases_init=IsotropicGaussian(0.0))

    #gru.initialize()
    #states = gru.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs)

    #out = states[:, -1, :]

    hidden = Linear(
        input_dim = 128,
        output_dim = 128,
        weights_init = Uniform(std=0.01),
        biases_init = Constant(0.))
    hidden.initialize()

    o = hidden.apply(out)
    o = Rectifier().apply(o)
    #hidden = Linear(
        #input_dim = 128,
        #output_dim = 128,
        #weights_init = IsotropicGaussian(std=0.02),
        #biases_init = Constant(0.),
        #name="hiddenmap2")
    #hidden.initialize()

    #o = hidden.apply(o)
    #o = Rectifier(name="rec2").apply(o)


    score_layer = Linear(
            input_dim = 128,
            output_dim = 1,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.),
            name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
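    # Fraction of predictions that fall on the wrong side of the 0.5
    # threshold, i.e. the binary misclassification rate.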
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval(
            #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()


    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
            cost=cost,
            params=params,
            step_rule=CompositeRule([
                StepClipping(threshold=10),
                AdaM(),
                #AdaDelta(),
                ]))


    # ========
    print "setting up data"
    ports = {
            'gpu0_train' : 5557,
            'gpu0_test' : 5558,
            'gpu1_train' : 5559,
            'gpu1_test' : 5560,
            }

    batch_size = 16
    def start_server(port, which_set):
        fuel.server.logger.setLevel('WARN')

        dataset = IMDBText(which_set)
        n_train = dataset.num_examples
        stream = DataStream(
                dataset=dataset,
                iteration_scheme=ShuffledScheme(
                    examples=n_train,
                    batch_size=batch_size)
                )
        print "loading glove"
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
                data_stream=glove,
                mask_sources=('features',)
                )

        fuel.server.start_server(padded, port=port, hwm=20)

    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    print "setting up model"
    #import ipdb
    #ipdb.set_trace()

    n_examples = 25000
    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True
        ))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_stream,
        prefix='test',
        after_epoch=True
        ))
    extensions.append(Timing())
    extensions.append(Printing())

    #extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))

    main_loop = MainLoop(
            model=model,
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
Ejemplo n.º 15
0
data_stream = Flatten(
    DataStream.default_stream(dataset=train,
                              iteration_scheme=SequentialScheme(
                                  train.num_examples, batch_size=128)))

training_monitor = TrainingDataMonitoring([cost, error], after_batch=True)

# Use the 'valid' set for validation during training:
validation_stream = Flatten(
    DataStream.default_stream(dataset=valid,
                              iteration_scheme=SequentialScheme(
                                  valid.num_examples, batch_size=256)))

validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                          data_stream=validation_stream,
                                          prefix='validation',
                                          after_batch=False)

# The main loop will train the network and output reports, etc
main = MainLoop(model=model,
                data_stream=data_stream,
                algorithm=algo,
                extensions=[
                    FinishAfter(after_n_epochs=10),
                    Printing(), validation_monitor, training_monitor,
                    Plot('LR_AdniNet',
                         channels=[['entropy', 'validation_entropy'],
                                   ['error', 'validation_error']],
                         after_batch=True)
                ])
print "Model Initialized, Data Loaded: Start training with main.run()"
Ejemplo n.º 16
0
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #rnn = SimpleRecurrent(
            #dim = 50,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)

    #rnn = GatedRecurrent(
            #dim = 50,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)

    embedding_size = 300
    #glove_version = "vectors.6B.100d.txt"
    glove_version = "glove.6B.300d.txt"
    #fork = Fork(weights_init=IsotropicGaussian(0.02),
            #biases_init=Constant(0.),
            #input_dim=embedding_size,
            #output_dims=[embedding_size]*3,
            #output_names=['inputs', 'reset_inputs', 'update_inputs']
            #)

    rnn = LSTM(
            dim = embedding_size,
            activation=Tanh(),
            weights_init = IsotropicGaussian(std=0.02),
        )
    rnn.initialize()

    #fork.initialize()
    wstd = 0.02

    score_layer = Linear(
            input_dim = 128,
            output_dim = 1,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.),
            name="linear2")
    score_layer.initialize()

    gloveMapping = Linear(
            input_dim = embedding_size,
            output_dim = embedding_size,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.0),
            name="gloveMapping"
            )
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="rectivfyglove").apply(o)

    forget_bias = np.zeros((embedding_size*4), dtype=theano.config.floatX)
    forget_bias[embedding_size:embedding_size*2] = 4.0
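    # The slice filled with 4.0 above is the forget-gate block of the LSTM
    # input bias (the second of the four gate blocks in Blocks' LSTM
    # layout), so the forget gates start mostly open.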
    toLSTM = Linear(
            input_dim = embedding_size,
            output_dim = embedding_size*4,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(forget_bias),
            #biases_init = Constant(0.0),
            name="ToLSTM"
            )
    toLSTM.initialize()


    rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m), mask=m)
    #inputs, reset_inputs, update_inputs = fork.apply(x)
    #rnn_states = rnn.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs, mask=m)

    #rnn_out = rnn_states[:, -1, :]
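    # Mean-pool the LSTM states over time, using the mask so that padded
    # positions do not contribute to the average.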
    rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) / m.sum(axis=1).dimshuffle(0, 'x')
    #rnn_out = (rnn_states).mean(axis=1)# / m.sum(axis=1)

    hidden = Linear(
        input_dim = embedding_size,
        output_dim = 128,
        weights_init = Uniform(std=0.01),
        biases_init = Constant(0.))
    hidden.initialize()

    o = hidden.apply(rnn_out)
    o = Rectifier().apply(o)
    hidden = Linear(
        input_dim = 128,
        output_dim = 128,
        weights_init = IsotropicGaussian(std=0.02),
        biases_init = Constant(0.),
        name="hiddenmap2")
    hidden.initialize()

    o = hidden.apply(o)
    o = Rectifier(name="rec2").apply(o)

    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval(
            #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()


    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
            cost=cost,
            params=params,
            step_rule=CompositeRule([
                StepClipping(threshold=10),
                AdaM(),
                #AdaDelta(),
                ]))


    # ========
    print "setting up data"

    train_dataset = IMDBText('train')
    test_dataset = IMDBText('test')
    batch_size = 16
    n_train = train_dataset.num_examples
    train_stream = DataStream(
            dataset=train_dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size)
            )
    glove = GloveTransformer(glove_version, data_stream=train_stream)

    train_padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
            #mask_sources=[]
            )


    test_stream = DataStream(
            dataset=test_dataset,
            iteration_scheme=ShuffledScheme(
                examples=test_dataset.num_examples,
                batch_size=batch_size)
            )
    glove = GloveTransformer(glove_version, data_stream=test_stream)
    test_padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
            #mask_sources=[]
            )
    print "setting up model"
    #import ipdb
    #ipdb.set_trace()

    lstm_norm = rnn.W_state.norm(2)
    lstm_norm.name = "lstm_norm"

    pre_norm = gloveMapping.W.norm(2)
    pre_norm.name = "pre_norm"

    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification, lstm_norm, pre_norm],
        prefix='train',
        after_epoch=True
        ))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded,
        prefix='test',
        after_epoch=True
        ))
    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    extensions.append(Plot("result", channels=[['train_cost', 'train_misclassification']], after_epoch=True))

    main_loop = MainLoop(
            model=model,
            data_stream=train_padded,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
Ejemplo n.º 17
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
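        # push_initialization_config propagates the top-level schemes to the
        # child bricks; the recurrent weights are then overridden with
        # orthogonal initialization below.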
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile the beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
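
The _is_nan predicate attached to the FinishAfter condition above is not shown in this snippet. A minimal sketch, assuming it simply checks the per-batch 'total_gradient_norm' monitoring channel for NaN, might be:

import math

def _is_nan(log):
    # Hypothetical helper: stop training once the gradient norm recorded
    # after every batch above has become NaN.
    return math.isnan(log.current_row['total_gradient_norm'])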
Ejemplo n.º 18
0
        ],  #+ params,
        prefix="train",
        after_epoch=True),
    Timing(),
    Printing(),
    ProgressBar()
]
bokeh = True
if bokeh:
    extensions.append(
        Plot(
            'Testing blocks',
            channels=[
                #['train_hidden_states_NANS', 'train_params_nans'],
                [
                    'test_error on note as a whole',
                    'test_single error within note'
                ],
                ['test_final_cost', 'train_final_cost'],
                ['train_total_gradient_norm'],
                #['train_' + param.name for param in params],
            ]))

main_loop = MainLoop(algorithm=algorithm,
                     data_stream=train,
                     model=model,
                     extensions=extensions)
main_loop.run()

newdir = str(datetime.now())
os.mkdir(newdir)
for i, param in enumerate(main_loop.model.parameters):
Ejemplo n.º 19
0
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim,
         dec_dim, z_dim):

    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim,
                                           z_dim)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [
            reader, writer, encoder, decoder, encoder_mlp, decoder_mlp,
            q_sampler
    ]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # This is one iteration
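    # Each step: compute the error image x_hat, read from it (optionally
    # with attention), update the encoder LSTM, sample z from the
    # approximate posterior, update the decoder LSTM, and add the writer's
    # output to the canvas c.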
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec,
                      x):
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc,
                                     cells=c_enc,
                                     inputs=i_enc,
                                     iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec,
                                     cells=c_dec,
                                     inputs=i_dec,
                                     iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    outputs_info = [
        T.zeros([batch_size, x_dim]),  # c
        T.zeros([batch_size, enc_dim]),  # h_enc
        T.zeros([batch_size, enc_dim]),  # c_enc
        T.zeros([batch_size, z_dim]),  # z_mean
        T.zeros([batch_size, z_dim]),  # z_log_sigma
        T.zeros([batch_size, z_dim]),  # z
        T.zeros([batch_size, dec_dim]),  # h_dec
        T.zeros([batch_size, dec_dim]),  # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)

    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    kl_terms = (prior_log_sigma - z_log_sigma + 0.5 *
                (tensor.exp(2 * z_log_sigma) +
                 (z_mean - prior_mu)**2) / tensor.exp(2 * prior_log_sigma) -
                0.5).sum(axis=-1)
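    # The expression above is the closed-form KL divergence between the
    # diagonal Gaussian posterior N(z_mean, exp(z_log_sigma)^2) and the
    # prior N(prior_mu, exp(prior_log_sigma)^2), summed over the latent
    # dimensions.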

    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [["train_nll_bound", "test_nll_bound"],
                     ["train_kl_term_%d" % t for t in range(n_iter)],
                     ["train_recons_term_%d" % t for t in range(n_iter)],
                     ["train_total_gradient_norm", "train_total_step_norm"]]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
Ejemplo n.º 20
0
def main(name, epochs, batch_size, learning_rate, 
         attention, n_iter, enc_dim, dec_dim, z_dim):

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e"%value)[0]
        return "%s%d" % (leading, -exp)

    if name is None:
        tag = "watt" if attention else "woatt"
        lr_str = lr_tag(learning_rate)
        name = "%s-t%d-enc%d-dec%d-z%d-lr%s" % (tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate) 
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()


    #------------------------------------------------------------------------

    x_dim = 28*28
    img_height, img_width = (28, 28)
    
    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    
    if attention:
        read_N = 4
        write_N = 7
        read_dim = 2*read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
    else:
        read_dim = 2*x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Tanh()], [(read_dim+dec_dim), 4*enc_dim], name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [             z_dim, 4*dec_dim], name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
                n_iter, 
                reader=reader,
                encoder_mlp=encoder_mlp,
                encoder_rnn=encoder_rnn,
                sampler=q_sampler,
                decoder_mlp=decoder_mlp,
                decoder_rnn=decoder_rnn,
                writer=writer)
    draw.initialize()


    #------------------------------------------------------------------------
    x = tensor.matrix('features')
    
    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100) 
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost, 
        params=params,
        step_rule=CompositeRule([
            StepClipping(3.), 
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)


    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    """
    for t in range(n_iter):
        kl_term_t = kl_terms[t,:].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t,:,:])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors +=[kl_term_t, recons_term_t]
    """
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=ForceFloatX(DataStream(mnist_train,
                        iteration_scheme=SequentialScheme(
                        mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(DataStream(mnist_test,
                    iteration_scheme=SequentialScheme(
                    mnist_test.num_examples, batch_size))),
##                updates=scan_updates, 
                prefix="test"),
            TrainingDataMonitoring(
                train_monitors, 
                prefix="train",
                after_every_epoch=True),
            SerializeMainLoop(name+".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
Ejemplo n.º 21
0
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):

    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    train_stream = Flatten(
        DataStream(data_train,
                   iteration_scheme=SequentialScheme(data_train.num_examples,
                                                     batch_size)))
    valid_stream = Flatten(
        DataStream(data_valid,
                   iteration_scheme=SequentialScheme(data_valid.num_examples,
                                                     batch_size)))
    test_stream = Flatten(
        DataStream(data_test,
                   iteration_scheme=SequentialScheme(data_test.num_examples,
                                                     batch_size)))

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    rnninits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        #'weights_init': Orthogonal(),
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention != "":
        read_N, write_N = attention.split(',')

        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim,
                                 dec_dim=dec_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=read_N,
                                 **inits)
        writer = AttentionWriter(input_dim=dec_dim,
                                 output_dim=x_dim,
                                 width=img_width,
                                 height=img_height,
                                 N=write_N,
                                 **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

        attention_tag = "full"

    #----------------------------------------------------------------------

    # Learning rate
    def lr_tag(value):
        """ Convert a float into a short tag-usable string representation. E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print("               dataset: %s" % dataset)
    print("          subdirectory: %s" % subdir)
    print("         learning rate: %g" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print("            batch size: %d" % batch_size)
    print("                epochs: %d" % epochs)
    print()

    #----------------------------------------------------------------------

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc",
                      **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec",
                      **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    #x_recons = 1. + x
    x_recons, kl_terms = draw.reconstruct(x)
    #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100)
    #x_recons = x_recons[-1,:,:]

    #samples = draw.sample(100)
    #x_recons = samples[-1, :, :]
    #x_recons = samples[-1, :, :]

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    #algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        #x_recons_t = T.nnet.sigmoid(c[t,:,:])
        #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        #recons_term_t = recons_term_t.mean()
        #recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]
    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        #["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    #------------------------------------------------------------

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_epoch=True),
            #            DataStreamMonitoring(
            #                monitors,
            #                valid_stream,
            ##                updates=scan_updates,
            #                prefix="valid"),
            DataStreamMonitoring(
                monitors,
                test_stream,
                #                updates=scan_updates,
                prefix="test"),
            Checkpoint(name,
                       before_training=False,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            #Checkpoint(image_size=image_size, save_subdir=subdir, path=pickle_file, before_training=False, after_epoch=True, save_separately=['log', 'model']),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_param_values(oldmodel.get_param_values())
        del oldmodel

    main_loop.run()
Ejemplo n.º 22
0
def main():
    print("Build the network")
    input_of_image = tensor.matrix('features')
    input_to_hidden = Linear(name='input_to_hidden',
                             input_dim=784,
                             output_dim=100)
    h = Tanh().apply(input_to_hidden.apply(input_of_image))
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=100,
                              output_dim=10)
    output_hat = Softmax().apply(hidden_to_output.apply(h))

    output = tensor.lmatrix('targets')
    cost = CategoricalCrossEntropy().apply(output.flatten(), output_hat)
    correct_rate = 1 - MisclassificationRate().apply(output.flatten(),
                                                     output_hat)
    correct_rate.name = 'correct_rate'
    print(type(correct_rate))
    cost.name = 'cost'

    cg = ComputationGraph(cost)

    # Initialize the parameters
    input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(
        0.01)
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # Train
    print("Prepare the data.")
    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    ## Carve the data into lots of batches.
    data_stream_train = DataStream(mnist_train,
                                   iteration_scheme=SequentialScheme(
                                       mnist_train.num_examples,
                                       batch_size=256))

    ## Set the algorithm for the training.
    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=CompositeRule(
                                    [Scale(0.9),
                                     StepSwitcher(0.05, 0.1)]))

    ## Add a monitor extension for the training.
    data_stream_test = DataStream(
        mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples,
                                          batch_size=1024))

    test_monitor = DataStreamMonitoring(variables=[cost, correct_rate],
                                        data_stream=data_stream_test,
                                        prefix="test",
                                        after_every_epoch=True)

    train_monitor = TrainingDataMonitoring(
        variables=[cost, correct_rate, algorithm.total_step_norm],
        prefix='train',
        after_every_batch=True)

    ## Add a plot monitor.
    plot = Plot(document='new',
                channels=[['train_correct_rate', 'test_correct_rate']],
                start_server=True,
                after_every_batch=True)

    print("Start training")
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=data_stream_train,
                         extensions=[
                             plot, test_monitor, train_monitor,
                             FinishAfter(after_n_epochs=20),
                             Printing()
                         ])
    main_loop.run()
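
StepSwitcher is not a standard Blocks step rule and its definition is not part of this snippet. A minimal sketch of a rule with two scaling factors, assuming its two arguments (0.05, 0.1) are an early and a late learning-rate multiplier, could look like this (the switch point is an invented parameter):

from collections import OrderedDict

import theano
from theano import tensor
from blocks.algorithms import StepRule

class StepSwitcher(StepRule):
    """Hypothetical step rule: scale steps by `early_rate` for the first
    `switch_after` updates and by `late_rate` afterwards."""

    def __init__(self, early_rate, late_rate, switch_after=1000):
        self.early_rate = early_rate
        self.late_rate = late_rate
        self.switch_after = switch_after
        self.iterations = theano.shared(0)

    def compute_steps(self, previous_steps):
        # Pick the scaling factor based on how many updates have been done.
        rate = tensor.cast(
            tensor.switch(self.iterations < self.switch_after,
                          self.early_rate, self.late_rate),
            theano.config.floatX)
        steps = OrderedDict((param, rate * step)
                            for param, step in previous_steps.items())
        return steps, [(self.iterations, self.iterations + 1)]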