Example #1
0
    def __init__(self,
                 cost,
                 params,
                 subtensor_params={},
                 step_rule=None,
                 *args,
                 **kwargs):
        full_params = params
        self.subtensor_params = subtensor_params

        # For each LookupTable, we replace it by its subtensors appearing in the graph
        params = [
            param for param in full_params if param not in subtensor_params
        ]
        for _, (_, _, outputs, _) in subtensor_params.iteritems():
            params.extend(outputs)

        super(GradientDescent, self).__init__(cost=cost,
                                              params=params,
                                              **kwargs)
        # self.params contains the list of outputs of the lookup tables

        logger.info("Taking the cost gradient")
        self.gradients = dict(
            equizip(self.params, tensor.grad(self.cost, self.params)))

        # We combine the gradients extracted from the same parameter
        for param, (subparam, canonized_indices, outputs,
                    indices) in subtensor_params.iteritems():
            # This is necessary if we want to compute the l2 norm correctly (e.g. for StepClipping)
            tmp = shared_floatx(param.get_value() * 0.)
            for (output, indice) in zip(outputs, indices):
                tmp = tensor.inc_subtensor(tmp[indice], self.gradients[output])
                del self.gradients[output]
            self.gradients[subparam] = tmp[canonized_indices]

        # We remove the subtensors from the list of parameters
        self.params = full_params

        logger.info("The cost gradient computation graph is built")

        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                              "total_gradient_norm")
        self.steps, self.step_rule_updates = (self.step_rule.compute_steps(
            self.gradients))
        self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                          "total_step_norm")
Example #2
0
def train_model(cost,
                train_stream,
                valid_stream,
                valid_freq,
                valid_rare,
                load_location=None,
                save_location=None):
    cost.name = 'nll'
    perplexity = 2**(cost / tensor.log(2))
    perplexity.name = 'ppl'

    # Define the model
    model = Model(cost)

    # Load the parameters from a dumped model
    if load_location is not None:
        logger.info('Loading parameters...')
        model.set_param_values(load_parameter_values(load_location))

    cg = ComputationGraph(cost)
    algorithm = GradientDescent(cost=cost,
                                step_rule=Scale(learning_rate=0.01),
                                params=cg.parameters)
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=[
                             DataStreamMonitoring([cost, perplexity],
                                                  valid_stream,
                                                  prefix='valid',
                                                  every_n_batches=500),
                             DataStreamMonitoring([cost, perplexity],
                                                  valid_rare,
                                                  prefix='valid_rare',
                                                  every_n_batches=500),
                             DataStreamMonitoring([cost, perplexity],
                                                  valid_freq,
                                                  prefix='valid_frequent',
                                                  every_n_batches=500),
                             Printing(every_n_batches=500)
                         ])
    main_loop.run()

    # Save the main loop
    if save_location is not None:
        logger.info('Saving the main loop...')
        dump_manager = MainLoopDumpManager(save_location)
        dump_manager.dump(main_loop)
        logger.info('Saved')
def test_composite_rule():
    rule = CompositeRule([StepClipping(4), Scale(0.1)])
    gradients = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    result, _ = rule.compute_steps(gradients)
    assert_allclose(result[0].eval(), 12 / 50.0)
    assert_allclose(result[1].eval(), 16 / 50.0)

    class RuleWithUpdates(StepRule):
        def __init__(self, updates):
            self.updates = updates

        def compute_steps(self, previous_steps):
            return previous_steps, self.updates

    rule = CompositeRule([RuleWithUpdates([(1, 2)]),
                          RuleWithUpdates([(3, 4)])])
    assert rule.compute_steps(None)[1] == [(1, 2), (3, 4)]
Example #4
0
def setup_mainloop(extension):
    """Set up a simple main loop for progress bar tests.

    Create a MainLoop, register the given extension, supply it with a
    DataStream and a minimal model/cost to optimize.

    """
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, params=[W], step_rule=Scale(1e-3))

    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_example_stream(),
                         algorithm=algorithm,
                         extensions=[FinishAfter(after_n_epochs=1), extension])

    return main_loop
def test_restrict():
    rule1 = Scale(0.1)
    rule2 = Restrict(rule1, (1, 4))
    rval, _ = rule2.compute_steps(
        OrderedDict((i, shared_floatx(i * i)) for i in range(6)))
    assert_allclose(rval[0].eval(), 0.0)
    assert_allclose(rval[1].eval(), 0.1)
    assert_allclose(rval[2].eval(), 4.0)
    assert_allclose(rval[3].eval(), 9.0)
    assert_allclose(rval[4].eval(), 1.6)
    assert_allclose(rval[5].eval(), 25.0)

    steps, updates = Restrict(DummyUpdatesStepRule(), (1, 4)).compute_steps(
        OrderedDict((i, shared_floatx(i * i)) for i in range(6)))

    assert_allclose(steps[0].eval(), 0.0)
    assert_allclose(steps[1].eval(), 3.0)
    assert_allclose(steps[2].eval(), 4.0)
    assert_allclose(steps[3].eval(), 9.0)
    assert_allclose(steps[4].eval(), 18.0)
    assert_allclose(steps[5].eval(), 25.0)

    assert updates == [(10, 100), (40, 400)]
Example #6
0
def setup_algorithms(cost, cg, method, type="ff"):
    """Setup training algorithm.

    Parameters
    ----------
    cost : expression
        cost expression
    cg : ComputationGraph
        Computation graph
    method : string
        training method: SGD, momentum SGD, AdaGrad, RMSprop
    learning_rate : float
        learning rate for learning method

    Returns
    -------
    algorithm : GradientDescent
        Gradient Descent algorithm based on different optimization method
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()

    if type == "RNN":
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    return algorithm
Example #7
0
def build_and_run(save_to,modelconfig,experimentconfig):
    """ part of this is adapted from lasagne tutorial""" 

    n, num_filters, image_size, num_blockstack = modelconfig['depth'], modelconfig['num_filters'], modelconfig['image_size'], modelconfig['num_blockstack']
    
    print("Amount of bottlenecks: %d" % n)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('image_features')
    #target_value = T.ivector('targets')
    target_var = T.lmatrix('targets')
    target_vec = T.extra_ops.to_one_hot(target_var[:,0],2)
    #target_var = T.matrix('targets')
    # Create residual net model
    print("Building model...")
    network = build_cnn(input_var, image_size, n, num_blockstack, num_filters)
    get_info(network)
    prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network))
    test_prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network,deterministic=True))

    # Loss function -> The objective to minimize 
    print("Instanciation of loss function...")
 
    #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction)
    #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(), test_prediction)
 #   loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()).mean()
  #  test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var.flatten()).mean()
    loss = lasagne.objectives.squared_error(prediction,target_vec).mean()
    test_loss = lasagne.objectives.squared_error(test_prediction,target_vec).mean()
  #  loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean()
  #  test_loss = tensor.nnet.binary_crossentropy(test_prediction, target_var).mean()
    test_loss.name = "loss"

#    loss.name = 'x-ent_error'
#    loss.name = 'sqr_error'
    layers = lasagne.layers.get_all_layers(network)

    #l1 and l2 regularization
    #pondlayers = {x:0.000025 for i,x in enumerate(layers)}
    #l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2)
    #l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 25e-6
    #reg_penalty = l1_penality + l2_penality
    #reg_penalty.name = 'reg_penalty'
    #loss = loss + reg_penalty
    loss.name = 'reg_loss'
    error_rate = MisclassificationRate().apply(target_var.flatten(), test_prediction).copy(
            name='error_rate')

    
    # Load the dataset
    print("Loading data...")
    istest = 'test' in experimentconfig.keys()
    if istest:
        print("Using test stream")
    train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=istest)

    # Defining step rule and algorithm
    if 'step_rule' in experimentconfig.keys() and not experimentconfig['step_rule'] is None :
        step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate'])
    else :
        step_rule=Scale(learning_rate=experimentconfig['learning_rate'])

    params = map(lasagne.utils.as_theano_expression,lasagne.layers.get_all_params(network, trainable=True))
    print("Initializing algorithm")
    algorithm = GradientDescent(
                cost=loss, gradients={var:T.grad(loss,var) for var in params},#parameters=cg.parameters, #params
                step_rule=step_rule)

    #algorithm.add_updates(extra_updates)


    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = "grad_norm"

    print("Initializing extensions...")
    plot = Plot(save_to, channels=[['train_loss','valid_loss'], 
['train_grad_norm'],
#['train_grad_norm','train_reg_penalty'],
['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042')    

    checkpoint = Checkpoint('models/best_'+save_to+'.tar')
  #  checkpoint.add_condition(['after_n_batches=25'],

    checkpoint.add_condition(['after_epoch'],
                         predicate=OnLogRecord('valid_error_rate_best_so_far'))

    #Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=experimentconfig['num_epochs'],
                              after_n_batches=experimentconfig['num_batches']),
                  TrainingDataMonitoring([test_loss, error_rate, grad_norm], # reg_penalty],
                  prefix="train", after_epoch=True), #after_n_epochs=1
                  DataStreamMonitoring([test_loss, error_rate],valid_stream,prefix="valid", after_epoch=True), #after_n_epochs=1
                  plot,
                  #Checkpoint(save_to,after_n_epochs=5),
                  #ProgressBar(),
             #     Plot(save_to, channels=[['train_loss','valid_loss'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042'), #'grad_norm'
                  #       after_batch=True),
                  Printing(after_epoch=True),
                  TrackTheBest('valid_error_rate',min), #Keep best
                  checkpoint,  #Save best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)] # Early-stopping

 #   model = Model(loss)
 #   print("Model",model)


    main_loop = MainLoop(
        algorithm,
        train_stream,
       # model=model,
        extensions=extensions)
    print("Starting main loop...")

    main_loop.run()
Example #8
0
def main():
    feature_maps = [20, 50]
    mlp_hiddens = [50]
    conv_sizes = [5, 5]
    pool_sizes = [3, 3]
    save_to = "DvC.pkl"
    batch_size = 500
    image_size = (32, 32)
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    host_plot = 'http://*****:*****@ %s' %
             ('CNN ', datetime.datetime.now(), socket.gethostname()),
             channels=[['valid_cost', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True,
             server_url=host_plot))

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
def main():
    mlp_hiddens = [1000]
    filter_sizes = [(9, 9), (5, 5), (5, 5)]
    feature_maps = [80, 50, 20]
    pooling_sizes = [(3, 3), (2, 2), (2, 2)]
    save_to = "DvC.pkl"
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.1
    num_epochs = 300
    num_batches = None
    if socket.gethostname() == 'tim-X550JX':
        host_plot = 'http://*****:*****@ %s' %
             ('CNN ', datetime.datetime.now(), socket.gethostname()),
             channels=[['train_error_rate', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True,
             server_url=host_plot))

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Example #10
0
def main(num_epochs,
         feature_maps=None,
         mlp_hiddens=None,
         conv_sizes=None,
         pool_sizes=None,
         batch_size=500,
         num_batches=None):

    ############# Architecture #############
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (32, 32)
    batch_size = 50
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    delta = 0.01
    drop_prob = 0.5
    weight_noise = 0.75

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    3,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))

    # We push initialization config to set different initialization schemes
    # for convolutional layers.

    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    probs = (convnet.apply(x)).copy(name='probs')

    # Computational Graph just for cost for drop_out and noise application
    cg_probs = ComputationGraph([probs])
    inputs = VariableFilter(roles=[INPUT])(cg_probs.variables)
    weights = VariableFilter(roles=[FILTER, WEIGHT])(cg_probs.variables)

    ############# Regularization #############
    #regularization = 0
    logger.info('Applying regularization')
    regularization = delta * sum([(W**2).mean() for W in weights])
    probs.name = "reg_probs"

    ############# Guaussian Noise #############

    logger.info('Applying Gaussian noise')
    cg_train = apply_noise(cg_probs, weights, weight_noise)

    ############# Dropout #############

    logger.info('Applying dropout')
    cg_probs = apply_dropout(cg_probs, inputs, drop_prob)
    dropped_out = VariableFilter(roles=[DROPOUT])(cg_probs.variables)
    inputs_referenced = [var.tag.replacement_of for var in dropped_out]
    set(inputs) == set(inputs_referenced)

    ############# Batch normalization #############

    # recalculate probs after dropout and noise and regularization:
    probs = cg_probs.outputs[0] + regularization
    cost = (CategoricalCrossEntropy().apply(y.flatten(),
                                            probs).copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(),
                                                probs).copy(name='error_rate'))
    cg = ComputationGraph([probs, cost, error_rate])
    cg = apply_batch_normalization(cg)

    ########### Loading images #####################

    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream, ServerDataStream
    from fuel.schemes import ShuffledScheme
    from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation
    from fuel.transformers import Flatten, Cast, ScaleAndShift

    def create_data(data):
        stream = DataStream(data,
                            iteration_scheme=ShuffledScheme(
                                data.num_examples, batch_size))
        stream_downscale = MinimumImageDimensions(
            stream, image_size, which_sources=('image_features', ))
        stream_rotate = Random2DRotation(stream_downscale,
                                         which_sources=('image_features', ))
        stream_max = ScikitResize(stream_rotate,
                                  image_size,
                                  which_sources=('image_features', ))
        stream_scale = ScaleAndShift(stream_max,
                                     1. / 255,
                                     0,
                                     which_sources=('image_features', ))
        stream_cast = Cast(stream_scale,
                           dtype='float32',
                           which_sources=('image_features', ))
        #stream_flat = Flatten(stream_scale, which_sources=('image_features',))

        return stream_cast

    stream_data_train = create_data(
        DogsVsCats(('train', ), subset=slice(0, 20)))
    stream_data_test = create_data(
        DogsVsCats(('train', ), subset=slice(20, 30)))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=learningRate))
    #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Adam(0.001))
    #algorithm.add_updates(extra_updates)

    # `Timing` extension reports time for reading data, aggregating a batch and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = []
    extensions.append(Timing())
    extensions.append(
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches))
    extensions.append(
        DataStreamMonitoring([cost, error_rate],
                             stream_data_test,
                             prefix="valid"))
    extensions.append(
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True))
    #extensions.append(Checkpoint(save_to))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    logger.info("Building the model")
    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Example #11
0
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name,
         max_length, bokeh, GRU, dropout, depth, max_grad, step_method,
         epsilon, sample):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [
        name for name in transition.apply.sequences if 'mask' not in name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown sttep method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #12
0
                                parameters=params,
                                step_rule=AdaDelta(),
                                on_unused_sources='warn')
elif "momentum" in config:
    print "using momentum"
    mWeight = float(config["momentum"])
    algorithm = GradientDescent(cost=cost,
                                parameters=params,
                                step_rule=Momentum(learning_rate=learning_rate,
                                                   momentum=mWeight),
                                on_unused_sources='warn')
else:
    print "using traditional SGD"
    algorithm = GradientDescent(cost=cost,
                                parameters=params,
                                step_rule=Scale(learning_rate=learning_rate),
                                on_unused_sources='warn')
extensions = []
if "saveModel" in config:
    print "saving model after training"
    extensions.append(Checkpoint(path=networkfile, after_n_epochs=n_epochs))
extensions.append(FinishAfter(after_n_epochs=n_epochs))
extensions.append(
    F1Extension(layer3=layer3,
                y=y,
                model=model,
                data_stream=data_stream_test,
                num_samples=numSamplesTest,
                batch_size=1,
                every_n_epochs=1))
extensions.append(
Example #13
0
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width, weight_noise,
          z_prob, z_prob_states, z_prob_cells, drop_prob_igates,
          ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop,
          rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay, **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)'
            )
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'


#    rng = np.random.RandomState(seed)

###########################################
#
# MAKE STREAMS
#
###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize,
                   layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size,
                        alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')

    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str,
                                             p.get_value().shape)))))
        whysplode_rms.append(
            rms(p).copy('ini%d:%s_rms(%s)' %
                        (i, p.name, "x".join(map(str,
                                                 p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # noone does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
Example #14
0
print "model parameters:"
print model.get_parameter_dict()

if "adagrad" in config:
  print "using adagrad"
  thisRule=AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
  print "using adadelta"
  thisRule=AdaDelta()
elif "momentum" in config:
  print "using momentum"
  mWeight = float(config["momentum"])
  thisRule=Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
  print "using traditional SGD"
  thisRule=Scale(learning_rate=learning_rate)

if "gradientClipping" in config:
  threshold = float(config["gradientClipping"])
  print "using gradient clipping with threshold ", threshold
  thisRule=CompositeRule([StepClipping(threshold), thisRule])

#step_rule=CompositeRule([StepClipping(config['step_clipping']),
#                                 eval(config['step_rule'])()])

algorithm = GradientDescent(cost=cost,
                            parameters=params,
                            step_rule=thisRule,
                            on_unused_sources='warn')
extensions = []
if "saveModel" in config:
Example #15
0
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"):
    brown = BrownDataset(corpus)

    INPUT_DIMS = brown.get_vocabulary_size()

    OUTPUT_DIMS = brown.get_vocabulary_size()

    # These are theano variables
    x = tensor.lmatrix('context')
    y = tensor.ivector('output')

    # Construct the graph
    input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS,
                                  dim=HIDDEN_DIMS)

    # Compute the weight matrix for every word in the context and then compute
    # the average.
    h = tensor.mean(input_to_hidden.apply(x), axis=1)

    hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS,
                              output_dim=OUTPUT_DIMS)
    y_hat = Softmax().apply(hidden_to_output.apply(h))

    # And initialize with random varibales and set the bias vector to 0
    weights = IsotropicGaussian(0.01)
    input_to_hidden.weights_init = hidden_to_output.weights_init = weights
    input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
    input_to_hidden.initialize()
    hidden_to_output.initialize()

    # And now the cost function
    cost = CategoricalCrossEntropy().apply(y, y_hat)
    cg = ComputationGraph(cost)

    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum()
    cost.name = 'cost_with_regularization'

    mini_batch = SequentialScheme(brown.num_instances(), 512)
    data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch)

    # Now we tie up lose ends and construct the algorithm for the training
    # and define what happens in the main loop.
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    extensions = [
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs),
        Printing(),
        # TrainingDataMonitoring(variables=[cost]),
        SaveWeights(layers=[input_to_hidden, hidden_to_output],
                    prefixes=['%sfirst' % path, '%ssecond' % path]),
        # Plot(
        #     'Word Embeddings',
        #     channels=[
        #         [
        #             'cost_with_regularization'
        #         ]
        #     ])
    ]

    logger.info("Starting main loop...")
    main = MainLoop(data_stream=data_stream,
                    algorithm=algorithm,
                    extensions=extensions)

    main.run()

    pickle.dump(cg, open('%scg.pickle' % path, 'wb'))
Example #16
0
def main(save_to,
         num_epochs,
         feature_maps=None,
         mlp_hiddens=None,
         conv_sizes=None,
         pool_sizes=None,
         batch_size=500,
         num_batches=None):
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (32, 23)
    batch_size = 50
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    3,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(),
                                            probs).copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(),
                                                probs).copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])

    ########### Loading images #####################

    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream, ServerDataStream
    from fuel.schemes import ShuffledScheme
    from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation
    from fuel.transformers import Flatten, Cast, ScaleAndShift

    def create_data(data):
        stream = DataStream.default_stream(data,
                                           iteration_scheme=ShuffledScheme(
                                               data.num_examples, batch_size))
        stream_downscale = MinimumImageDimensions(
            stream, image_size, which_sources=('image_features', ))
        #stream_rotate = Random2DRotation(stream_downscale, which_sources=('image_features',))
        stream_max = ScikitResize(stream_downscale,
                                  image_size,
                                  which_sources=('image_features', ))
        stream_scale = ScaleAndShift(stream_max,
                                     1. / 255,
                                     0,
                                     which_sources=('image_features', ))
        stream_cast = Cast(stream_scale,
                           dtype='float32',
                           which_sources=('image_features', ))
        #stream_flat = Flatten(stream_scale, which_sources=('image_features',))
        return stream_cast

    stream_data_train = create_data(
        DogsVsCats(('train', ), subset=slice(0, 20000)))
    stream_data_test = create_data(
        DogsVsCats(('train', ), subset=slice(20000, 25000)))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=learningRate))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = []
    extensions.append(Timing())
    extensions.append(
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches))
    extensions.append(
        DataStreamMonitoring([cost, error_rate],
                             stream_data_test,
                             prefix="valid"))
    extensions.append(
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True))
    extensions.append(Checkpoint(save_to))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    model = Model(cost)

    ########### Loading images #####################
    main_loop = MainLoop(algorithm,
                         stream_data_train,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Example #17
0
def test_training_data_monitoring():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [
        numpy.array(f, dtype=theano.config.floatX)
        for f in [[1, 2], [3, 5], [5, 8]]
    ]
    targets = numpy.array([(weights * f).sum() for f in features])
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = W.sum().copy(name='W_sum')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        def before_batch(self, data):
            self.main_loop.log.current_row['true_cost'] = ((
                (W.get_value() * data["features"]).sum() - data["targets"])**2)

    # Note, that unlike a Theano variable, a monitored
    # quantity can't be reused in more than one TrainingDataMonitoring

    ftt1 = MeanFeaturesTimesTarget(requires=[x, y], name='ftt1')
    ftt2 = MeanFeaturesTimesTarget(requires=[x, y], name='ftt2')

    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_example_stream(),
                         algorithm=GradientDescent(cost=cost,
                                                   parameters=[W],
                                                   step_rule=Scale(0.001)),
                         extensions=[
                             FinishAfter(after_n_epochs=1),
                             TrainingDataMonitoring([W_sum, cost, V, ftt1],
                                                    prefix="train1",
                                                    after_batch=True),
                             TrainingDataMonitoring(
                                 [aggregation.mean(W_sum), cost, ftt2],
                                 prefix="train2",
                                 after_epoch=True),
                             TrueCostExtension()
                         ])

    main_loop.run()

    # Check monitoring of a shared varible
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, where as the extension writes after the batch is
        # processed. This is why the iteration numbers differs here.
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])
    assert_allclose(
        main_loop.log[n_batches]['train2_cost'],
        sum([main_loop.log[i]['true_cost']
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches]['train2_W_sum'],
        sum([
            main_loop.log[i]['train1_W_sum'] for i in range(1, n_batches + 1)
        ]) / n_batches)

    # Check monitoring of non-Theano quantites
    for i in range(n_batches):
        assert_allclose(main_loop.log[i + 1]['train1_ftt1'],
                        features[i] * targets[i])
        assert_allclose(main_loop.log[n_batches]['train2_ftt2'],
                        (features * targets[:, None]).mean(axis=0))
Example #18
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weghts_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost,  batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Incapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
Example #19
0
    #print(cg.variables)

    print("Created ComputationGraph, parameters:")
    #print(cg.parameters)
    for p in cg.parameters:
        print(str(p), p.dtype, p.shape.tag.test_value)

    print("Created ComputationGraph, inputs:")
    print(cg.inputs)

    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([
            StepClipping(10.0),
            Scale(0.01),
        ]),
    )
    print("Defined Algorithm")

    model = Model(cost)
    print("Defined Model")

    obs_max_length = named_copy(x.shape[0], "obs_max_length")
    observables = [
        cost,
        obs_max_length,
        #min_energy, max_energy,
        #mean_activation,
    ]
Example #20
0
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
            train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
            if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Example #21
0
def main(save_to,
         num_epochs,
         feature_maps=None,
         mlp_hiddens=None,
         conv_sizes=None,
         pool_sizes=None,
         batch_size=500):
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations,
                    1,
                    image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} dim: {} {} {}".format(i,
                                                     *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = named_copy(CategoricalCrossEntropy().apply(y.flatten(), probs),
                      'cost')
    error_rate = named_copy(MisclassificationRate().apply(y.flatten(), probs),
                            'error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)

    main_loop.run()
Example #22
0
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'],name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target=tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);


        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        costs_computer = function(context_sentences+context_sentence_masks+[target_sentence,
                                   target_sentence_mask,
                                   source_sentence,
                                   source_sentence_mask], (perplexity))
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init =decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init =decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])


        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))


        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        model_name=config['model_name'],
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if False:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq'],
                              n_best=3,
                              track_n_models=6))

        logger.info("Building perplexity validator")
        extensions.append(
                pplValidation(dev_source,dev_target, config=config,
                        model=costs_computer, data_stream=dev_stream,
                        model_name=config['model_name'],
                        every_n_batches=config['sampling_freq']))


        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'ppl':
        # Create Theano variables
        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get training and development set streams
        #tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);


        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        costs_computer = function(context_sentences+context_sentence_masks+[target_sentence,
                                   target_sentence_mask,
                                   source_sentence,
                                   source_sentence_mask], (cost))


        logger.info("Loading the model..")
        model = Model(cost)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())
        logger.info("Started Validation: ")

        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens=0.0
        #pbar = ProgressBar(max_value=len(ts)).start()#modified
        pbar = ProgressBar(max_value=10000).start();
        for i, (ctx_0,ctx_0_mask,ctx_1,ctx_1_mask,ctx_2,ctx_2_mask,src, src_mask, trg, trg_mask) in enumerate(ts):
            costs  = costs_computer(*[ctx_0,ctx_1,ctx_2,ctx_0_mask,ctx_1_mask,ctx_2_mask,trg, trg_mask,src, src_mask])
            cost = costs.sum()
            total_cost+=cost
            total_tokens+=trg_mask.sum()
            pbar.update(i + 1)
        total_cost/=total_tokens;
        pbar.finish()
        #dev_stream.reset()

        # run afterprocess
        # self.ap.main()
        total_cost=2**total_cost;
        print("Average validation cost: " + str(total_cost));
    elif mode == 'translate':

        logger.info('Creating theano variables')
        context_sentences=[];
        context_sentence_masks=[];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_'+str(i)));
            context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]);
        sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]);
        for i in range(config['ctx_num']):
            tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]);
            tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]);
            sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0);
            sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0);
        generated = decoder.generate(sentence_representations_list,sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs=[];
            input_=[];
            input_mask=[];
            for j in range(config['ctx_num']+1):
                seqs.append(sutils._oov_to_unk(
                    line[2*j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2*j+1][0],(config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j], (config['beam_size'], 1)))
            #v=costs_computer(input_[0]);
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={source_sentence: input_[3],source_sentence_mask:input_mask[3],
                                  context_sentences[0]: input_[0],context_sentence_masks[0]:input_mask[0],
                                  context_sentences[1]: input_[1],context_sentence_masks[1]:input_mask[1],
                                  context_sentences[2]: input_[2],context_sentence_masks[2]:input_mask[2]},
                    max_length=3*len(seqs[2]), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            b = numpy.argsort(costs)[0]
            #best=numpy.argsort(costs)[0:config['beam_size']];
            #for b in best:
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen=4*len(line[0][0]);
                #weight = weights[b][:, :totalLen]
                weight=weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        ap = afterprocesser(config)
        ap.main()
Example #23
0
print("Cost graph is built", file=sys.stderr)

model = Model(cost)
parameters = model.get_parameter_dict()

for brick in model.get_top_bricks():  #{
    brick.initialize()
#}

cg = ComputationGraph(cost)

algo = GradientDescent(cost=cost,
                       parameters=cg.parameters,
                       step_rule=CompositeRule(
                           [StepClipping(10.0),
                            Scale(0.01)]))
#algo = RMSProp(learning_rate=1.0, decay_rate=0.9)

max_length = chars.shape[0].copy(name="max_length")
observables = [
    batch_size, max_length, algo.total_step_norm, algo.total_gradient_norm,
    cost
]

# Construct the main loop and start training!
average_monitoring = TrainingDataMonitoring(observables,
                                            prefix="average",
                                            every_n_batches=10)

checkpoint_after = n_epochs / 5
#checkpoint_after=100;
Example #24
0
def train():

    if os.path.isfile('trainingdata.tar'):
        with open('trainingdata.tar', 'rb') as f:
            main = load(f)
    else:
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len, hidden_size, name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout'
        )

        transition = GatedRecurrent(
            activation=Tanh(),
            dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(readout=readout,
                                transition=transition,
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                name='sequencegenerator')

        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'

        cg = ComputationGraph(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=SequentialScheme(
                train_set.num_examples, batch_size=128))

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)
            ])

    main.run()
Example #25
0
if mode is "data_server":
    data_train_stream = ServerDataStream(('image_features','targets'), False, port=5560)
    data_valid_stream = ServerDataStream(('image_features','targets'), False, port=5561)


### Setting up the model
probs = top_mlp.apply(conv_out)

cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')
error = MisclassificationRate().apply(y.flatten(), probs)
error_rate = error.copy(name='error_rate')
error_rate2 = error.copy(name='error_rate2')
cg = ComputationGraph([cost, error_rate])

### Gradient Descent
algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learning_rate))

extensions = [Timing(),
              FinishAfter(after_n_epochs=num_epochs),
              DataStreamMonitoring(
                  [cost, error_rate, error_rate2],
                  data_valid_stream,
                  prefix="valid"),
              TrainingDataMonitoring(
                  [cost, error_rate,
                   aggregation.mean(algorithm.total_gradient_norm)],
                  prefix="train",
                  after_epoch=True),
             Checkpoint(save_to),
              ProgressBar(),
              Printing()]
Example #26
0
 def __init__(self, learning_rate=1.0, momentum=0.):
     scale = Scale(learning_rate=learning_rate)
     basic_nesterov_momentum = BasicNesterovMomentum(momentum=momentum)
     self.learning_rate = scale.learning_rate
     self.momentum = basic_nesterov_momentum.momentum
     self.components = [scale, basic_nesterov_momentum]
Example #27
0
def main(save_to, model, train, test, num_epochs, input_size = (150,150), learning_rate=0.01,
batch_size=50, num_batches=None, flatten_stream=False):
    """ 
    save_to : where to save trained model
    model : model given in input must be already initialised (works with convnet and mlp)
    
    input_size : the shape of the reshaped image in input (before flattening is applied if flatten_stream is True)
    
    """
    if flatten_stream :
        x = tensor.matrix('image_features')
    else :
        x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    #Data augmentation
    #insert data augmentation here 
    
    #Generating stream
    train_stream = DataStream.default_stream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size)
    )

    test_stream = DataStream.default_stream(
        test,
        iteration_scheme=ShuffledScheme(test.num_examples, batch_size)
    )
    
    
    #Reshaping procedure
    #Add a crop option in scikitresize so that the image is not deformed
    
    #Resize to desired square shape
    train_stream = ScikitResize(train_stream, input_size, which_sources=('image_features',))
    test_stream = ScikitResize(test_stream, input_size, which_sources=('image_features',))
    
    #Flattening the stream
    if flatten_stream is True:
        train_stream = Flatten(train_stream, which_sources=('image_features',))
        test_stream = Flatten(test_stream, which_sources=('image_features',))
    
    # Apply input to model
    probs = model.apply(x)
    
    #Defining cost and various indices to watch
    #print(probs)
    #cost = SquaredError().apply(y.flatten(),probs)

    cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(
            name='error_rate')

    #Building Computation Graph
    cg = ComputationGraph([cost, error_rate])

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=learning_rate))
    
    #Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  TrainingDataMonitoring([cost, error_rate,aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=5),
                  DataStreamMonitoring([cost, error_rate],test_stream,prefix="test", every_n_batches=25),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=5)]

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.


    model = Model(cost)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()
Example #28
0
def run(get_model, model_name):
    train_stream = ServerDataStream(
        ('cases', 'image_features', 'image_targets', 'multiplier'),
        False,
        hwm=10)
    valid_stream = ServerDataStream(
        ('cases', 'image_features', 'image_targets', 'multiplier'),
        False,
        hwm=10,
        port=5558)

    input_var = tensor.tensor4('image_features')
    target_var = tensor.tensor4('image_targets')
    multiply_var = tensor.matrix('multiplier')
    multiply_var = T.addbroadcast(multiply_var, 1)

    test_prediction, prediction, params = get_model(input_var, target_var,
                                                    multiply_var)

    loss = binary_crossentropy(prediction, target_var).mean()

    loss.name = 'loss'

    valid_error = T.neq((test_prediction > 0.5) * 1., target_var).mean()
    valid_error.name = 'error'

    scale = Scale(0.1)
    algorithm = GradientDescent(
        cost=loss,
        parameters=params,
        step_rule=scale,
        #step_rule=Adam(),
        on_unused_sources='ignore')

    host_plot = 'http://localhost:5006'

    extensions = [
        Timing(),
        TrainingDataMonitoring([loss], after_epoch=True),
        DataStreamMonitoring(variables=[loss, valid_error],
                             data_stream=valid_stream,
                             prefix="valid"),
        Plot('%s %s %s' %
             (model_name, datetime.date.today(), time.strftime('%H:%M')),
             channels=[['loss', 'valid_loss'], ['valid_error']],
             after_epoch=True,
             server_url=host_plot),
        Printing(),
        # Checkpoint('train'),
        FinishAfter(after_n_epochs=10)
    ]

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    cg = ComputationGraph(test_prediction)
    while True:
        main_loop.run()
        scale.learning_rate.set_value(
            numpy.float32(scale.learning_rate.get_value() * 0.7))
        numpy.savez('best_weights.npz',
                    [param.get_value() for param in cg.shared_variables])
Example #29
0
mnist_train = MNIST("train")
train_stream = Flatten(
    DataStream.default_stream(dataset=mnist_train,
                              iteration_scheme=SequentialScheme(
                                  mnist_train.num_examples, 128)), )

# load testing data
mnist_test = MNIST("test")
test_stream = Flatten(
    DataStream.default_stream(dataset=mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 1024)), )

# train the model
from blocks.model import Model
main_loop = MainLoop(model=Model(cost),
                     data_stream=train_stream,
                     algorithm=GradientDescent(
                         cost=cost,
                         params=ComputationGraph(cost).parameters,
                         step_rule=Scale(learning_rate=0.1)),
                     extensions=[
                         FinishAfter(after_n_epochs=5),
                         DataStreamMonitoring(variables=[cost, error_rate],
                                              data_stream=test_stream,
                                              prefix="test"),
                         Printing()
                     ])

main_loop.run()
Example #30
0
seq_gen.initialize()

# z markov_tutorial
x = tensor.lvector('features')
x = x.reshape((x.shape[0], 1))
cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1])
cost.name = "negative log-likelihood"
cost_cg = ComputationGraph(cost)

print VariableFilter(roles=[WEIGHT])(cost_cg.variables)
# theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png", var_with_name_simple=True)

algorithm = GradientDescent(cost=cost,
                            parameters=list(
                                Selector(seq_gen).get_parameters().values()),
                            step_rule=Scale(0.001))

# AUDIOSCOPE OBSERVABLES (some)
observables = []
observables += cost_cg.outputs
observables.append(algorithm.total_step_norm)
observables.append(algorithm.total_gradient_norm)

print observables

# AUDIOSCOPE EXTENSIONS
extensions = []
extensions.append(Timing(after_batch=True))
extensions.append(TrainingDataMonitoring(list(observables), after_batch=True))
averaging_frequency = 1000
average_monitor = TrainingDataMonitoring(observables,