Example 1
def test_adagrad():
    a = shared_floatx([3, 4])
    cost = (a**2).sum()
    steps, updates = AdaGrad().compute_steps(
        OrderedDict([(a, tensor.grad(cost, a))]))
    f = theano.function([], [steps[a]], updates=updates)

    rtol = 1e-4
    assert_allclose(f()[0], [0.002, 0.002], rtol=rtol)
    a.set_value([2, 3])
    assert_allclose(f()[0], [0.0011094, 0.0012], rtol=rtol)
    a.set_value([1, 1.5])
    assert_allclose(f()[0], [0.00053452, 0.0005747], rtol=rtol)
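For reference, Blocks' AdaGrad divides each gradient by the square root of the running sum of squared gradients. A minimal numpy sketch, assuming the library defaults learning_rate=0.002 and epsilon=1e-6, reproduces the three step values asserted above:

import numpy as np

learning_rate, epsilon = 0.002, 1e-6  # assumed Blocks defaults
accum = np.zeros(2)
for a in ([3., 4.], [2., 3.], [1., 1.5]):
    grad = 2 * np.asarray(a)          # gradient of (a**2).sum()
    accum += grad ** 2                # AdaGrad's squared-gradient accumulator
    step = learning_rate * grad / np.sqrt(accum + epsilon)
    print(step)                       # matches the assert_allclose targets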
Example 2
def setup_algorithms(cost, cg, method, type="ff"):
    """Setup training algorithm.

    Parameters
    ----------
    cost : expression
        cost expression
    cg : ComputationGraph
        Computation graph
    method : string
        training method: SGD, momentum SGD, AdaGrad, RMSprop
    learning_rate : float
        learning rate for learning method

    Returns
    -------
    algorithm : GradientDescent
        Gradient Descent algorithm based on different optimization method
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()

    if type == "RNN":
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    return algorithm
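A hypothetical call site for this helper; `cost` and its graph are assumed to exist already, and the names are illustrative:

cg = ComputationGraph([cost])
algorithm = setup_algorithms(cost, cg, method="adagrad", type="RNN")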
Example 3
def test_adagrad_broadcastable():
    verify_broadcastable_handling(AdaGrad())
Example 4
extra_updates = []

# Learning optimizer
if training_optimizer == 'Adam':
    step_rules = [
        Adam(learning_rate=learning_rate),
        StepClipping(step_clipping)
    ]  # , VariableClipping(threshold=max_norm_threshold)
elif training_optimizer == 'RMSProp':
    step_rules = [
        RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
elif training_optimizer == 'Adagrad':
    step_rules = [
        AdaGrad(learning_rate=learning_rate),
        StepClipping(step_clipping)
    ]
elif training_optimizer == 'Adadelta':
    step_rules = [AdaDelta(decay_rate=decay_rate), StepClipping(step_clipping)]

parameters_to_update = cg.parameters
algorithm = GradientDescent(cost=cg.outputs[0],
                            parameters=parameters_to_update,
                            step_rule=CompositeRule(step_rules))
algorithm.add_updates(extra_updates)

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, step_rules[0].learning_rate, gradient_norm, step_norm]
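One plausible way to consume monitored_vars, sketched with the standard Blocks monitoring extension; train_stream and the rest of the loop are assumptions:

train_monitor = TrainingDataMonitoring(monitored_vars, prefix='train',
                                       after_batch=True)
main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                     model=Model(cg.outputs[0]),
                     extensions=[train_monitor, Printing()])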
Example 5
for p in weightList:
  reg2 += T.sum(p ** 2)
cost += 0.00001 * reg2

n_epochs = 15
if "n_epochs" in config:
  n_epochs = int(config["n_epochs"])

params = cg.parameters
model = Model([cost])
print "model parameters:"
print model.get_parameter_dict()

if "adagrad" in config:
  print "using adagrad"
  thisRule=AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
  print "using adadelta"
  thisRule=AdaDelta()
elif "momentum" in config:
  print "using momentum"
  mWeight = float(config["momentum"])
  thisRule=Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
  print "using traditional SGD"
  thisRule=Scale(learning_rate=learning_rate)

if "gradientClipping" in config:
  threshold = float(config["gradientClipping"])
  print "using gradient clipping with threshold ", threshold
  thisRule=CompositeRule([StepClipping(threshold), thisRule])
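A hypothetical config that would exercise the AdaGrad-plus-clipping path above (the snippet only tests key presence, so the values just need to parse):

config = {"adagrad": "1", "gradientClipping": "1.0", "n_epochs": "20"}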
Example 6
def main(job_id, params):
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'test_batch', 512))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(
        config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}
    data_file = config.get('hyperparams', 'data_file')

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)

    else:
        train = H5PYDataset(data_file,
                            which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file,
                            which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file,
                           which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[input_dim[side], hidden_units, hidden_units, 2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started to overfit
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
Example 7
max_iter = int(config.get('hyperparams', 'max_iter', 100))
base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
train_batch = int(config.get('hyperparams', 'train_batch', 256))
valid_batch = int(config.get('hyperparams', 'valid_batch', 256))
test_batch = int(config.get('hyperparams', 'test_batch', 256))

W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
W_b = float(config.get('hyperparams', 'W_b', 0.01))
dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
solver = config.get('hyperparams', 'solver_type', 'rmsprop')
data_file = config.get('hyperparams', 'data_file')

if 'adagrad' in solver:
    solver_type = AdaGrad()
else:
    solver_type = RMSProp(learning_rate=base_lr)

pre_trained_folder = '../models/'
input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}

train = H5PYDataset(data_file, which_set='train', sources=['l_features', 'r_features'])
valid = H5PYDataset(data_file, which_set='valid', sources=['l_features', 'r_features'])
test = H5PYDataset(data_file, which_set='test', sources=['l_features', 'r_features'])
x_l = tensor.matrix('l_features')
x_r = tensor.matrix('r_features')
x = tensor.concatenate([x_l, x_r], axis=1)

# Define a feed-forward net with an input, two hidden layers, and a softmax output:
autoencoder = MLP(activations=[
Example 8
############ BLOCKS ########################
# wrap everything in Blocks objects and run!
######### training ##################
n_epochs = 15
if "n_epochs" in config:
    n_epochs = int(config["n_epochs"])
print "number of training epochs: ", n_epochs

model = Model([cost])
print "model parameters:"
print model.get_parameter_dict()
if "adagrad" in config:
    print "using adagrad"
    algorithm = GradientDescent(cost=cost,
                                parameters=params,
                                step_rule=AdaGrad(learning_rate=learning_rate),
                                on_unused_sources='warn')
elif "adadelta" in config:
    print "using adadelta"
    algorithm = GradientDescent(cost=cost,
                                parameters=params,
                                step_rule=AdaDelta(),
                                on_unused_sources='warn')
elif "momentum" in config:
    print "using momentum"
    mWeight = float(config["momentum"])
    algorithm = GradientDescent(cost=cost,
                                parameters=params,
                                step_rule=Momentum(learning_rate=learning_rate,
                                                   momentum=mWeight),
                                on_unused_sources='warn')
Example 9
    def set_adagrad(self, learning_rate):
        self.step_rules.append(AdaGrad(learning_rate))
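Downstream, the queued rules would presumably be folded into one step rule; a sketch of a hypothetical companion method, assuming the CompositeRule-based setup seen in the other examples:

    def build_algorithm(self, cost, parameters):
        # Hypothetical helper: apply every queued rule (e.g. AdaGrad plus
        # clipping) in order as a single CompositeRule.
        return GradientDescent(cost=cost, parameters=parameters,
                               step_rule=CompositeRule(self.step_rules))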
Example 10
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name,
         max_length, bokeh, GRU, dropout, depth, max_grad, step_method,
         epsilon, sample):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [
        name for name in transition.apply.sequences if 'mask' not in name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example 11
    def training(self,
                 fea2obj,
                 batch_size,
                 learning_rate=0.005,
                 steprule='adagrad',
                 wait_epochs=5,
                 kl_weight_init=None,
                 klw_ep=50,
                 klw_inc_rate=0,
                 num_epochs=None):
        networkfile = self._config['net']

        n_epochs = num_epochs or int(self._config['nepochs'])
        reg_weight = float(self._config['loss_weight'])
        reg_type = self._config['loss_reg']
        numtrain = int(
            self._config['num_train']) if 'num_train' in self._config else None
        train_stream, num_samples_train = get_comb_stream(
            fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
        dev_stream, num_samples_dev = get_comb_stream(fea2obj,
                                                      'dev',
                                                      batch_size=None,
                                                      shuffle=False)
        logger.info('sources: %s -- number of train/dev samples: %d/%d',
                    train_stream.sources, num_samples_train, num_samples_dev)

        t2idx = fea2obj['targets'].t2idx
        klw_init = kl_weight_init or float(
            self._config['kld_weight']) if 'kld_weight' in self._config else 1
        logger.info('kl_weight_init: %d', klw_init)
        kl_weight = shared_floatx(klw_init, 'kl_weight')
        entropy_weight = shared_floatx(1., 'entropy_weight')

        cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new(
            fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

        cg = ComputationGraph(cost)

        weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
        logger.info('Model weights are: %s', weights)
        if 'L2' in reg_type:
            cost += reg_weight * l2_norm(weights)
            logger.info('applying %s with weight: %f ', reg_type, reg_weight)

        dropout = -0.1
        if dropout > 0:
            cg = apply_dropout(cg, weights, dropout)
            cost = cg.outputs[0]

        cost.name = 'cost'
        logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule,
                    learning_rate)
        if 'adagrad' in steprule:
            cnf_step_rule = AdaGrad(learning_rate)
        elif 'adadelta' in steprule:
            cnf_step_rule = AdaDelta(decay_rate=0.95)
        elif 'decay' in steprule:
            cnf_step_rule = RMSProp(learning_rate=learning_rate,
                                    decay_rate=0.90)
            cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
        elif 'momentum' in steprule:
            cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
        elif 'adam' in steprule:
            cnf_step_rule = Adam(learning_rate=learning_rate)
        else:
            raise ValueError('Unknown steprule: %s' % steprule)

        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=cnf_step_rule,
                                    on_unused_sources='warn')
        #algorithm.add_updates(updates)
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [
            cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
            pat1_recog
        ]
        train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                               after_batch=True,
                                               before_first_epoch=True,
                                               prefix='tra')

        dev_monitor = DataStreamMonitoring(variables=[
            cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate
        ],
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           data_stream=dev_stream,
                                           prefix="dev")

        extensions = [
            dev_monitor,
            train_monitor,
            Timing(),
            TrackTheBest('dev_cost'),
            FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                       epochs=wait_epochs),
            Printing(after_batch=False),  #, ProgressBar()
            FinishAfter(after_n_epochs=n_epochs),
            saveload.Load(networkfile + '.toload.pkl'),
        ] + track_best('dev_cost', networkfile + '.best.pkl')

        #extensions.append(SharedVariableModifier(kl_weight,
        #                                          lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
        #         extensions.append(SharedVariableModifier(entropy_weight,
        #                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

        logger.info('number of parameters in the model: %d',
                    tensor.sum([p.size for p in cg.parameters]).eval())
        logger.info('Lookup table sizes: %s',
                    [p.size.eval() for p in cg.parameters if 'lt' in p.name])

        main_loop = MainLoop(data_stream=train_stream,
                             algorithm=algorithm,
                             model=Model(cost),
                             extensions=extensions)
        main_loop.run()
Example 12
cost = CategoricalCrossEntropy().apply(y, y_hat)

cg = ComputationGraph(cost)
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum()
cost.name = "loss"

#
# the actual training of the model
#
main = MainLoop(
    data_stream=DataStream.default_stream(dataset,
                                          iteration_scheme=SequentialScheme(
                                              dataset.num_instances,
                                              batch_size=512)),
    algorithm=GradientDescent(cost=cost,
                              parameters=cg.parameters,
                              step_rule=AdaGrad()),
    extensions=[
        ProgressBar(),
        #FinishAfter(after_n_epochs=10),
        #Printing(),
        TrainingDataMonitoring(variables=[cost], after_batch=True),
        SaveWeights(layers=[W1, W2], prefixes=["./data/w1", "./data/w2"]),
        #VisualizeWordVectors(layers=[W1, W2], labels=dataset.word_dict),
    ])

main.run()
time2 = time.time()
print "time for building the model: " + str(time2 - time1)

######### training ##################
n_epochs = 15
if "n_epochs" in config:
    n_epochs = int(config["n_epochs"])

model = Model([cost])
print model.get_parameter_dict()

curStepRule = Scale(learning_rate=learning_rate)
if "adagrad" in config:
    print "using adagrad"
    curStepRule = AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
    print "using adadelta"
    curStepRule = AdaDelta()
elif "momentum" in config:
    print "using momentum"
    mWeight = float(config["momentum"])
    curStepRule = Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
    print "using traditional SGD"

algorithm = GradientDescent(cost=cost,
                            parameters=params,
                            step_rule=curStepRule,
                            on_unused_sources='warn')
extensions = []
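A typical continuation for the empty extensions list, mirroring the monitoring used elsewhere in these examples; `stream` stands in for an assumed Fuel data stream:

extensions.append(TrainingDataMonitoring([cost], prefix="train",
                                         after_batch=True))
extensions.append(FinishAfter(after_n_epochs=n_epochs))
extensions.append(Printing())
main_loop = MainLoop(data_stream=stream, algorithm=algorithm, model=model,
                     extensions=extensions)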
Example 14
def main(job_id, params):

    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))

    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    test_batch = int(config.get('hyperparams', 'test_batch', 512))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 16))

    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))

    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    right_dim = 10519
    left_dim = 11427

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers:

    # Inputs -> hidden_1 -> hidden 2
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    ll1 = lnet.children[0]
    lr1 = lnet.children[1]
    ll2 = lnet.children[2]
    lr2 = lnet.children[3]

    rl1 = rnet.children[0]
    rr1 = rnet.children[1]
    rl2 = rnet.children[2]
    rr2 = rnet.children[3]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output
    output_mlp = MLP(activations=[
        Rectifier(name='h3'),
        Rectifier(name='h4'),
        Softmax(name='output'),
    ],
                     dims=[
                         input_dim,
                         hidden_units,
                         hidden_units,
                         2,
                     ],
                     weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                     biases_init=IsotropicGaussian(std=b_sd, mean=b_mu))

    output_mlp.initialize()

    # # Concatenate the inputs from the two hidden subnets into a single variable
    # # for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    #
    y_hat = output_mlp.apply(merge)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # If no fine-tuning of l-r models is wanted, find the params for only
    # the joint layers:
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning Algorithm:
    algo = GradientDescent(step_rule=solver_type,
                           params=params_to_update,
                           cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(dataset=train,
                                  iteration_scheme=ShuffledScheme(
                                      train.num_examples,
                                      batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(dataset=valid,
                                  iteration_scheme=ShuffledScheme(
                                      valid.num_examples,
                                      batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot(
        'AdniNet_LeftRight',
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
    )

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             FinishIfNoImprovementAfter(
                                 notification_name='validation_error',
                                 epochs=1),
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
Example 15
    def build_theano_functions(self) :
        # shape of the Theano input is time+1 X features
        x = T.fmatrix('frequency_sequence')
        x = x.reshape((self.batch_dim, self.time_dim+1, self.input_dim))

        y = x[:,1:self.time_dim+1,:]
        x = x[:,:self.time_dim,:]

        layers_input = [x]
        dims = np.array([self.input_dim])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            #weights_init=IsotropicGaussian(mean=1.,std=1),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # the idea is to have one gaussian parametrize every frequency bin
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  #weights_init=IsotropicGaussian(mean=0., std=1),
                                  weights_init=Orthogonal(self.orth_scale),
                                  biases_init=Constant(0),
                                  #use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05
        mus = y_hat[:,:,self.output_dim/2:]

        # sum likelihood with targets
        # sum inside log accross mixtures, sum outside log accross time
        inside_expo = -0.5*((y-mus)**2)/sig**2
        expo = T.exp(inside_expo)
        coeff = 1./(T.sqrt(2.*np.pi)*sig)
        inside_log = T.log(coeff*expo)
        inside_log_max = T.max(inside_log, axis=2, keepdims=True)
        LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model

        algorithm = GradientDescent(
            cost=LL,
            parameters=model.parameters,
            step_rule=AdaGrad())

        f = theano.function([x],[sig, mus])

        return algorithm, f
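A sketch of how the returned pair might be driven, assuming an instance `net` of this class and a data stream of frequency sequences (both names are illustrative):

algorithm, predict = net.build_theano_functions()
main_loop = MainLoop(model=net.model, data_stream=stream,
                     algorithm=algorithm,
                     extensions=[Printing(), FinishAfter(after_n_epochs=10)])
main_loop.run()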