def visualize_singular_values(args):
    param_values = load_parameter_values(args.load_path)
    for d in range(args.layers):
        if args.rnn_type == 'lstm':
            ws = param_values["/recurrentstack/lstm_" + str(d) + ".W_state"]
            w_rec = ws[:, 3 * args.state_dim:]
        elif args.rnn_type == 'simple':
            w_rec = param_values["/recurrentstack/simplerecurrent_" + str(d) +
                                 ".W_state"]
        else:
            raise NotImplementedError
        U, s, V = np.linalg.svd(w_rec, full_matrices=True)
        plt.subplot(2, 1, 1)
        plt.plot(np.arange(s.shape[0]), s, label='Layer_' + str(d))
        plt.grid(True)
        plt.legend(loc='upper right')
        plt.title("Singular_values_of_recurrent_weights")
        plt.subplot(2, 1, 2)
        plt.plot(np.arange(s.shape[0]), np.log(s + 1E-15),
                 label='Layer_' + str(d))
        plt.grid(True)
        plt.title("Log_singular_values_of_recurrent_weights")
    plt.tight_layout()

    plt.savefig(args.save_path + "/visualize_singular_values.png")
    logger.info("Figure \"visualize_singular_values"
                ".png\" saved at directory: " + args.save_path)
 def load_params(self, path):
     graphs = [
         self.get_cost_graph().outputs[0],
         ComputationGraph(self.get_generate_graph()['outputs'])
     ]
     param_values = load_parameter_values(path)
     for graph in graphs:
         SpeechModel(graph).set_parameter_values(param_values)
Example #4
 def load_to(self, main_loop):
     main_loop.model.set_parameter_values(load_parameter_values(self.path))
     if self.load_iteration_state or self.load_log:
         with open(self.path, "rb") as source:
             loaded_main_loop = load(source)
         if self.load_log:
             main_loop.log = loaded_main_loop.log
         if self.load_iteration_state:
             main_loop.iteration_state = loaded_main_loop.iteration_state
def test_serialization():
    # Create a simple brick with two parameters
    mlp = MLP(activations=[None, None],
              dims=[10, 10, 10],
              weights_init=Constant(1.),
              use_bias=False)
    mlp.initialize()
    W = mlp.linear_transformations[1].W
    W.set_value(W.get_value() * 2)

    # Check the data using numpy.load
    with NamedTemporaryFile(delete=False) as f:
        dump(mlp, f)
    numpy_data = numpy.load(f.name)
    assert set(numpy_data.keys()) == \
        set(['mlp-linear_0.W', 'mlp-linear_1.W', 'pkl'])
    assert_allclose(numpy_data['mlp-linear_0.W'], numpy.ones((10, 10)))
    assert numpy_data['mlp-linear_0.W'].dtype == theano.config.floatX

    # Ensure that it can be unpickled
    mlp = load(f.name)
    assert_allclose(mlp.linear_transformations[1].W.get_value(),
                    numpy.ones((10, 10)) * 2)

    # Ensure that only parameters are saved as NPY files
    mlp.random_data = numpy.random.rand(10)
    with NamedTemporaryFile(delete=False) as f:
        dump(mlp, f)
    numpy_data = numpy.load(f.name)
    assert set(numpy_data.keys()) == \
        set(['mlp-linear_0.W', 'mlp-linear_1.W', 'pkl'])

    # Ensure that parameters can be loaded with correct names
    parameter_values = load_parameter_values(f.name)
    assert set(parameter_values.keys()) == \
        set(['/mlp/linear_0.W', '/mlp/linear_1.W'])

    # Ensure that duplicate names are dealt with
    for child in mlp.children:
        child.name = 'linear'
    with NamedTemporaryFile(delete=False) as f:
        dump(mlp, f)
    numpy_data = numpy.load(f.name)
    assert set(numpy_data.keys()) == \
        set(['mlp-linear.W', 'mlp-linear.W_2', 'pkl'])

    # Ensure warnings are raised when __main__ namespace objects are dumped
    foo.__module__ = '__main__'
    import __main__
    __main__.__dict__['foo'] = foo
    mlp.foo = foo
    with NamedTemporaryFile(delete=False) as f:
        with warnings.catch_warnings(record=True) as w:
            dump(mlp, f)
            assert len(w) == 1
            assert '__main__' in str(w[-1].message)
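The assertions above rely on a mapping between the NPZ archive keys ('mlp-linear_0.W') and the parameter paths returned by load_parameter_values ('/mlp/linear_0.W'). A small helper sketch, assuming only the dash/slash naming convention visible in those assertions (the special 'pkl' entry holds the pickled object, not a parameter):

def archive_key_to_path(key):
    # Convert an archive key such as 'mlp-linear_0.W' into the
    # '/mlp/linear_0.W' form used by load_parameter_values.
    if key == 'pkl':
        return None
    return '/' + key.replace('-', '/')

assert archive_key_to_path('mlp-linear_0.W') == '/mlp/linear_0.W'
assert archive_key_to_path('pkl') is None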
def run_visualizations(cost,
                       updates,
                       train_stream,
                       valid_stream,
                       args,
                       hidden_states=None,
                       gate_values=None):

    # Load the parameters from a dumped model
    assert args.load_path is not None
    model = Model(cost)
    model.set_parameter_values(load_parameter_values(args.load_path))

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost, hidden_states, updates, train_stream,
                           valid_stream, args)

    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        else:
            assert False

    elif args.visualize == "states":
        visualize_states(hidden_states, updates, train_stream, valid_stream,
                         args)

    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates, train_stream, valid_stream,
                            args)

    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates, train_stream, valid_stream,
                           args)

    elif args.visualize == "presoft":
        visualize_presoft(cost, hidden_states, updates, train_stream,
                          valid_stream, args)

    elif args.visualize == "matrices":
        visualize_matrices(args)

    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)

    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates, args)

    else:
        assert False
def fine_tuning(cost, args):
    param_values = load_parameter_values(args.fine_tuning)

    param_values[
        "/output_layer.W"] = np.concatenate((
            param_values["/output_layer.W"],
            0.1 * np.random.randn(args.state_dim, 40).astype(np.float32)))

    model = Model(cost)
    model.set_parameter_values(param_values)

    return cost
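fine_tuning above stacks a freshly initialized block of rows onto /output_layer.W. A toy numpy sketch of the shape arithmetic, assuming the (layers * state_dim, output_size) layout that the hide_all_except branch of run_visualizations further down also assumes; the sizes here are illustrative only:

import numpy as np

state_dim, output_size, layers = 3, 40, 2              # toy sizes
old_w = np.zeros((layers * state_dim, output_size), dtype=np.float32)
new_rows = 0.1 * np.random.randn(state_dim, output_size).astype(np.float32)

# np.concatenate defaults to axis=0, so this appends state_dim extra rows,
# e.g. for the states of one newly added layer.
new_w = np.concatenate((old_w, new_rows))
assert new_w.shape == ((layers + 1) * state_dim, output_size)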
Example #10
    def load_to(self, main_loop):
        aux_param_step = load_parameter_values(self.path)
        # Ugly hack, but I'm lazy and this only needs to work for a few
        # days with a limited set of models.
        # I changed things in set_parameter_values; remember, it is ugly.

        main_loop.model.set_parameter_values(aux_param_step)
        if self.load_iteration_state or self.load_log:
            with open(self.path, "rb") as source:
                loaded_main_loop = load(source)
            if self.load_log:
                main_loop.log = loaded_main_loop.log
            if self.load_iteration_state:
                main_loop.iteration_state = loaded_main_loop.iteration_state
def visualize_eigenvalues(args):
    path = args.load_path
    layers = args.layers

    param_values = load_parameter_values(path)
    print(param_values.keys())

    matrices = {}
    for i in range(layers):
        matrices[i] = param_values["/recurrentstack/simplerecurrent_" +
                                   str(i) + ".W"]
        u, diag, v = svd(matrices[i])
        print(diag)
def visualize_matrices(args):

    param_values = load_parameter_values(args.load_path)
    print(param_values.keys())

    input0 = param_values["/fork/fork_inputs/lookuptable.W_lookup"]
    input1 = param_values["/fork/fork_inputs_1/lookuptable.W_lookup"]
    input2 = param_values["/fork/fork_inputs_2/lookuptable.W_lookup"]
    input3 = param_values["/fork/fork_inputs_3/lookuptable.W_lookup"]

    inputall = np.abs(np.concatenate((input0, input1, input2, input3), axis=1))

    outputall = np.abs(param_values["/output_layer.W"])

    # plt.matshow(inputall / np.mean(inputall), cmap=plt.cm.gray)
    # plt.matshow(outputall / np.mean(outputall), cmap=plt.cm.gray)
    plt.plot(np.arange(outputall.shape[0]), np.sum(np.abs(outputall), axis=1))

    if args.local:
        plt.show()
    else:
        plt.savefig(args.save_path + "/visualize_matrices.png")
        logger.info("Figure \"visualize_matrices"
                    ".png\" saved at directory: " + args.save_path)
Example #14
def initialize_graph(recognizer, data, config, params):
    # Separate attention_params to be handled differently
    # when regularization is applied
    attentions = recognizer.all_children().generator.transition.attention.get()
    attention_params = [Selector(attention).get_parameters().values()
                        for attention in attentions]

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    observables = []  # monitored each batch
    cg = recognizer.get_cost_graph(batch=True)
    labels = []
    labels_mask = []
    for chld in recognizer.children:
        lbls = VariableFilter(applications=[chld.cost], name='labels'+chld.names_postfix)(cg)
        lbls_mask = VariableFilter(applications=[chld.cost], name='labels_mask'+chld.names_postfix)(cg)
        if len(lbls) == 1:
            labels += lbls
            labels_mask += lbls_mask

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(labels[0].shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size

    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls its output output_0
        applications=recognizer.all_children().bottom.apply.get(), name_regex="output")(
            cost_cg)
    
    attended = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(), name="attended")(
            cost_cg)
    attended_mask = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(), name="attended_mask")(
            cost_cg)
    weights = VariableFilter(
        applications=recognizer.all_children().generator.evaluate.get(), name="weights")(
            cost_cg)
    
    def get_renamed_list(rlist, elem_func, elem_name):
        return [rename(elem_func(elem), elem_name+chld.names_postfix)
                    for elem,chld in zip(rlist, recognizer.children)]
        
    max_sentence_lengths = get_renamed_list(bottom_output,
                                            lambda e: e.shape[0],
                                            "max_sentence_length")
    max_attended_mask_lengths = get_renamed_list(attended_mask,
                                            lambda e: e.shape[0],
                                            "max_attended_mask_length")
    max_attended_lengths = get_renamed_list(attended,
                                            lambda e: e.shape[0],
                                            "max_attended_length")
    max_num_characters = get_renamed_list(labels,
                                            lambda e: e.shape[0],
                                            "max_num_characters")
    
    mean_attended = get_renamed_list(attended,
                                            lambda e: abs(e).mean(),
                                            "mean_attended")
    mean_bottom_output = get_renamed_list(bottom_output,
                                            lambda e: abs(e).mean(),
                                            "mean_bottom_output")
    
    mask_density = get_renamed_list(labels_mask,
                                            lambda e: e.mean(),
                                            "mask_density")
    weights_entropy = [rename(entropy(w, lm),
                             "weights_entropy"+chld.names_postfix)
                       for w, lm, chld in zip(weights, labels_mask, recognizer.children)]

    observables += max_attended_lengths + max_attended_mask_lengths + max_sentence_lengths
    #
    # Monitoring of cost terms is tricky because of Blocks #514 - since the
    # costs are annotations that are not part of the original output graph,
    # they are unaffected by replacements such as dropout!!
    #
    cost_terms = []
    for chld in recognizer.children:
        chld_cost_terms = VariableFilter(applications=[chld.generator.evaluate],
                                name_regex='.*_nll')(cost_cg)
        chld_cost_terms = [rename(var, var.name[:-4] + chld.names_postfix + '_nll')
                           for var in chld_cost_terms]
        cost_terms += chld_cost_terms
        
    cg = ComputationGraph([cost, batch_size] +
        weights_entropy + mean_attended +
        mean_bottom_output + max_num_characters +
        mask_density + cost_terms)

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg

    if reg_config.get('dropout'):
        drop_conf = reg_config['dropout']
        bot_drop = drop_conf.get('bottom', 0.0)
        if bot_drop:
            logger.info('apply bottom dropout')
            regularized_cg = apply_dropout(regularized_cg,
                                           bottom_output, bot_drop)
        enc_drop = drop_conf.get('encoder', 0.0)
        if enc_drop:
            logger.info('apply encoder dropout')
            enc_bricks = reduce(lambda acc,x: acc+list(x), recognizer.all_children().encoder.children.get(), [])
            enc_states = VariableFilter(bricks=enc_bricks,
                                        name_regex='states')(regularized_cg)
            regularized_cg = apply_dropout(regularized_cg,
                                           enc_states,
                                           enc_drop)
        post_merge_drop = drop_conf.get('post_merge', 0.0)
        if post_merge_drop:
            logger.info('apply post_merge dropout')
            pm_bricks = []
            for chld in recognizer.children:
                cpm_bricks = list(chld.generator.readout.post_merge.children)
                cpm_bricks += cpm_bricks[-1].children
                cpm_bricks = [b for b in cpm_bricks if
                             isinstance(b, type(chld.post_merge_activation))]
                pm_bricks += cpm_bricks
            regularized_cg = apply_dropout(
                regularized_cg,
                VariableFilter(bricks=pm_bricks, name='output')(regularized_cg),
                post_merge_drop)

    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]

    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
        
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = train_cost.copy(name='train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(regularized_cg.outputs[0]
                                   ).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise')
        )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    if len(cost_terms):
        # Please note - the aggregation (mean) is done in
        # "attach_aggregation_schemes"
        ct_names = [v.name for v in cost_terms]
        for v in regularized_cg.outputs:
            if v.name in ct_names:
                observables.append(rename(v.sum()/batch_size,
                                                  v.name))
    for chld in recognizer.children:
        if chld.train_tags:
            tags_cost = VariableFilter(applications=[chld.addTagCost],
                                       name='output')(regularized_cg)[0]
            observables += [rename(tags_cost.sum()/batch_size, 'tags_nll'+chld.names_postfix)]

    # Model is a weird class; we spent lots of time arguing with Bart
    # about what it should be. However, it can already do nice things,
    # e.g. extract all the parameters from the computation graph and
    # give them hierarchical names. This helps to notice when, because
    # of some bug, a parameter is not in the computation graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # Please note: we cannot use recognizer.load_params,
        # as it builds a new computation graph that does not have
        # the shared variables added by adaptive weight noise.
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()

    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape) for key
                     in sorted(parameters.keys())],
                    width=120))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]

        
    return {'observables': observables, 'max_norm_rules': max_norm_rules,
            'cg': cg, 'regularized_cg': regularized_cg, 'train_cost': train_cost,
            'cost': cost, 'batch_size': batch_size, 'batch_cost': batch_cost,
            'parameters': parameters, 'gradients': gradients,
            'model': model, 'data': data, 'recognizer': recognizer,
            'weights_entropy': weights_entropy,
            'labels_mask': labels_mask, 'labels': labels}
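The weight-decay term built above is decay * l2_norm(weights) ** 2. Assuming l2_norm is the global Euclidean norm over all listed variables, as the name suggests, this is just the sum of squares of every weight; a plain numpy sketch with toy matrices:

import numpy as np

weights = [np.random.randn(4, 5), np.random.randn(3, 3)]    # toy weight matrices
l2 = np.sqrt(sum((w ** 2).sum() for w in weights))          # global L2 norm
decay = 1e-4
penalty = decay * l2 ** 2                                   # == decay * sum of squares
assert np.isclose(penalty, decay * sum((w ** 2).sum() for w in weights))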
Example #15
convnet.layers[0].weights_init = Uniform(width=0.2)
convnet.layers[1].weights_init = Uniform(width=0.2)
convnet.layers[2].weights_init = Uniform(width=0.2)
convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.2)
convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.2)
convnet.initialize()
'''
#########################################################
#Generate output and error signal
predict = convnet.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost')
cg = ComputationGraph(cost)

#Load the parameters of the model
params = load_parameter_values('catsVsDogs256.pkl')
mo = Model(predict)
mo.set_parameter_values(params)
print(mo.inputs)
print(dir(mo.inputs))
print(mo.outputs)

f = theano.function(mo.inputs, mo.outputs, allow_input_downcast=True)
predictions = []
k = 0
for batch in stream_test.get_epoch_iterator():
    example = numpy.array([batch[0]])
    batch_predictions = f(example)
    #batch_predictions=batch_predictions[0] #get the array
    for result in batch_predictions:
        res = numpy.argmax(result)
        predictions.append(res)
Example #16
 def load_params(self, path):
     generated = self.get_generate_graph()
     param_values = load_parameter_values(path)
     SpeechModel(generated['outputs']).set_parameter_values(param_values)
Example #17
def main(argv):
    name = argv[1]
    files = map(lambda p: join(folder, p), listdir(folder))

    file = next(filter(lambda n: name in n, files))
    print(file)

    p = load_parameter_values(file)

    net = net_dvc((128,128))

    x = tensor.tensor4('image_features')
    y_hat = net.apply(x)

    g = Model(y_hat)

    for k,v in p.items():
        p[k] = v.astype('float32')

    g.set_parameter_values(p)

    a,t,v = get_dvc((128,128),trainning=False, shortcut=False)
    run = function([x], y_hat)

    def run_test(data):
        res = []
        for i in  data.get_epoch_iterator():
            res.extend(run(i[0]))
        return res

    def max_index(l):
        if l[0] > l[1]:
            return 0
        else:
            return 1

    def write_kaggle(f, l):
        f.write("id,label\n")
        for i,e in enumerate(l,start=1):
            f.write(str(i)+","+str(e)+"\n")

    def kaggle(file, data):
        write_kaggle(file,map(max_index, run_test(data)))

    def accuracy(data):
        res = []
        true = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
            true.extend(i[1])
        res = map(max_index, res)

        total = 0
        equal = 0
        for r,t in zip(res,true):
            total += 1
            equal += 1 if r == t else 0

        return equal/total

    print("Training accuracy: ", accuracy(a))
    print("Test accuracy: ", accuracy(v))
    kaggle_file = join(result_folder, name+".kaggle")
    print(kaggle_file)
    with open(kaggle_file, 'w') as f:
        kaggle(f, t)
    for p in cg.parameters:
      print(str(p), p.shape, p.dtype)

    print("Created ComputationGraph, inputs:");
    print(cg.inputs)
  
  # Strangely, all the examples use : DataStreamMonitoring in MainLoop

  model = Model(labels)
  print("Model.dict_of_inputs():");
  print(model.dict_of_inputs())
  print("Model list inputs:");
  print([ v.name for v in model.inputs])

  ## Model loading from saved file
  model.set_parameter_values(load_parameter_values(save_state_path))  

  examine_embedding(lookup.W.get_value())
    
  label_ner = model.get_theano_function()
  print(model.inputs)
  print("printed label_ner.params")

  for test_data in data_stream.get_epoch_iterator():
    ordered_batch = test_data[0:3]   # Explicitly strip off the pre-defined labels
    #print(ordered_batch)
    
    results = label_ner(*ordered_batch)
    #print(results)  # This is a pure array of labels
    
    inputs = _transpose(ordered_batch)
Example #19
#get the test stream
from fuel.datasets.dogs_vs_cats import DogsVsCats
from fuel.streams import DataStream, ServerDataStream
from fuel.schemes import ShuffledScheme, SequentialExampleScheme
from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, MaximumImageDimensions, Random2DRotation
from fuel.transformers import Flatten, Cast, ScaleAndShift
size = (128,128)
cats = DogsVsCats(('test',))
stream = DataStream.default_stream(cats, iteration_scheme=SequentialExampleScheme(cats.num_examples))
stream_upscale = MaximumImageDimensions(stream, size, which_sources=('image_features',))
stream_scale = ScaleAndShift(stream_upscale, 1./255, 0, which_sources=('image_features',))
stream_data = Cast(stream_scale, dtype='float32', which_sources=('image_features',))

#Load the parameters of the model
params = load_parameter_values('convnet_parameters.pkl')
mo = Model(predict)
mo.set_parameter_values(params)
#Create the forward propagation function
fprop = function(mo.inputs, mo.outputs[0], allow_input_downcast=True)
tab = []
i = 1
#Get the prediction for each example of the test set
for data in stream_data.get_epoch_iterator():
    predict = np.argmax(fprop(data))
    tab.append([i, predict])
    print(str(i) + "," + str(predict))
    i = i + 1
#Save predictions in a csv file
np.savetxt("dump.csv", tab, delimiter=",", fmt='%d')
Example #21
out2 = inception((None, None), 192, 64, 96, 128, 16, 32, 32, out, 10)
out3 = inception((None, None), 256, 128, 128, 192, 32, 96, 64, out2, 20)
out31 = MaxPooling((2, 2), name="poolLow").apply(out3)
out4 = inception((None, None), 480, 192, 96, 208, 16, 48, 64, out31, 30)
out5 = inception((None, None), 512, 160, 112, 224, 24, 64, 64, out4, 40)

out6 = inception((None, None), 512, 128, 128, 256, 24, 64, 64, out5, 50)
out7 = inception((None, None), 512, 112, 144, 288, 32, 64, 64, out6, 60)
out8 = inception((None, None), 528, 256, 160, 320, 32, 128, 128, out7, 70)
out81 = MaxPooling((20, 20), name="poolLow1").apply(out8)
out9 = inception((None, None), 832, 256, 160, 320, 32, 128, 128, out81, 80)
out10 = inception((None, None), 832, 384, 192, 384, 48, 128, 128, out9, 90)
out91 = AveragePooling((5, 5), name="poolLow2").apply(out10)

# Load the model
params = load_parameter_values("catsVsDogs_google_new_Ortho.pkl")
model1 = Model(out8)
model1.set_parameter_values(params)
cost = model1.outputs[0].sum() ** 2
# cost = (T.dot(model1.outputs[0].flatten().T,model1.outputs[0].flatten())**2).sum()
f_prop = theano.function(model1.inputs, model1.outputs[0])
grad = T.grad(cost, model1.inputs)
f_grad = theano.function(model1.inputs, grad)

######DEEP DREAM CODE #############
def preprocess(img):
    return np.float32(np.rollaxis(img, 2)[::-1]) - img.mean()


def deprocess(img):
    return np.dstack((img + img.mean())[::-1])
Example #22
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code,
                               level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream,
                              reverse_words,
                              add_sources=("targets", ))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(chars, chars_mask, targets,
                                   targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in parameters.items()],
                                   width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule(
                                        [StepClipping(10.0),
                                         Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                      name_regex="output")(cg.variables)
        (activations, ) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(
            abs(activations).mean(), "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        for name, parameter in parameters.items():
            observables.append(named_copy(parameter.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[parameter].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(bricks=[reverser.generator],
                                          name="outputs")(ComputationGraph(
                                              generated[1]))
                # NOTE: this will recompile beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search({chars: input_},
                                                    char2code['</S>'],
                                                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n"
                       if mode == "sample" else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
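The sampling branch of generate() above trims each output at the first end-of-sequence code before summing its per-step costs. A plain-Python restatement of that trimming step, with a made-up code table (2 stands in for char2code['</S>']):

eos = 2
outputs = [[5, 3, 2, 7, 7], [4, 4, 4]]     # one sequence with an eos, one without

trimmed = []
for seq in outputs:
    try:
        true_length = seq.index(eos) + 1   # keep the eos symbol itself
    except ValueError:
        true_length = len(seq)             # no eos generated: keep everything
    trimmed.append(seq[:true_length])

assert trimmed == [[5, 3, 2], [4, 4, 4]]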
Example #23
def run_visualizations(cost,
                       updates,
                       train_stream,
                       valid_stream,
                       args,
                       hidden_states=None,
                       gate_values=None):

    # Load the parameters from a dumped model
    assert args.load_path is not None

    param_values = load_parameter_values(args.load_path)
    if args.hide_all_except is not None:
        i = args.hide_all_except
        sdim = args.state_dim

        output_size = get_output_size(args.dataset)

        hidden = np.zeros((args.layers * sdim, output_size), dtype=np.float32)

        output_w = param_values["/output_layer.W"]

        hidden[i * sdim:(i + 1) * sdim, :] = output_w[i * sdim:(i + 1) *
                                                      sdim, :]

        param_values["/output_layer.W"] = hidden

    model = Model(cost)
    model.set_parameter_values(param_values)

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost, hidden_states, updates, train_stream,
                           valid_stream, args)

    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream, args)
        else:
            assert False

    elif args.visualize == "states":
        visualize_states(hidden_states, updates, train_stream, valid_stream,
                         args)

    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates, train_stream, valid_stream,
                            args)

    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates, train_stream, valid_stream,
                           args)

    elif args.visualize == "presoft":
        visualize_presoft(cost, hidden_states, updates, train_stream,
                          valid_stream, args)

    elif args.visualize == "matrices":
        visualize_matrices(args)

    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)

    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates, args)

    else:
        assert False
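The hide_all_except branch above zeroes every row of /output_layer.W except those belonging to layer i, so only that layer's states feed the softmax. A toy numpy sketch of the masking (sizes are illustrative):

import numpy as np

layers, sdim, output_size, i = 3, 4, 10, 1            # toy sizes, keep layer 1
output_w = np.random.randn(layers * sdim, output_size).astype(np.float32)

hidden = np.zeros_like(output_w)
hidden[i * sdim:(i + 1) * sdim, :] = output_w[i * sdim:(i + 1) * sdim, :]

assert np.allclose(hidden[:i * sdim], 0)              # rows of other layers are zeroed
assert np.allclose(hidden[(i + 1) * sdim:], 0)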
Example #26
def initialize_all(config, save_path, bokeh_name,
                   params, bokeh_server, bokeh, test_tag, use_load_ext,
                   load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(
            named_copy(gain_matrix.min(), 'min_gain'))
        primary_observables.append(
            named_copy(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output = VariableFilter(
        applications=[r.bottom.apply], name="output")(
            cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = named_copy(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight penalty '
                         'or weight decay is probably stupid')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(regularized_cg.outputs[0]
                                   ).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise')
        )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    # Model is a weird class; we spent lots of time arguing with Bart
    # about what it should be. However, it can already do nice things,
    # e.g. extract all the parameters from the computation graph and
    # give them hierarchical names. This helps to notice when, because
    # of some bug, a parameter is not in the computation graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # Please note: we cannot use recognizer.load_params,
        # as it builds a new computation graph that does not have
        # the shared variables added by adaptive weight noise.
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape) for key
                     in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
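        # For every parameter, record RMS-normalized norms of the weights,
        # gradients and steps, plus the step/gradient ratio (roughly the
        # effective per-parameter learning rate); logged as `<name>_stats`.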
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
            .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'),
         validation._record_name('weights_entropy_per_label')],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'),
         validation._record_name('weights_penalty_per_recording')]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(
                labels, cg, recognizer.generator.readout.emitter, data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
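
# A hypothetical wiring sketch (not part of the original example): the
# returned objects would typically be fed into a Blocks MainLoop; the
# 'train' stream name is an assumption, and the real driver code is not
# shown in this excerpt.
#     model, algorithm, data, extensions = initialize_all(
#         config, save_path, bokeh_name, params, bokeh_server, bokeh,
#         test_tag, use_load_ext, load_log, fast_start)
#     main_loop = MainLoop(model=model, algorithm=algorithm,
#                          data_stream=data.get_stream('train'),
#                          extensions=extensions)
#     main_loop.run()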
Exemple #27
0
def run_visualizations(cost, updates,
                       train_stream, valid_stream,
                       args,
                       hidden_states=None, gate_values=None):

    # Load the parameters from a dumped model
    assert args.load_path is not None
    model = Model(cost)
    model.set_parameter_values(load_parameter_values(args.load_path))

    # Run a visualization
    if args.visualize == "generate":
        visualize_generate(cost,
                           hidden_states, updates,
                           train_stream, valid_stream,
                           args)

    elif args.visualize == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream,
                                 args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream,
                                 args)
        else:
            assert False

    elif args.visualize == "states":
        visualize_states(hidden_states, updates,
                         train_stream, valid_stream,
                         args)

    elif args.visualize == "gradients":
        visualize_gradients(hidden_states, updates,
                            train_stream, valid_stream,
                            args)

    elif args.visualize == "jacobian":
        visualize_jacobian(hidden_states, updates,
                           train_stream, valid_stream,
                           args)

    elif args.visualize == "presoft":
        visualize_presoft(cost,
                          hidden_states, updates,
                          train_stream, valid_stream,
                          args)

    elif args.visualize == "matrices":
        visualize_matrices(args)

    elif args.visualize == "trained_singular_values":
        visualize_singular_values(args)

    elif args.visualize == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates,
                                     args)

    else:
        assert False
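
# A hypothetical command-line sketch (not part of the original example): it
# only shows argparse flags matching the attributes run_visualizations reads
# (load_path, visualize, rnn_type); building `cost`, `updates`, the streams
# and the recorded states is model-specific and omitted here.
import argparse

def build_visualization_parser():
    parser = argparse.ArgumentParser(description="RNN visualization sketch")
    parser.add_argument("--load_path", required=True,
                        help="path to the dumped model parameters")
    parser.add_argument("--rnn_type", choices=["lstm", "soft"], default="lstm")
    parser.add_argument("--visualize",
                        choices=["generate", "gates", "states", "gradients",
                                 "jacobian", "presoft", "matrices",
                                 "trained_singular_values",
                                 "gradients_flow_pie"],
                        default="states")
    return parser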
Exemple #28
0
########### GET THE DATA #####################
stream_train = ServerDataStream(('image_features', 'targets'), False,
                                port=5652, hwm=50)
stream_valid = ServerDataStream(('image_features', 'targets'), False,
                                port=5653, hwm=50)

########### DEFINE THE ALGORITHM #############
track_cost = TrackTheBest("cost", after_epoch=True, after_batch=False)
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=Momentum(learning_rate=0.0001,
                                               momentum=0.9))
extensions = [Timing(),
              FinishAfter(after_n_epochs=num_epochs),
              TrainingDataMonitoring([cost, error_rate,
                aggregation.mean(algorithm.total_gradient_norm)],
                prefix="train",
                after_epoch=True),
              DataStreamMonitoring([cost, error_rate, error_rate2],
                                   stream_valid, prefix="valid",
                                   after_epoch=True),
              Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True),
              ProgressBar(),
              Printing()]

# Adding a live plot with the Bokeh server
extensions.append(Plot(
    'CatsVsDogs160_GoogleNet_Reload2_l0001',
    channels=[['train_error_rate', 'valid_error_rate'],
              ['valid_cost', 'valid_error_rate2'],
              ['train_total_gradient_norm']], after_batch=True))

params = load_parameter_values('GoogleParameters.pkl')
model = Model(cost)
model.set_parameter_values(params)
main_loop = MainLoop(algorithm, data_stream=stream_train,
                     model=model, extensions=extensions)
main_loop.run()
Exemple #29
0
        for p in cg.parameters:
            print(str(p), p.shape, p.dtype)

        print("Created ComputationGraph, inputs:")
        print(cg.inputs)

    # Strangely, all the examples use DataStreamMonitoring in the MainLoop

    model = Model(labels)
    print("Model.dict_of_inputs():")
    print(model.dict_of_inputs())
    print("Model list inputs:")
    print([v.name for v in model.inputs])

    ## Model loading from saved file
    model.set_parameter_values(load_parameter_values(save_state_path))

    examine_embedding(lookup.W.get_value())

    label_ner = model.get_theano_function()
    print(model.inputs)
    print("printed label_ner.params")

    for test_data in data_stream.get_epoch_iterator():
        ordered_batch = test_data[
            0:3]  # Explicitly strip off the pre-defined labels
        #print(ordered_batch)

        results = label_ner(*ordered_batch)
        #print(results)  # This is a pure array of labels
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(string.digits) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.items()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

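        # Character frequencies and a row-normalized bigram transition
        # matrix of the generated sample (computed for inspection only;
        # they are not printed or returned).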
        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them

    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(
        features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(
        batch_cost,
        features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(
        batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
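    # Note: 2 ** (x / ln 2) == exp(x), so `ppl` below is the exponential of
    # the mean per-sequence cost, while `bits_per_char` converts the
    # per-character cost from nats to bits.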
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_parameters().items()],
                        width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost,
            parameters=parameters.values(),
            step_rule=CompositeRule([
                StepClipping(1000.),
                AdaDelta(epsilon=1e-8),
                # Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects),
            ]))
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)
        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm], 
            prefix="average",
            every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid",
            every_n_batches=1500, before_training=False,
            data_stream=valid_stream)
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path, ),
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(algorithm.total_step_norm)],
                      [average_monitoring.record_name(algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        lexicon_path = ('/data/lisatmp3/serdyuk/wsj_lms/lms/'
                        'wsj_trigram_with_initial_eos/lexicon.txt')
        with open(lexicon_path) as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
            words = [[char_to_ind.get(c, char_to_ind['<UNK>']) for c in w]
                     for w in raw_words]
        max_word_length = max(len(w) for w in words)
        
        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features, mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function([features, features_mask, initial_states], 
                                       [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)
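        # Lay out the lexicon as one column per word, padded to
        # max_word_length + 1; the mask marks each word's real characters.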

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_.get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i+1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

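                    # Renormalize the open-vocabulary character LM over the
                    # closed lexicon: the word probability becomes
                    # exp(-word_cost) / sum_w exp(-cost_w), so its negative
                    # log-probability is word_cost + log(sum(costs)).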
                    costs = numpy.exp(-compute_cost(
                        examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space, numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
                    print("Average cost", total_word_cost / num_words)
                    print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
Exemple #31
0
    TrainingDataMonitoring(
        [cost, error_rate,
         aggregation.mean(algorithm.total_gradient_norm)],
        prefix="train",
        after_epoch=True),
    DataStreamMonitoring([cost, error_rate, error_rate2],
                         stream_valid,
                         prefix="valid",
                         after_epoch=True),
    Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True),
    ProgressBar(),
    Printing()
]

# Adding a live plot with the Bokeh server
extensions.append(
    Plot('CatsVsDogs160_GoogleNet_Reload2_l0001',
         channels=[['train_error_rate', 'valid_error_rate'],
                   ['valid_cost', 'valid_error_rate2'],
                   ['train_total_gradient_norm']],
         after_batch=True))

params = load_parameter_values('GoogleParameters.pkl')
model = Model(cost)
model.set_parameter_values(params)
main_loop = MainLoop(algorithm,
                     data_stream=stream_train,
                     model=model,
                     extensions=extensions)
main_loop.run()
Exemple #32
0
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(named_copy(gain_matrix.min(), 'min_gain'))
        primary_observables.append(named_copy(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    bottom_output = VariableFilter(applications=[r.bottom.apply],
                                   name="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(applications=[
        r.generator.transition.apply
    ],
                                    name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0], "max_attended_length")
    max_num_phonemes = named_copy(labels.shape[0], "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(), "mean_attended")
    mean_bottom_output = named_copy(
        abs(bottom_output).mean(), "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2)

    train_cost = named_copy(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0)
                or (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with the alignment weight '
                         'penalty or weight decay is probably a bad idea')
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0], 'model_cost')
        model_prior_variance = named_copy(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]
        ]  # model prior variance

    # Model is a somewhat odd class; we spent a lot of time arguing with
    # Bart about what it should be. However, it can already do nice things,
    # e.g. extract all the parameters from the computation graph and give
    # them hierarchical names. This helps to notice when, because of some
    # bug, a parameter is missing from the computation graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # Please note: we cannot use recognizer.load_params,
        # as it builds a new computation graph that does not have
        # the shared variables added by adaptive weight noise.
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', 0) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items() if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([
            name for name, p in parameters.items() if p not in maxnorm_subjects
        ]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements**0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements**0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(before_first_epoch=True,
                                                     after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs']).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
Exemple #33
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
                name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile the beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object on every call if
                # speed is important for you.
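                # A minimal caching sketch (hypothetical attribute name
                # `_cached_beam_search`); it assumes `samples` refers to the
                # same Theano variable on every call, which holds here since
                # `generated` is built once outside this function:
                #     if not hasattr(generate, '_cached_beam_search'):
                #         generate._cached_beam_search = BeamSearch(samples)
                #     beam_search = generate._cached_beam_search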
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n" if mode == "sample"
                        else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
Exemple #34
0
def main(argv):
    name = argv[1]
    files = [join(folder, p) for p in listdir(folder)]

    file = next(f for f in files if name in f)
    print(file)

    p = load_parameter_values(file)

    net = net_dvc((128, 128))

    x = tensor.tensor4('image_features')
    y_hat = net.apply(x)

    g = Model(y_hat)

    for k, v in p.items():
        p[k] = v.astype('float32')

    g.set_parameter_values(p)

    a, t, v = get_dvc((128, 128), trainning=False, shortcut=False)
    run = function([x], y_hat)

    def run_test(data):
        res = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
        return res

    def max_index(l):
        if l[0] > l[1]:
            return 0
        else:
            return 1

    def write_kaggle(f, l):
        f.write("id,label\n")
        for i, e in enumerate(l, start=1):
            f.write(str(i) + "," + str(e) + "\n")

    def kaggle(file, data):
        write_kaggle(file, map(max_index, run_test(data)))

    def accuracy(data):
        res = []
        true = []
        for i in data.get_epoch_iterator():
            res.extend(run(i[0]))
            true.extend(i[1])
        res = map(max_index, res)

        total = 0
        equal = 0
        for r, t in zip(res, true):
            total += 1
            equal += 1 if r == t else 0

        return equal / total

    print("Training accuracy: ", accuracy(a))
    print("Test accuracy: ", accuracy(v))
    kaggle_file = join(result_folder, name + ".kaggle")
    print(kaggle_file)
    with open(kaggle_file, 'w') as f:
        kaggle(f, t)