Exemple #1
0
    def __init__(self, numpy_rng, theano_rng=None, 
                 cfg = None,
                 dnn_shared = None, shared_layers=[]):

        self.layers = []
        self.dropout_layers = []
        self.params = []
        self.delta_params   = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins; self.n_outs = cfg.n_outs if cfg.n_outs !=1 else 2
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)        
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout; self.pool_size = cfg.pool_size
        self.input_dropout_factor = cfg.input_dropout_factor; self.dropout_factor = cfg.dropout_factor

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x') 
        self.y = T.ivector('y')

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
                if self.input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(theano_rng, self.x, self.input_dropout_factor)
                else:
                    dropout_layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = (1 - self.dropout_factor[i - 1]) * self.layers[-1].output
                dropout_layer_input = self.dropout_layers[-1].dropout_output

            W = None; b = None
            if (i in shared_layers) :
                W = dnn_shared.layers[i].W; b = dnn_shared.layers[i].b

            if self.do_maxout == False:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                        input=dropout_layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        W = W, b = b,
                                        activation= self.activation,
                                        dropout_factor=self.dropout_factor[i])
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        activation= self.activation,
                                        W=dropout_layer.W, b=dropout_layer.b)
            else:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                        input=dropout_layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                        W = W, b = b,
                                        activation= (lambda x: 1.0*x),
                                        dropout_factor=self.dropout_factor[i],
                                        do_maxout = True, pool_size = self.pool_size)
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                        activation= (lambda x: 1.0*x),
                                        W=dropout_layer.W, b=dropout_layer.b,
                                        do_maxout = True, pool_size = self.pool_size)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            self.dropout_layers.append(dropout_layer)
            self.params.extend(dropout_layer.params)
            self.delta_params.extend(dropout_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.dropout_logLayer = LogisticRegression(
                                 input=self.dropout_layers[-1].dropout_output,
                                 n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs)

        self.logLayer = LogisticRegression(
                         input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
                         n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs,
                         W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)

        self.dropout_layers.append(self.dropout_logLayer)
        self.layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        # compute the cost
        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.n_layers):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
Exemple #2
0
    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        cfg=None,  # the network configuration
        dnn_shared=None,
        shared_layers=[],
        input=None,
    ):

        self.layers = []
        self.params = []
        self.delta_params = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs if cfg.n_outs != 1 else 2
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        self.non_updated_layers = cfg.non_updated_layers

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        if input == None:
            self.x = T.matrix("x")
        else:
            self.x = input
        self.y = T.ivector("y")

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = self.layers[-1].output

            W = None
            b = None
            if i in shared_layers:
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b
            if self.do_maxout == True:
                hidden_layer = HiddenLayer(
                    rng=numpy_rng,
                    input=layer_input,
                    n_in=input_size,
                    n_out=self.hidden_layers_sizes[i] * self.pool_size,
                    W=W,
                    b=b,
                    activation=(lambda x: 1.0 * x),
                    do_maxout=True,
                    pool_size=self.pool_size,
                )
            else:
                hidden_layer = HiddenLayer(
                    rng=numpy_rng,
                    input=layer_input,
                    n_in=input_size,
                    n_out=self.hidden_layers_sizes[i],
                    W=W,
                    b=b,
                    activation=self.activation,
                )
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            # if the layer index is included in self.non_updated_layers, parameters of this layer will not be updated
            if i not in self.non_updated_layers:
                self.params.extend(hidden_layer.params)
                self.delta_params.extend(hidden_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs
        )

        if self.n_outs > 0:
            self.layers.append(self.logLayer)
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
Exemple #3
0
class DNN_Dropout(DNN):

    def __init__(self, numpy_rng, theano_rng=None, 
                 cfg = None,
                 dnn_shared = None, shared_layers=[]):

        self.layers = []
        self.dropout_layers = []
        self.params = []
        self.delta_params   = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins; self.n_outs = cfg.n_outs if cfg.n_outs !=1 else 2
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)        
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout; self.pool_size = cfg.pool_size
        self.input_dropout_factor = cfg.input_dropout_factor; self.dropout_factor = cfg.dropout_factor

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x') 
        self.y = T.ivector('y')

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
                if self.input_dropout_factor > 0.0:
                    dropout_layer_input = _dropout_from_layer(theano_rng, self.x, self.input_dropout_factor)
                else:
                    dropout_layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = (1 - self.dropout_factor[i - 1]) * self.layers[-1].output
                dropout_layer_input = self.dropout_layers[-1].dropout_output

            W = None; b = None
            if (i in shared_layers) :
                W = dnn_shared.layers[i].W; b = dnn_shared.layers[i].b

            if self.do_maxout == False:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                        input=dropout_layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        W = W, b = b,
                                        activation= self.activation,
                                        dropout_factor=self.dropout_factor[i])
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        activation= self.activation,
                                        W=dropout_layer.W, b=dropout_layer.b)
            else:
                dropout_layer = DropoutHiddenLayer(rng=numpy_rng,
                                        input=dropout_layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                        W = W, b = b,
                                        activation= (lambda x: 1.0*x),
                                        dropout_factor=self.dropout_factor[i],
                                        do_maxout = True, pool_size = self.pool_size)
                hidden_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i] * self.pool_size,
                                        activation= (lambda x: 1.0*x),
                                        W=dropout_layer.W, b=dropout_layer.b,
                                        do_maxout = True, pool_size = self.pool_size)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            self.dropout_layers.append(dropout_layer)
            self.params.extend(dropout_layer.params)
            self.delta_params.extend(dropout_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.dropout_logLayer = LogisticRegression(
                                 input=self.dropout_layers[-1].dropout_output,
                                 n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs)

        self.logLayer = LogisticRegression(
                         input=(1 - self.dropout_factor[-1]) * self.layers[-1].output,
                         n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs,
                         W=self.dropout_logLayer.W, b=self.dropout_logLayer.b)

        self.dropout_layers.append(self.dropout_logLayer)
        self.layers.append(self.logLayer)
        self.params.extend(self.dropout_logLayer.params)
        self.delta_params.extend(self.dropout_logLayer.delta_params)

        # compute the cost
        self.finetune_cost = self.dropout_logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.n_layers):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam*learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
              theano.Param(momentum, default = 0.5)],
              outputs=self.errors,
              updates=updates,
              givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:
                                    (index + 1) * batch_size]})

        valid_fn = theano.function(inputs=[index],
              outputs=self.errors,
              givens={
                self.x: valid_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size:
                                    (index + 1) * batch_size]})

        return train_fn, valid_fn

    def write_model_to_raw(self, file_path):
        # output the model to tmp_path; this format is readable by PDNN
        _nnet2file(self.layers, filename=file_path, input_factor = self.input_dropout_factor, factor = self.dropout_factor)

    def write_model_to_kaldi(self, file_path, with_softmax = True):

        # determine whether it's BNF based on layer sizes
        output_layer_number = -1;
        for layer_index in range(1, self.hidden_layers_number - 1):
            cur_layer_size = self.hidden_layers_sizes[layer_index]
            prev_layer_size = self.hidden_layers_sizes[layer_index-1]
            next_layer_size = self.hidden_layers_sizes[layer_index+1]
            if cur_layer_size < prev_layer_size and cur_layer_size < next_layer_size:
                output_layer_number = layer_index+1; break

        layer_number = len(self.layers)
        if output_layer_number == -1:
            output_layer_number = layer_number

        fout = open(file_path, 'wb')
        for i in xrange(output_layer_number):
            # decide the dropout factor for this layer
            dropout_factor = 0.0
            if i == 0:
                dropout_factor = self.input_dropout_factor
            if i > 0 and len(self.dropout_factor) > 0:
                dropout_factor = self.dropout_factor[i-1]

            activation_text = '<' + self.cfg.activation_text + '>'
            if i == (layer_number-1) and with_softmax:   # we assume that the last layer is a softmax layer
                activation_text = '<softmax>'
            W_mat = (1.0 - dropout_factor) * self.layers[i].W.get_value()
            b_vec = self.layers[i].b.get_value()
            input_size, output_size = W_mat.shape
            W_layer = []; b_layer = ''
            for rowX in xrange(output_size):
                W_layer.append('')

            for x in xrange(input_size):
                for t in xrange(output_size):
                    W_layer[t] = W_layer[t] + str(W_mat[x][t]) + ' '

            for x in xrange(output_size):
                b_layer = b_layer + str(b_vec[x]) + ' '

            fout.write('<affinetransform> ' + str(output_size) + ' ' + str(input_size) + '\n')
            fout.write('[' + '\n')
            for x in xrange(output_size):
                fout.write(W_layer[x].strip() + '\n')
            fout.write(']' + '\n')
            fout.write('[ ' + b_layer.strip() + ' ]' + '\n')
            if activation_text == '<maxout>':
                fout.write(activation_text + ' ' + str(output_size/self.pool_size) + ' ' + str(output_size) + '\n')
            else:
                fout.write(activation_text + ' ' + str(output_size) + ' ' + str(output_size) + '\n')
        fout.close()
Exemple #4
0
class DNN(object):
    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        cfg=None,  # the network configuration
        dnn_shared=None,
        shared_layers=[],
        input=None,
    ):

        self.layers = []
        self.params = []
        self.delta_params = []

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs if cfg.n_outs != 1 else 2
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        self.non_updated_layers = cfg.non_updated_layers

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        if input == None:
            self.x = T.matrix("x")
        else:
            self.x = input
        self.y = T.ivector("y")

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = self.layers[-1].output

            W = None
            b = None
            if i in shared_layers:
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b
            if self.do_maxout == True:
                hidden_layer = HiddenLayer(
                    rng=numpy_rng,
                    input=layer_input,
                    n_in=input_size,
                    n_out=self.hidden_layers_sizes[i] * self.pool_size,
                    W=W,
                    b=b,
                    activation=(lambda x: 1.0 * x),
                    do_maxout=True,
                    pool_size=self.pool_size,
                )
            else:
                hidden_layer = HiddenLayer(
                    rng=numpy_rng,
                    input=layer_input,
                    n_in=input_size,
                    n_out=self.hidden_layers_sizes[i],
                    W=W,
                    b=b,
                    activation=self.activation,
                )
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            # if the layer index is included in self.non_updated_layers, parameters of this layer will not be updated
            if i not in self.non_updated_layers:
                self.params.extend(hidden_layer.params)
                self.delta_params.extend(hidden_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs
        )

        if self.n_outs > 0:
            self.layers.append(self.logLayer)
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()

    def predict(self, x_dataset):
        predict_fn = theano.function([], self.logLayer.y_pred, givens={self.x: x_dataset})
        predicted = predict_fn()
        return predicted

    def predict_p(self, x_dataset):
        predict_p_fn = theano.function([], self.logLayer.p_y_given_x, givens={self.x: x_dataset})
        predicted_p = predict_p_fn()
        return predicted_p

    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar("index")  # index to a [mini]batch
        learning_rate = T.fscalar("learning_rate")
        momentum = T.fscalar("momentum")

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.0001), theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={
                self.x: train_set_x[index * batch_size : (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size : (index + 1) * batch_size],
            },
        )

        valid_fn = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x: valid_set_x[index * batch_size : (index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size : (index + 1) * batch_size],
            },
        )

        return train_fn, valid_fn

    def build_extract_feat_function(self, output_layer):

        feat = T.matrix("feat")
        out_da = theano.function(
            [feat], self.layers[output_layer].output, updates=None, givens={self.x: feat}, on_unused_input="warn"
        )
        return out_da

    def transform(self, x):
        fn = self.build_extract_feat_function(self.hidden_layers_number - 1)
        new_x = fn(x)
        return new_x

    def build_finetune_functions_kaldi(self, train_shared_xy, valid_shared_xy):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        index = T.lscalar("index")  # index to a [mini]batch
        learning_rate = T.fscalar("learning_rate")
        momentum = T.fscalar("momentum")

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = collections.OrderedDict()
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * learning_rate
        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        if self.max_col_norm is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                if W in updates:
                    updated_W = updates[W]
                    col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                    desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                    updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

        train_fn = theano.function(
            inputs=[theano.Param(learning_rate, default=0.0001), theano.Param(momentum, default=0.5)],
            outputs=self.errors,
            updates=updates,
            givens={self.x: train_set_x, self.y: train_set_y},
        )

        valid_fn = theano.function(inputs=[], outputs=self.errors, givens={self.x: valid_set_x, self.y: valid_set_y})

        return train_fn, valid_fn

    def write_model_to_raw(self, file_path):
        # output the model to tmp_path; this format is readable by PDNN
        _nnet2file(self.layers, filename=file_path)

    def write_model_to_kaldi(self, file_path, with_softmax=True):
        # determine whether it's BNF based on layer sizes
        output_layer_number = -1
        for layer_index in range(1, self.hidden_layers_number - 1):
            cur_layer_size = self.hidden_layers_sizes[layer_index]
            prev_layer_size = self.hidden_layers_sizes[layer_index - 1]
            next_layer_size = self.hidden_layers_sizes[layer_index + 1]
            if cur_layer_size < prev_layer_size and cur_layer_size < next_layer_size:
                output_layer_number = layer_index + 1
                break

        layer_number = len(self.layers)
        if output_layer_number == -1:
            output_layer_number = layer_number

        fout = open(file_path, "wb")
        for i in xrange(output_layer_number):
            activation_text = "<" + self.cfg.activation_text + ">"
            if i == (layer_number - 1) and with_softmax:  # we assume that the last layer is a softmax layer
                activation_text = "<softmax>"
            W_mat = self.layers[i].W.get_value()
            b_vec = self.layers[i].b.get_value()
            input_size, output_size = W_mat.shape
            W_layer = []
            b_layer = ""
            for rowX in xrange(output_size):
                W_layer.append("")

            for x in xrange(input_size):
                for t in xrange(output_size):
                    W_layer[t] = W_layer[t] + str(W_mat[x][t]) + " "

            for x in xrange(output_size):
                b_layer = b_layer + str(b_vec[x]) + " "

            fout.write("<affinetransform> " + str(output_size) + " " + str(input_size) + "\n")
            fout.write("[" + "\n")
            for x in xrange(output_size):
                fout.write(W_layer[x].strip() + "\n")
            fout.write("]" + "\n")
            fout.write("[ " + b_layer.strip() + " ]" + "\n")
            if activation_text == "<maxout>":
                fout.write(activation_text + " " + str(output_size / self.pool_size) + " " + str(output_size) + "\n")
            else:
                fout.write(activation_text + " " + str(output_size) + " " + str(output_size) + "\n")
        fout.close()

    def finetuning(self, train_xy, valid_xy):
        # get the training, validation and testing function for the model
        log("> ... getting the finetuning functions")
        train_shared_xy = shared_dataset(train_xy, borrow=True)
        valid_shared_xy = shared_dataset(valid_xy, borrow=True)
        train_fn, valid_fn = self.build_finetune_functions(
            train_shared_xy=train_shared_xy, valid_shared_xy=valid_shared_xy, batch_size=self.cfg.batch_size
        )

        log("> ... finetuning the model")
        while self.cfg.lrate.get_rate() != 0:
            # one epoch of sgd training
            train_error = train_sgd_without_streaming(train_fn, train_xy[0].shape[0], self.cfg)
            log("> epoch %d, training error %f " % (self.cfg.lrate.epoch, 100 * numpy.mean(train_error)) + "(%)")
            # validation
            valid_error = validate_by_minibatch_without_streaming(valid_fn, valid_xy[0].shape[0], self.cfg)
            log(
                "> epoch %d, lrate %f, validation error %f "
                % (self.cfg.lrate.epoch, self.cfg.lrate.get_rate(), 100 * numpy.mean(valid_error))
                + "(%)"
            )
            self.cfg.lrate.get_next_rate(current_error=100 * numpy.mean(valid_error))
        log("> ... finetuning finished")
        # output nnet parameters and lrate, for training resume
        #                if self.cfg.lrate.epoch % self.cfg.model_save_step == 0:
        #                    _nnet2file(dnn.layers, filename=wdir + '/nnet.tmp')
        #                   _lrate2file(cfg.lrate, wdir + '/training_state.tmp')
        """
            # save the model and network configuration
            if self.cfg.param_output_file != '':
                _nnet2file(self.layers, filename=self.cfg.param_output_file, input_factor = self.cfg.input_dropout_factor, factor = self.cfg.dropout_factor)
                log('> ... the final PDNN model parameter is ' + self.cfg.param_output_file)
            if self.cfg.cfg_output_file != '':
                _cfg2file(self.cfg, filename=self.cfg.cfg_output_file)
                log('> ... the final PDNN model config is ' + self.cfg.cfg_output_file)

            # output the model into Kaldi-compatible format
            if self.cfg.kaldi_output_file != '':
                self.write_model_to_kaldi(self.cfg.kaldi_output_file)
                log('> ... the final Kaldi model is ' + self.cfg.kaldi_output_file) 

            # remove the tmp files (which have been generated from resuming training) 
#            os.remove(wdir + '/nnet.tmp')
#            os.remove(wdir + '/training_state.tmp')         
        
            """

    def load_pretrain_from_Sda(self, sda_xy=None):

        for i in xrange(len(self.cfg.hidden_layers_sizes)):
            layer = self.layers[i]
            sda_xy_layer = sda_xy.dA_layers[i]
            if layer.type == "fc":
                W_xy = sda_xy_layer.W.get_value()
                W_dnn = layer.W.get_value()
                x_size, y_size = W_dnn.shape
                b_size = layer.b.get_value().shape[0]
                layer.W.set_value(W_xy[:x_size, :y_size])
                layer.b.set_value(sda_xy_layer.b.get_value()[:b_size])
            elif layer.type == "conv":
                pass