def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg,
             hidden_layer_type, output_type='LINEAR', network_type='S2S',
             ed_type='HED', dropout_rate=0.0, optimizer='sgd',
             MLU_div_lengths=[], loss_function='MMSE',
             rnn_batch_training=False):
    """Initialise a neural network and build its symbolic cost.

    :param n_in: dimensionality of the input features
    :type n_in: integer
    :param hidden_layer_size: the layer size of each hidden layer
    :type hidden_layer_size: list of integers
    :param n_out: dimensionality of the output features
    :type n_out: integer
    :param L1_reg: the L1 regularisation weight (stored on the instance)
    :param L2_reg: the L2 regularisation weight (stored on the instance)
    :param hidden_layer_type: the type of each hidden layer, e.g. TANH,
        LSTM, GRU, BLSTM; must have as many entries as
        ``hidden_layer_size``
    :param output_type: the activation type of the output layer
        (case-insensitive): 'LINEAR' (default, linear regression),
        'RECURRENT' or one of ``self.list_of_activations``.  Ignored when
        the last hidden layer is a decoder variant.
    :param network_type: when equal to "S2S" the symbolic inputs
        ``self.d`` (int vector) and ``self.f`` (matrix) are created
    :param ed_type: how the output of an encoder layer is fed to the next
        layer: "VED" or "HED"; any other value feeds it through unchanged
    :param dropout_rate: probability of dropout, a float between 0 and 1
    :param optimizer: stored on the instance; not used in this method
    :param MLU_div_lengths: column boundaries into ``self.f``; in "HED"
        mode the k-th encoder uses columns
        ``MLU_div_lengths[k]:MLU_div_lengths[k + 1]``.  Only read here,
        never mutated.
    :param loss_function: 'CCE', 'Hinge' or 'MMSE'; for any other value
        ``self.finetune_cost`` and ``self.errors`` are not set
    :param rnn_batch_training: if true, ``self.x`` / ``self.y`` are
        3-D tensors instead of matrices and the flag is forwarded to the
        recurrent layers

    Logs a critical message and calls ``sys.exit(1)`` when a hidden or
    output layer type is not supported.
    """
    logger = logging.getLogger("DNN initialization")

    self.n_in = int(n_in)
    self.n_out = int(n_out)
    self.n_layers = len(hidden_layer_size)

    self.dropout_rate = dropout_rate
    self.optimizer = optimizer
    self.loss_function = loss_function
    self.is_train = T.iscalar('is_train')
    self.rnn_batch_training = rnn_batch_training

    assert len(hidden_layer_size) == len(hidden_layer_type)

    self.list_of_activations = [
        'TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU'
    ]

    BLSTM_variants = ['BLSTM', 'BSLSTM', 'BLSTME', 'BSLSTME']
    Encoder_variants = ['RNNE', 'LSTME', 'BLSTME', 'SLSTME', 'TANHE']
    Decoder_variants = ['RNND', 'LSTMD', 'SLSTMD']

    if self.rnn_batch_training:
        self.x = T.tensor3('x')
        self.y = T.tensor3('y')
    else:
        self.x = T.matrix('x')
        self.y = T.matrix('y')

    if network_type == "S2S":
        self.d = T.ivector('d')
        self.f = T.matrix('f')

    self.L1_reg = L1_reg
    self.L2_reg = L2_reg

    self.rnn_layers = []
    self.params = []
    self.delta_params = []

    rng = np.random.RandomState(123)

    # Running state of the hierarchical encoder-decoder ("HED") mode.
    prev_seg_end = 0
    encoder_count = 0
    MLU_div = MLU_div_lengths

    # Keyword arguments shared by the layer constructors below.
    dropout_args = dict(p=self.dropout_rate, training=self.is_train)
    recurrent_args = dict(dropout_args,
                          rnn_batch_training=self.rnn_batch_training)

    for i in range(self.n_layers):
        layer_type = hidden_layer_type[i]
        layer_size = hidden_layer_size[i]

        if i == 0:
            # Use the int-cast size so that a float/str n_in cannot leak
            # into the layer constructors.
            input_size = self.n_in
            layer_input = self.x
        else:
            input_size = hidden_layer_size[i - 1]
            if hidden_layer_type[i - 1] in BLSTM_variants:
                # Bidirectional layers expose twice their layer size.
                input_size = hidden_layer_size[i - 1] * 2
            layer_input = self.rnn_layers[i - 1].output

        ### sequence-to-sequence mapping ###
        # Only a real previous layer can be an encoder.  Without the
        # ``i > 0`` guard, ``hidden_layer_type[i - 1]`` is
        # ``hidden_layer_type[-1]`` for the first layer, i.e. the type of
        # the LAST layer.
        if i > 0 and hidden_layer_type[i - 1] in Encoder_variants:
            dur_input = self.d
            frame_feat_input = self.f

            # vanilla encoder-decoder (phone-level features)
            if ed_type == "VED":
                seq2seq_model = DistributedSequenceEncoder(
                    rng, layer_input, dur_input)
                layer_input = T.concatenate(
                    (seq2seq_model.encoded_output, frame_feat_input),
                    axis=1)
                # NOTE(review): 4 is presumably the number of columns of
                # self.f in VED mode -- confirm against the caller.
                input_size = input_size + 4
            # hierarchical encoder-decoder
            elif ed_type == "HED":
                seg_len = layer_input.size // input_size
                seg_dur_input = dur_input[prev_seg_end:
                                          prev_seg_end + seg_len]
                num_of_segs = T.sum(seg_dur_input)
                seq2seq_model = DistributedSequenceEncoder(
                    rng, layer_input, seg_dur_input)
                feat_begin = MLU_div[encoder_count]
                feat_end = MLU_div[encoder_count + 1]
                addfeat_input = frame_feat_input[0:num_of_segs,
                                                 feat_begin:feat_end]
                layer_input = T.concatenate(
                    (seq2seq_model.encoded_output, addfeat_input), axis=1)
                input_size = input_size + (feat_end - feat_begin)
                prev_seg_end = prev_seg_end + seg_len
                encoder_count = encoder_count + 1

        # hidden layer activation
        if layer_type in self.list_of_activations:
            hidden_layer = GeneralLayer(
                rng, layer_input, input_size, layer_size,
                activation=layer_type.lower(), **dropout_args)
        elif layer_type == 'TANHE' or layer_type == 'SIGMOIDE':
            # Strip the trailing 'E' to obtain the activation name.
            hidden_layer = GeneralLayer(
                rng, layer_input, input_size, layer_size,
                activation=layer_type[0:-1].lower(), **dropout_args)
        elif layer_type == 'TANH_LHUC':
            hidden_layer = SigmoidLayer_LHUC(
                rng, layer_input, input_size, layer_size,
                activation=T.tanh, **dropout_args)
        elif layer_type == 'SLSTM' or layer_type == 'SLSTME':
            hidden_layer = SimplifiedLstm(
                rng, layer_input, input_size, layer_size,
                **recurrent_args)
        elif layer_type == 'SLSTMD':
            hidden_layer = SimplifiedLstmDecoder(
                rng, layer_input, input_size, layer_size, self.n_out,
                **recurrent_args)
        elif layer_type == 'SGRU':
            hidden_layer = SimplifiedGRU(
                rng, layer_input, input_size, layer_size,
                **recurrent_args)
        elif layer_type == 'GRU':
            hidden_layer = GatedRecurrentUnit(
                rng, layer_input, input_size, layer_size,
                **recurrent_args)
        elif layer_type == 'LSTM' or layer_type == 'LSTME':
            hidden_layer = VanillaLstm(
                rng, layer_input, input_size, layer_size,
                **recurrent_args)
        elif layer_type == 'LSTMD':
            hidden_layer = VanillaLstmDecoder(
                rng, layer_input, input_size, layer_size, self.n_out,
                **recurrent_args)
        elif layer_type == 'BSLSTM' or layer_type == 'BSLSTME':
            hidden_layer = BidirectionSLstm(
                rng, layer_input, input_size, layer_size, layer_size,
                **recurrent_args)
        elif layer_type == 'BLSTM' or layer_type == 'BLSTME':
            hidden_layer = BidirectionLstm(
                rng, layer_input, input_size, layer_size, layer_size,
                **recurrent_args)
        elif layer_type == 'RNN' or layer_type == 'RNNE':
            hidden_layer = VanillaRNN(
                rng, layer_input, input_size, layer_size,
                **recurrent_args)
        elif layer_type == 'RNND':
            hidden_layer = VanillaRNNDecoder(
                rng, layer_input, input_size, layer_size, self.n_out,
                **recurrent_args)
        elif layer_type == 'LSTM_LHUC':
            hidden_layer = VanillaLstm_LHUC(
                rng, layer_input, input_size, layer_size,
                **recurrent_args)
        else:
            logger.critical(
                "This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n"
                % (layer_type))
            sys.exit(1)

        self.rnn_layers.append(hidden_layer)
        self.params.extend(hidden_layer.params)

    input_size = hidden_layer_size[-1]
    if hidden_layer_type[-1] in BLSTM_variants:
        input_size = hidden_layer_size[-1] * 2

    if hidden_layer_type[-1] in Decoder_variants:
        # A decoder layer produces the network output itself; its
        # parameters were already collected inside the loop above.
        self.final_layer = self.rnn_layers[-1]
    else:
        last_output = self.rnn_layers[-1].output
        output_activation = output_type.lower()
        if output_activation == 'linear':
            self.final_layer = LinearLayer(
                rng, last_output, input_size, self.n_out)
        elif output_activation == 'recurrent':
            self.final_layer = RecurrentOutputLayer(
                rng, last_output, input_size, self.n_out,
                rnn_batch_training=self.rnn_batch_training)
        elif output_type.upper() in self.list_of_activations:
            self.final_layer = GeneralLayer(
                rng, last_output, input_size, self.n_out,
                activation=output_activation)
        else:
            logger.critical(
                "This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, BSLSTM\n"
                % (output_type))
            sys.exit(1)

        self.params.extend(self.final_layer.params)

    # One zero-initialised shared variable per parameter, with the same
    # shape as the parameter.
    self.updates = {}
    for param in self.params:
        self.updates[param] = theano.shared(
            value=np.zeros(param.get_value(borrow=True).shape,
                           dtype=theano.config.floatX),
            name='updates')

    if self.loss_function == 'CCE':
        self.finetune_cost = self.categorical_crossentropy_loss(
            self.final_layer.output, self.y)
        self.errors = self.categorical_crossentropy_loss(
            self.final_layer.output, self.y)
    elif self.loss_function == 'Hinge':
        self.finetune_cost = self.multiclass_hinge_loss(
            self.final_layer.output, self.y)
        self.errors = self.multiclass_hinge_loss(self.final_layer.output,
                                                 self.y)
    elif self.loss_function == 'MMSE':
        if self.rnn_batch_training:
            # Flatten to rows of n_out values and keep only the rows whose
            # target has at least one non-zero entry.
            self.y_mod = T.reshape(self.y, (-1, self.n_out))
            self.final_layer_output = T.reshape(self.final_layer.output,
                                                (-1, self.n_out))

            nonzero_rows = T.any(self.y_mod, 1).nonzero()

            self.y_mod = self.y_mod[nonzero_rows]
            self.final_layer_output = self.final_layer_output[nonzero_rows]

            prediction, target = self.final_layer_output, self.y_mod
        else:
            prediction, target = self.final_layer.output, self.y

        # Mean over rows of the per-row summed squared error; the same
        # expression serves as training cost and as reported error.
        squared_error = T.mean(T.sum((prediction - target)**2, axis=1))
        self.finetune_cost = squared_error
        self.errors = squared_error
def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg,
             hidden_layer_type, output_type='LINEAR', network_type='DNN',
             dropout_rate=0.0, loss_function='CCE'):
    """Initialise a neural network and build its symbolic cost.

    :param n_in: dimensionality of the input features
    :type n_in: integer
    :param hidden_layer_size: the layer size of each hidden layer
    :type hidden_layer_size: list of integers
    :param n_out: dimensionality of the output features
    :type n_out: integer
    :param L1_reg: the L1 regularisation weight (stored on the instance)
    :param L2_reg: the L2 regularisation weight (stored on the instance)
    :param hidden_layer_type: the type of each hidden layer, e.g. TANH,
        LSTM, GRU, BLSTM; must have as many entries as
        ``hidden_layer_size``
    :param output_type: the activation type of the output layer, exactly
        one of 'LINEAR' (default, linear regression), 'SOFTMAX' or
        'SIGMOID'.  Ignored when the last hidden layer is a decoder
        variant.
    :param network_type: when equal to "S2S" the symbolic int vector
        ``self.d`` is created
    :param dropout_rate: probability of dropout, a float between 0 and 1
    :param loss_function: 'CCE', 'Hinge' or 'MMSE'; for any other value
        ``self.finetune_cost`` and ``self.errors`` are not set

    Logs a critical message and calls ``sys.exit(1)`` when a hidden or
    output layer type is not supported.
    """
    logger = logging.getLogger("DNN initialization")

    self.n_in = int(n_in)
    self.n_out = int(n_out)
    self.n_layers = len(hidden_layer_size)

    self.dropout_rate = dropout_rate
    self.loss_function = loss_function
    self.is_train = T.iscalar('is_train')

    assert len(hidden_layer_size) == len(hidden_layer_type)

    self.x = T.matrix('x')
    self.y = T.matrix('y')

    if network_type == "S2S":
        self.d = T.ivector('d')

    self.L1_reg = L1_reg
    self.L2_reg = L2_reg

    self.rnn_layers = []
    self.params = []
    self.delta_params = []

    rng = np.random.RandomState(123)

    # NOTE(review): 'BLSTME' and 'TANHE' are listed as encoders but have
    # no constructor branch below, so they end in the "not supported"
    # error -- confirm whether they should be handled.
    Encoder_variants = ['RNNE', 'LSTME', 'BLSTME', 'SLSTME', 'TANHE']
    Decoder_variants = ['RNND', 'LSTMD', 'SLSTMD']

    # Keyword arguments shared by most layer constructors below.
    dropout_args = dict(p=self.dropout_rate, training=self.is_train)

    # ``range`` instead of ``xrange`` so the loop also runs on Python 3.
    for i in range(self.n_layers):
        layer_type = hidden_layer_type[i]
        layer_size = hidden_layer_size[i]

        if i == 0:
            # Use the int-cast size so that a float/str n_in cannot leak
            # into the layer constructors.
            input_size = self.n_in
            layer_input = self.x
        else:
            prev_type = hidden_layer_type[i - 1]
            input_size = hidden_layer_size[i - 1]
            layer_input = self.rnn_layers[i - 1].output
            if prev_type == 'BSLSTM' or prev_type == 'BLSTM':
                # Bidirectional layers expose twice their layer size.
                input_size = hidden_layer_size[i - 1] * 2

        # Only a real previous layer can be an encoder.  Without the
        # ``i > 0`` guard, ``hidden_layer_type[i - 1]`` is
        # ``hidden_layer_type[-1]`` for the first layer, i.e. the type of
        # the LAST layer.
        if i > 0 and hidden_layer_type[i - 1] in Encoder_variants:
            # Index of the last position of each run of durations.
            dur_input = T.extra_ops.cumsum(self.d) - 1
            layer_input = layer_input[dur_input]
            seq2seq_model = DistributedSequenceEncoder(
                rng, layer_input, self.d)
            layer_input = seq2seq_model.encoded_output

        if layer_type == 'SLSTM' or layer_type == 'SLSTME':
            hidden_layer = SimplifiedLstm(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'SGRU':
            hidden_layer = SimplifiedGRU(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'GRU':
            hidden_layer = GatedRecurrentUnit(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'LSTM_NFG':
            hidden_layer = LstmNFG(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'LSTM_NOG':
            hidden_layer = LstmNOG(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'LSTM_NIG':
            hidden_layer = LstmNIG(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'LSTM_NPH':
            hidden_layer = LstmNoPeepholes(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'LSTM' or layer_type == 'LSTME':
            hidden_layer = VanillaLstm(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'LSTMD':
            hidden_layer = VanillaLstmDecoder(
                rng, layer_input, input_size, layer_size, self.n_out,
                **dropout_args)
        elif layer_type == 'BSLSTM':
            hidden_layer = BidirectionSLstm(
                rng, layer_input, input_size, layer_size, layer_size,
                **dropout_args)
        elif layer_type == 'BLSTM':
            hidden_layer = BidirectionLstm(
                rng, layer_input, input_size, layer_size, layer_size,
                **dropout_args)
        elif layer_type == 'RNN' or layer_type == 'RNNE':
            # The second test used to repeat 'RNN', which left the 'RNNE'
            # encoder type from Encoder_variants without a constructor.
            hidden_layer = VanillaRNN(
                rng, layer_input, input_size, layer_size, **dropout_args)
        elif layer_type == 'RNND':
            hidden_layer = VanillaRNNDecoder(
                rng, layer_input, self.y, input_size, layer_size,
                self.n_out, **dropout_args)
        elif layer_type == 'TANH':
            hidden_layer = SigmoidLayer(
                rng, layer_input, input_size, layer_size,
                activation=T.tanh, **dropout_args)
        elif layer_type == 'SIGMOID':
            hidden_layer = SigmoidLayer(
                rng, layer_input, input_size, layer_size,
                activation=T.nnet.sigmoid, **dropout_args)
        elif layer_type == 'SOFTMAX':
            # NOTE(review): the layer is built one unit wider than
            # hidden_layer_size[i], while the next layer's input_size is
            # taken from hidden_layer_size[i] -- confirm this is intended.
            hidden_layer = SoftmaxLayer(
                rng, layer_input, input_size, layer_size + 1)
        else:
            logger.critical(
                "This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n"
                % (layer_type))
            sys.exit(1)

        self.rnn_layers.append(hidden_layer)
        self.params.extend(hidden_layer.params)

    input_size = hidden_layer_size[-1]
    if hidden_layer_type[-1] == 'BSLSTM' or hidden_layer_type[-1] == 'BLSTM':
        input_size = hidden_layer_size[-1] * 2

    if hidden_layer_type[-1] in Decoder_variants:
        # A decoder layer produces the network output itself; its
        # parameters were already collected inside the loop above.
        self.final_layer = self.rnn_layers[-1]
    else:
        last_output = self.rnn_layers[-1].output
        if output_type == 'LINEAR':
            self.final_layer = LinearLayer(
                rng, last_output, input_size, self.n_out)
        elif output_type == 'SOFTMAX':
            self.final_layer = SoftmaxLayer(
                rng, last_output, input_size, self.n_out)
        elif output_type == 'SIGMOID':
            self.final_layer = SigmoidLayer(
                rng, last_output, input_size, self.n_out,
                activation=T.nnet.sigmoid)
        else:
            logger.critical(
                "This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, BSLSTM\n"
                % (output_type))
            sys.exit(1)

        self.params.extend(self.final_layer.params)

    # One zero-initialised shared variable per parameter, with the same
    # shape as the parameter.
    self.updates = {}
    for param in self.params:
        self.updates[param] = theano.shared(
            value=np.zeros(param.get_value(borrow=True).shape,
                           dtype=theano.config.floatX),
            name='updates')

    if self.loss_function == 'CCE':
        self.finetune_cost = self.categorical_crossentropy_loss(
            self.final_layer.output, self.y)
        self.errors = self.categorical_crossentropy_loss(
            self.final_layer.output, self.y)
    elif self.loss_function == 'Hinge':
        self.finetune_cost = self.multiclass_hinge_loss(
            self.final_layer.output, self.y)
        self.errors = self.multiclass_hinge_loss(self.final_layer.output,
                                                 self.y)
    elif self.loss_function == 'MMSE':
        # Mean over rows of the per-row summed squared error; the same
        # expression serves as training cost and as reported error.
        squared_error = T.mean(
            T.sum((self.final_layer.output - self.y)**2, axis=1))
        self.finetune_cost = squared_error
        self.errors = squared_error