def __init__(self,
             prenet_layers,
             model_target_dim,
             attn_dim,
             location_filters,
             location_kernel_size,
             num_rnn_cells,
             rnn_cell_size,
             dropout,
             max_decoder_steps):
    """Record the hyper-parameters and instantiate the decoder sub-layers.

    Args:
        prenet_layers: forwarded to ``Prenet`` as ``prenet_layers``.
        model_target_dim: ``hidden_dim`` of the acoustic projection.
        attn_dim: forwarded to ``LocationSensitiveAttention``.
        location_filters: forwarded to ``LocationSensitiveAttention``.
        location_kernel_size: forwarded to ``LocationSensitiveAttention``.
        num_rnn_cells: how many ``LSTMCell`` objects are created.
        rnn_cell_size: ``units`` of every ``LSTMCell``.
        dropout: forwarded to ``Prenet`` as ``dropout``.
        max_decoder_steps: only stored on the instance here.
    """
    super().__init__()

    # Keep every constructor argument available on the instance.
    self.prenet_layers = prenet_layers
    self.model_target_dim = model_target_dim
    self.attn_dim = attn_dim
    self.location_filters = location_filters
    self.location_kernel_size = location_kernel_size
    self.num_rnn_cells = num_rnn_cells
    self.rnn_cell_size = rnn_cell_size
    self.dropout = dropout
    self.max_decoder_steps = max_decoder_steps

    # Sub-layers are created in the same order as before.
    self.prenet = Prenet(prenet_layers=prenet_layers, dropout=dropout)
    self.attention = LocationSensitiveAttention(
        attn_dim=attn_dim,
        location_filters=location_filters,
        location_kernel_size=location_kernel_size,
    )

    # One LSTM cell per requested recurrent layer.
    cells = []
    for _ in range(num_rnn_cells):
        cells.append(
            tf.keras.layers.LSTMCell(
                units=rnn_cell_size,
                use_bias=True,
                kernel_initializer="glorot_uniform",
            )
        )
    self.decoder = cells

    # Output heads: one of width model_target_dim, one scalar stop token.
    self.acoustic_projection = Linear(hidden_dim=model_target_dim, bias=True)
    self.stop_token_projection = Linear(hidden_dim=1, bias=True)
def __init__(self, attn_dim, location_filters, location_kernel_size):
    """Set up the projections used by the location-sensitive attention.

    Args:
        attn_dim: ``hidden_dim`` of the query and memory projections; also
            forwarded to ``LocationLayer``.
        location_filters: forwarded to ``LocationLayer``.
        location_kernel_size: forwarded to ``LocationLayer``.
    """
    super().__init__()

    self.attn_dim = attn_dim
    self.location_filters = location_filters
    self.location_kernel_size = location_kernel_size

    # Query and memory share the same projection configuration.
    projection_kwargs = {"hidden_dim": attn_dim, "bias": True}
    self.query_layer = Linear(**projection_kwargs)
    self.memory_layer = Linear(**projection_kwargs)

    self.location_layer = LocationLayer(
        attn_dim=attn_dim,
        location_filters=location_filters,
        location_kernel_size=location_kernel_size,
    )

    # Final projection down to a single value per position.
    self.v = Linear(hidden_dim=1, bias=True)

    # Negative infinity, stored for use as the score mask value.
    self.score_mask_value = float("-inf")
def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
    """Build the three linear maps and the non-linearity of the RNN cell.

    Args:
        n_inputs: size of the input layer.
        n_hidden: size of the hidden layer.
        n_output: size of the output layer.
        activation: ``'sigmoid'`` or ``'tanh'``.

    Raises:
        Exception: if ``activation`` is neither ``'sigmoid'`` nor ``'tanh'``.
    """
    super().__init__()

    self.n_inputs = n_inputs  # size of the input layer
    self.n_hidden = n_hidden  # size of the hidden layer
    self.n_output = n_output  # size of the output layer

    if activation == 'sigmoid':
        self.activation = Sigmoid()
    elif activation == 'tanh':
        # Fixed: this used `==` (a comparison), so the activation was never
        # assigned when 'tanh' was requested.
        self.activation = Tanh()
    else:
        raise Exception("Non-linearity not found")

    self.w_ih = Linear(n_inputs, n_hidden)  # input layer -> hidden layer
    self.w_hh = Linear(n_hidden, n_hidden)  # hidden layer -> hidden layer
    self.w_ho = Linear(n_hidden, n_output)  # hidden layer -> output layer

    # Collect the sub-layer parameters on this layer.
    # NOTE(review): assumes super().__init__() creates `self.parameters` — confirm.
    self.parameters += self.w_ih.get_parameters()
    self.parameters += self.w_hh.get_parameters()
    self.parameters += self.w_ho.get_parameters()
def __init__(self, attn_dim, location_filters, location_kernel_size):
    """Create the convolution and dense projection of the location layer.

    Args:
        attn_dim: ``hidden_dim`` of the dense projection.
        location_filters: ``filters`` of the 1-D convolution.
        location_kernel_size: ``kernel_size`` of the 1-D convolution.
    """
    super().__init__()

    self.attn_dim = attn_dim
    self.location_filters = location_filters
    self.location_kernel_size = location_kernel_size

    conv_kwargs = {
        "filters": location_filters,
        "kernel_size": location_kernel_size,
        "bias": True,
    }
    self.location_conv = Conv1D(**conv_kwargs)
    self.location_dense = Linear(hidden_dim=attn_dim, bias=True)
def get_linear_logit(features,
                     feature_columns,
                     use_bias=False,
                     init_std=0.0001,
                     seed=1024,
                     prefix='linear',
                     l2_reg=0):
    """Build the linear logit from sparse and/or dense feature inputs.

    The inputs come from ``input_from_feature_columns``. The ``Linear`` layer
    mode depends on which input lists are non-empty: ``mode=2`` for both,
    ``mode=0`` for sparse only, ``mode=1`` for dense only.

    Raises:
        NotImplementedError: if both input lists are empty.
    """
    sparse_embs, dense_inputs = input_from_feature_columns(
        features, feature_columns, 1, l2_reg, init_std, seed, prefix=prefix)

    has_sparse = len(sparse_embs) > 0
    has_dense = len(dense_inputs) > 0

    if not (has_sparse or has_dense):
        raise NotImplementedError

    if has_sparse and has_dense:
        sparse_input = concat_fun(sparse_embs)
        dense_input = concat_fun(dense_inputs)
        combined = Linear(l2_reg, mode=2, use_bias=use_bias)
        return combined([sparse_input, dense_input])

    if has_sparse:
        # Only sparse features are present.
        sparse_input = concat_fun(sparse_embs)
        sparse_only = Linear(l2_reg, mode=0, use_bias=use_bias)
        return sparse_only(sparse_input)

    # Only dense features are present.
    dense_input = concat_fun(dense_inputs)
    dense_only = Linear(l2_reg, mode=1, use_bias=use_bias)
    return dense_only(dense_input)
class RNNCell(Layer):
    """Simple recurrent cell built from three ``Linear`` layers.

    Each step combines the projected input with the projected previous hidden
    state, applies the chosen non-linearity to get the new hidden state, and
    projects that hidden state to the output.
    """

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        """Build the three linear maps and the non-linearity.

        Args:
            n_inputs: size of the input layer.
            n_hidden: size of the hidden layer.
            n_output: size of the output layer.
            activation: ``'sigmoid'`` or ``'tanh'``.

        Raises:
            Exception: if ``activation`` is neither ``'sigmoid'`` nor
                ``'tanh'``.
        """
        super().__init__()

        self.n_inputs = n_inputs  # size of the input layer
        self.n_hidden = n_hidden  # size of the hidden layer
        self.n_output = n_output  # size of the output layer

        if activation == 'sigmoid':
            self.activation = Sigmoid()
        elif activation == 'tanh':
            # Fixed: this used `==` (a comparison), so the activation was
            # never assigned when 'tanh' was requested.
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")

        self.w_ih = Linear(n_inputs, n_hidden)  # input layer -> hidden layer
        self.w_hh = Linear(n_hidden, n_hidden)  # hidden layer -> hidden layer
        self.w_ho = Linear(n_hidden, n_output)  # hidden layer -> output layer

        # Collect the sub-layer parameters on this layer.
        # NOTE(review): assumes Layer.__init__ creates `self.parameters` — confirm.
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        """Run one recurrent step.

        Args:
            input: input for the current step.
            hidden: hidden state from the previous step.

        Returns:
            Tuple ``(output, new_hidden)``.
        """
        # Project the previous hidden state for the new step.
        from_prev_hidden = self.w_hh.forward(hidden)
        # Combine the projected input with the projected previous hidden state.
        combined = self.w_ih.forward(input) + from_prev_hidden
        # New hidden state, passed on to the next step (the network's memory).
        new_hidden = self.activation.forward(combined)
        # Output of this step, computed from the new hidden state.
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero hidden state of shape ``(batch_size, n_hidden)``."""
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
def train(epochs, batch_size, lr, verbose):
    """Train and evaluate a small fully connected network.

    Generates a training and a test set with ``generate_data``, normalizes
    them, and builds a ``Sequential`` model of five ``Linear`` layers
    (2 -> 25 -> 25 -> 25 -> 25 -> 2). The model is trained with ``MSE`` loss
    and ``SGD`` using manual forward/backward calls.

    Args:
        epochs: number of passes over the training set.
        batch_size: number of samples per mini-batch. The last batch of a
            pass may be smaller when the sample count is not a multiple of
            ``batch_size``.
        lr: learning rate passed to ``SGD``.
        verbose: when truthy, print dataset info, losses and accuracies.

    Returns:
        Tuple ``(train_losses, test_losses, train_accuracies,
        test_accuracies, train_errors, test_errors)``. Losses are recorded
        once per mini-batch; accuracies and errors once per epoch.
    """
    # Autograd is switched off globally; gradients come from the explicit
    # backward() calls below.
    torch.set_grad_enabled(False)

    # Generate training and testing datasets.
    train_data, train_label = generate_data()
    test_data, test_label = generate_data()

    # Normalize the data to be centered at 0.
    train_data, test_data = normalize(train_data, test_data)

    if verbose:
        print("--- Dataset ---")
        print("Train X: ", train_data.size(), " | Train y: ",
              train_label.size())
        print(" Test X: ", test_data.size(), " | Test y: ",
              test_label.size())

    # Input layer (2 input units).
    linear1 = Linear(2, 25, bias=True, weight_init=xavier_uniform)
    # 3 hidden layers (each 25 units).
    linear2 = Linear(25, 25, bias=True, weight_init=xavier_uniform)
    linear3 = Linear(25, 25, bias=True, weight_init=xavier_uniform)
    linear4 = Linear(25, 25, bias=True, weight_init=xavier_uniform)
    # Output layer (2 output units).
    linear5 = Linear(25, 2, bias=True, weight_init=xavier_uniform)

    layers = [
        linear1, Relu(),
        linear2, Relu(),
        linear3, Relu(),
        linear4, Tanh(),
        linear5,
    ]
    model = Sequential(layers)

    if verbose:
        print("Number of model parameters: {}".format(
            sum(len(p) for p in model.param())))

    criterion = MSE()
    optimizer = SGD(model, lr=lr)

    train_losses, test_losses = [], []
    train_accuracies, test_accuracies = [], []
    train_errors, test_errors = [], []

    n_train = train_data.size(0)
    n_test = test_data.size(0)

    if verbose:
        print("--- Training ---")

    for epoch in range(1, epochs + 1):
        if verbose:
            print("Epoch: {}".format(epoch))

        # TRAINING
        for batch_idx in range(0, n_train, batch_size):
            # Clamp the length so the final batch never reaches past the end
            # of the tensor (narrow() raises when start + length > size).
            length = min(batch_size, n_train - batch_idx)
            output = model.forward(train_data.narrow(0, batch_idx, length))

            # Calculate loss.
            loss = criterion.forward(
                output, train_label.narrow(0, batch_idx, length))
            train_losses.append(loss)
            if verbose:
                print("Train Loss: {:.2f}".format(loss.item()))

            # Reset the accumulated gradients of weights and biases.
            optimizer.zero_grad()

            # Backpropagation: gradient of the loss, then of the model.
            loss_grad = criterion.backward()
            model.backward(loss_grad)

            # Update parameters.
            optimizer.step()

        train_prediction = model.forward(train_data)
        acc = accuracy(train_prediction, train_label)
        train_accuracies.append(acc)
        train_errors.append(1 - acc)
        if verbose:
            print("Train Accuracy: {:.2f}".format(acc.item()))

        # EVALUATION
        for batch_idx in range(0, n_test, batch_size):
            # Same clamping as in the training loop.
            length = min(batch_size, n_test - batch_idx)
            output = model.forward(test_data.narrow(0, batch_idx, length))

            # Calculate loss.
            loss = criterion.forward(
                output, test_label.narrow(0, batch_idx, length))
            test_losses.append(loss)
            if verbose:
                print("Test Loss: {:.2f}".format(loss.item()))

        test_prediction = model.forward(test_data)
        acc = accuracy(test_prediction, test_label)
        test_accuracies.append(acc)
        test_errors.append(1 - acc)
        if verbose:
            print("Test Accuracy: {:.2f}".format(acc.item()))

    return (train_losses, test_losses, train_accuracies, test_accuracies,
            train_errors, test_errors)
def __init__(self, config):
    """Validate ``config`` and create the layers shared by the models.

    Args:
        config: mapping that must pass ``self.check()``; the keys read here
            are listed in ``self.check_list``.

    Raises:
        AssertionError: if ``self.check()`` returns a falsy value.
    """
    self.model = None
    self.check_list = {
        'text_maxlen', 'sentence_maxnum', 'sentence_maxlen', 'hidden_size',
        'delimiter', 'pad_word', 'unk_word', 'start_sent', 'end_sent',
        'vocab_size', 'embed_size', 'learning_rate',
    }
    self.config = config
    assert self.check(), 'parametre check failed'

    cfg = self.config
    self.size = cfg['hidden_size']

    self.Emb = Embedding(cfg['vocab_size'],
                         cfg['embed_size'],
                         trainable=True)

    # The two split layers differ only in `cut_head` and their name.
    split_common = {
        'delimiter': cfg['delimiter'],
        'output_sentence_len': cfg['sentence_maxlen'],
        'output_sentence_num': cfg['sentence_maxnum'],
        'pad_word': cfg['pad_word'],
    }
    self.Splitlayer_keephead = SplitLayer(cut_head=False,
                                          name='Split_Layer_keep_head',
                                          **split_common)
    self.Splitlayer_cuthead = SplitLayer(cut_head=True,
                                         name='Split_Layer_cut_head',
                                         **split_common)

    # Reshape to (sentence_maxnum * sentence_maxlen,).
    flat_shape = (cfg['sentence_maxnum'] * cfg['sentence_maxlen'], )
    self.Sentence_reshape1D = Reshape(flat_shape, name='Sentence_reshape1D')

    # Reshape to (sentence_maxnum, sentence_maxlen, embed_size).
    nested_shape = (
        cfg['sentence_maxnum'],
        cfg['sentence_maxlen'],
        cfg['embed_size'],
    )
    self.Sentence_reshape2D = Reshape(nested_shape,
                                      name='Sentence_reshape2D')

    # Recurrent encoder and the decoder cell, both of width self.size.
    self.Encoder_word = CuDNNLSTM(units=self.size,
                                  name='Encoder_word',
                                  return_state=True)
    self.Decoder_word_cell = LSTMCell(units=self.size,
                                      name='Decoder_word_cell')

    self.AttentionMapper = Linear(output_size=self.size,
                                  bias=True,
                                  bias_start=0.0,
                                  activation='tanh')
    # shape : [attention_vec_size]
    self.Join = Dense(units=1, use_bias=False, name='Join')
    self.Exp = Lambda(lambda x: K.exp(x), name='Exp')
    self.Calcprob = Dense(units=cfg['vocab_size'],
                          activation='softmax',
                          name='Calcprob')
    self.ArgMax = Lambda(lambda x: K.argmax(x, axis=-1), dtype='int32')
    self.Printer = Lambda(lambda x: K.tf.Print(x, [x]))
    self.Identical = Lambda(lambda x: x, name='Identical')

    # Placeholders that start out unset.
    self.EncoderModel = None
    self.DecoderModel_onestep = None
    self._mask = None
    self._targets = None

    self.optim = optimizers.SGD(config['learning_rate'])
def attention_decoder(self,
                      decoder_inputs,
                      initial_state,
                      encoder_states,
                      enc_padding_mask,
                      Cell,
                      initial_state_attention=False,
                      pointer_gen=True,
                      use_coverage=False,
                      prev_coverage=None):
    """Unroll an attention decoder over ``decoder_inputs``.

    Args:
        decoder_inputs: a list of 2D tensors [batch_size x input_size].
        initial_state: list/tuple ``(state_h, state_c)`` used to initialize
            the decoder state.
        encoder_states: tensor [batch_size, attn_length, attn_size].
        enc_padding_mask: 2D tensor [batch_size x attn_length] of 1s and 0s
            marking encoder positions as real token (1) or padding (0).
        Cell: recurrent cell layer; it is called with
            ``[input, state_h, state_c]`` and its ``output_dim`` is read.
        initial_state_attention: each decoder input is passed through a
            linear layer together with the previous step's context vector.
            If False, the "previous context vector" of the first step is a
            zero vector. If True, ``initial_state`` is used to (re)calculate
            it. Set to False for train/eval (one call for all decoder steps)
            and True for decode mode (one call per decoder step).
        pointer_gen: if True, compute the generation probability p_gen at
            every decoder step.
        use_coverage: if True, use the coverage mechanism.
        prev_coverage: if not None, a tensor of shape
            (batch_size, attn_length) holding the previous step's coverage
            vector. Only not None in decode mode when using coverage.

    Returns:
        Tuple ``(outputs, [state_h, state_c], attn_dists, p_gens,
        coverage_ret)``: the per-step outputs, the final cell state, the
        per-step attention distributions, the per-step p_gen values (empty
        when ``pointer_gen`` is False), and the coverage reshaped to
        (batch_size, -1), or None if coverage was never set.
    """
    # NOTE: how a Keras CuDNNLSTM layer picks its initial state:
    #     if isinstance(inputs, list):
    #         initial_state = inputs[1:]
    #         inputs = inputs[0]
    #     elif initial_state is not None:
    #         pass
    #     elif self.stateful:
    #         initial_state = self.states
    #     else:
    #         initial_state = self.get_initial_state(inputs)
    attn_size = K.int_shape(encoder_states)[2]
    input_size = K.int_shape(decoder_inputs[0])[1]

    # Insert a singleton axis at position 2 so the encoder states can be fed
    # to a Conv2D:
    # now encoder_states.shape = (batch_size, attn_length, 1, attention_vec_size)
    encoder_states = Lambda(lambda x: K.expand_dims(x, axis=2))(
        encoder_states)

    attention_vec_size = attn_size
    # W_h = [filter_height, filter_width, in_channels, out_channels]
    W_h_shape = (1, 1, attn_size, attention_vec_size)
    Encoder_Feature_Extractor = Conv2D(kernel_size=(W_h_shape[0],
                                                    W_h_shape[1]),
                                       filters=W_h_shape[3],
                                       padding="same",
                                       data_format="channels_last")
    # Stands in for: nn_ops.conv2d(encoder_states, W_h, [1, 1, 1, 1], "SAME")
    # shape (batch_size, attn_length, 1, attention_vec_size)
    encoder_features = Encoder_Feature_Extractor(encoder_states)

    if use_coverage:
        # 1x1 convolution applied to the coverage vector inside attention().
        w_c = (1, 1, 1, attention_vec_size)
        Coverage_Feature_Extractor = Conv2D(kernel_size=(w_c[0], w_c[1]),
                                            filters=w_c[3],
                                            padding="same",
                                            data_format="channels_last")

    if prev_coverage is not None:
        # Append two trailing singleton axes to the incoming coverage.
        expand_2_3 = Lambda(
            lambda x: K.expand_dims(K.expand_dims(x, 2), 3))
        prev_coverage = expand_2_3(prev_coverage)

    # v: shared vector, attention_vec_size-dim -> 1-dim
    # shape : [attention_vec_size]
    V = Dense(1, use_bias=False, kernel_initializer='glorot_uniform')

    # Lambda wrappers around helpers defined elsewhere in the project.
    Attn_Dist_and_Encoder_States_to_Context_Vector = Lambda(
        lambda X: attn_dist_and_encoder_states_to_context_vector(
            X, attn_size))
    Masked_Attention = Lambda(
        lambda x: masked_attention(x, enc_padding_mask))
    Features_Adder = Lambda(lambda x: sum_and_tanh(x))

    # Small shape-manipulation layers reused at every step.
    Squeezer_3_2 = Lambda(
        lambda x: K.squeeze(K.squeeze(x, axis=3), axis=2))
    Expand_Dim_2_2 = Lambda(
        lambda x: K.expand_dims(K.expand_dims(x, 2), 2))

    # The linear layer used in attention(...): transforms decoder_state into
    # decoder_features.
    Attention_Linear_layer = Linear(attention_vec_size, True)
    Decoder_Input_to_Cell_Input = Linear(input_size, True)
    Calculate_pgen_Linear_layer = Linear(1, True, activation='sigmoid')
    AttnOutputProjection_Linear_layer = Linear(Cell.output_dim, True)
    Expand_1_1 = Lambda(
        lambda x: K.expand_dims(K.expand_dims(x, axis=1), axis=1))

    def attention(decoder_state, coverage=None):
        """Compute the context vector and attention distribution.

        Args:
            decoder_state: state of the decoder.
            coverage: optional previous-timestep coverage vector, shape
                (batch_size, attn_len, 1, 1).

        Returns:
            Tuple ``(context_vector, attn_dist, coverage)``: the weighted
            sum of encoder_states, the attention distribution, and the new
            coverage vector of shape (batch_size, attn_len, 1, 1).
        """
        # shape (batch_size, attention_vec_size)
        decoder_features = Attention_Linear_layer(decoder_state)
        # reshape to (batch_size, 1, 1, attention_vec_size)
        decoder_features = Expand_1_1(decoder_features)

        if use_coverage and coverage is not None:
            # Include the coverage features in the attention score.
            coverage_features = Coverage_Feature_Extractor(coverage)
            added_features = Features_Adder(
                [encoder_features, decoder_features, coverage_features])
            # e: shape (batch_size, attn_length)
            e = Squeezer_3_2(V(added_features))

            # Calculate attention distribution.
            attn_dist = Masked_Attention(e)

            # Update the coverage vector by adding this step's attention.
            coverage = Lambda(lambda x: x[0] + Reshape((-1, 1, 1))(x[1]))(
                [coverage, attn_dist])
        else:
            added_features = Features_Adder(
                [encoder_features, decoder_features])
            e = Squeezer_3_2(V(added_features))
            attn_dist = Masked_Attention(e)

            if use_coverage:
                # First step of training: initialize coverage.
                coverage = Expand_Dim_2_2(attn_dist)

        # Stands in for the original TF computation:
        #   context_vector = math_ops.reduce_sum(
        #       array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        #       * encoder_states, [1, 2])  # shape (batch_size, attn_size)
        #   context_vector = array_ops.reshape(context_vector, [-1, attn_size])
        context_vector = Attn_Dist_and_Encoder_States_to_Context_Vector(
            [attn_dist, encoder_states])

        return context_vector, attn_dist, coverage

    # #### END OF ATTENTION ####

    # Return values:
    outputs = []
    attn_dists = []
    p_gens = []

    # initial_state is a list/tuple.
    state_h, state_c = initial_state[0], initial_state[1]

    # Initialize coverage to None or whatever was passed in.
    coverage_ret = prev_coverage

    # Zero-initialized context vector, produced through a Lambda so that it
    # is a tensor usable by backend operations.
    context_vector_ret = Lambda(
        lambda x: K.zeros(shape=(self._batch_size, attn_size)))([])

    if initial_state_attention:
        # Re-calculate the context vector from the previous step so that it
        # can be passed through a linear layer with this step's input.
        # In decode mode, this is what updates the coverage vector.
        context_vector_ret, _, coverage_ret = attention(
            initial_state, coverage_ret)
    # Otherwise, context_vector and coverage keep their initial values.

    for i, inp in enumerate(decoder_inputs):
        # Merge the input with the previous context vector.
        transformed_inp = Decoder_Input_to_Cell_Input(
            [inp, context_vector_ret])

        # Run the decoder cell for one step.
        cell_output, state_h, state_c = Cell(
            [transformed_inp, state_h, state_c])

        if i == 0 and initial_state_attention:
            # Always true in decode mode; don't allow coverage to update.
            context_vector_ret, attn_dist_ret, _ = attention(
                [state_h, state_c], coverage_ret)
        else:
            context_vector_ret, attn_dist_ret, coverage_ret = attention(
                [state_h, state_c], coverage_ret)
        attn_dists.append(attn_dist_ret)

        if pointer_gen:
            # Generation probability from context, state and cell input.
            p_gen = Calculate_pgen_Linear_layer(
                [context_vector_ret, state_h, state_c, transformed_inp])
            p_gens.append(p_gen)

        # Project the cell output and the context vector to the step output.
        output = AttnOutputProjection_Linear_layer(
            [cell_output, context_vector_ret])
        outputs.append(output)

    print('finished adding attention_decoder for each time step!')

    if coverage_ret is not None:
        # Reshape the coverage to (batch_size, -1).
        coverage_ret = Lambda(
            lambda x: K.reshape(x, [self._batch_size, -1]))(coverage_ret)

    return outputs, [state_h, state_c], attn_dists, p_gens, coverage_ret
def __init__(self, n_inputs, n_hidden, n_output):
    """Create the linear maps of the LSTM cell and register their parameters.

    Args:
        n_inputs: size of the input layer.
        n_hidden: size of the hidden layer.
        n_output: size of the output layer.
    """
    super().__init__()

    self.n_inputs = n_inputs  # size of the input layer
    self.n_hidden = n_hidden  # size of the hidden layer
    self.n_output = n_output  # size of the output layer

    # Input -> hidden maps (created with the default bias setting).
    self.xf = Linear(n_inputs, n_hidden)
    self.xi = Linear(n_inputs, n_hidden)
    self.xo = Linear(n_inputs, n_hidden)
    self.xc = Linear(n_inputs, n_hidden)

    # Hidden -> hidden maps, without bias.
    self.hf = Linear(n_hidden, n_hidden, bias=False)
    self.hi = Linear(n_hidden, n_hidden, bias=False)
    self.ho = Linear(n_hidden, n_hidden, bias=False)
    self.hc = Linear(n_hidden, n_hidden, bias=False)

    # Hidden -> output map, without bias.
    self.w_ho = Linear(n_hidden, n_output, bias=False)

    # Register the parameters in the same order the layers were created.
    sublayers = (
        self.xf, self.xi, self.xo, self.xc,
        self.hf, self.hi, self.ho, self.hc,
        self.w_ho,
    )
    for sublayer in sublayers:
        self.parameters += sublayer.get_parameters()
class LSTMCell(Layer):
    """LSTM cell assembled from ``Linear`` layers.

    The hidden state is a pair ``(h, c)``: the short-term memory and the
    long-term (cell) memory of the network.
    """

    def __init__(self, n_inputs, n_hidden, n_output):
        """Create the linear maps and register their parameters.

        Args:
            n_inputs: size of the input layer.
            n_hidden: size of the hidden layer.
            n_output: size of the output layer.
        """
        super().__init__()

        self.n_inputs = n_inputs  # size of the input layer
        self.n_hidden = n_hidden  # size of the hidden layer
        self.n_output = n_output  # size of the output layer

        # Input -> hidden maps (created with the default bias setting).
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)

        # Hidden -> hidden maps, without bias.
        self.hf = Linear(n_hidden, n_hidden, bias=False)
        self.hi = Linear(n_hidden, n_hidden, bias=False)
        self.ho = Linear(n_hidden, n_hidden, bias=False)
        self.hc = Linear(n_hidden, n_hidden, bias=False)

        # Hidden -> output map, without bias.
        self.w_ho = Linear(n_hidden, n_output, bias=False)

        # Register the parameters in the same order the layers were created.
        sublayers = (
            self.xf, self.xi, self.xo, self.xc,
            self.hf, self.hi, self.ho, self.hc,
            self.w_ho,
        )
        for sublayer in sublayers:
            self.parameters += sublayer.get_parameters()

    def forward(self, input, hidden):
        """Run one LSTM step.

        Args:
            input: input for the current step.
            hidden: pair ``(h, c)`` from the previous step.

        Returns:
            Tuple ``(output, (h, c))`` with the step output and new state.
        """
        h_prev = hidden[0]  # short-term memory of the network
        c_prev = hidden[1]  # long-term memory of the network

        # Forget gate: how much of the old cell state to keep; the sigmoid
        # squashes it into [0, 1].
        forget_gate = (self.xf.forward(input)
                       + self.hf.forward(h_prev)).sigmoid()

        # Input gate: how much new information to store, also in [0, 1].
        input_gate = (self.xi.forward(input)
                      + self.hi.forward(h_prev)).sigmoid()

        # Candidate values that may be added to the cell state.
        candidate = (self.xc.forward(input)
                     + self.hc.forward(h_prev)).tanh()

        # New cell state: forget part of the old one, add the gated candidate.
        c_next = (forget_gate * c_prev) + (input_gate * candidate)

        # Output gate: what share of the state to expose, in [0, 1].
        output_gate = (self.xo.forward(input)
                       + self.ho.forward(h_prev)).sigmoid()

        # New hidden state: the cell state squashed to [-1, 1], then gated.
        h_next = output_gate * c_next.tanh()

        output = self.w_ho.forward(h_next)
        return output, (h_next, c_next)

    def init_hidden(self, batch_size=1):
        """Return the initial ``(h, c)`` pair.

        Both tensors have shape ``(batch_size, n_hidden)`` and are zero
        except for column 0, which is set to 1.
        """
        state_shape = (batch_size, self.n_hidden)
        h0 = Tensor(np.zeros(state_shape), autograd=True)
        c0 = Tensor(np.zeros(state_shape), autograd=True)
        h0.data[:, 0] += 1
        c0.data[:, 0] += 1
        return (h0, c0)
def __init__(self):
    """Instantiate the layers of the network.

    Creates one ``conv2d(1, 1, 3)`` layer, one ``Linear`` layer mapping
    14 * 14 inputs to 10 outputs, a ``max_pool2d(2, 2)`` layer, and the
    ``relu`` and ``sigmoid`` activations.
    """
    flat_features = 14 * 14  # input width of the fully connected layer
    n_outputs = 10           # output width of the fully connected layer

    # Layers are created in the same order as before.
    self.conv1 = conv2d(1, 1, 3)
    self.fc1 = Linear(flat_features, n_outputs)

    self.max_pool = max_pool2d(2, 2)

    self.relu = relu()
    self.sigmoid = sigmoid()