def __call__(self, htA, HO, transform_flag=True):
    """
    Build an attention-weighted summary of the states in ``HO``, scored against ``htA``.

    :param htA: expression concatenated with every (optionally transformed) state
        before scoring
    :param HO: indexable sequence of state expressions; each one is reshaped to
        ``(1, 2 * self.dim_opi)`` below
    :param transform_flag: determine if the model needs selective transformation,
    :return: tuple of (summary expression reshaped to ``(2 * self.dim_opi,)``,
        numpy array holding the softmax-normalised weights)
    """
    seq_len = len(HO)
    transformed_states = []
    raw_scores = []
    for position in range(seq_len):
        state = HO[position]
        if transform_flag:
            # original state plus a rectified affine map of (htA, state)
            state_hat = state + dy.rectify(self.W_A * htA + self.W_O * state + self.b)
        else:
            state_hat = state
        joint = dy.concatenate([htA, state_hat])
        raw_scores.append(dy.tanh(dy.dot_product(self.W_concat, joint)))
        transformed_states.append(state_hat)

    row_shape = (1, 2 * self.dim_opi)
    stacked_states = dy.concatenate(
        [dy.reshape(state_hat, d=row_shape) for state_hat in transformed_states])
    # length: seq_len
    attention = dy.softmax(dy.concatenate(raw_scores))
    # npvalue() forces a forward pass so the weights can be returned as numbers
    attention_np = attention.npvalue()
    summary = dy.reshape(attention, (1, seq_len)) * stacked_states
    return dy.reshape(summary, (2 * self.dim_opi,)), attention_np
def transduce(self, embed_sent):
    """
    Run a sentence through the first, hidden and last bias-convolutions.

    The input tensor is zero-padded on both sides along dimension 1 before
    the first convolution; every hidden convolution output is reshaped to
    ``(self.internal_dim, sent_len, 1)`` and passed through the configured
    non-linearity.

    :param embed_sent: object exposing ``as_tensor()``; the tensor's second
        dimension is read as the sentence length
    :return: ``ExpressionSequence`` wrapping the output of the last
        convolution; ``self._final_states`` is set from its last element
    """

    def apply_non_linearity(expr):
        # Compare by value: identity (`is`) against a string literal only works
        # by accident of interning and warns on Python 3.8+.
        kind = self.non_linearity
        if kind == 'tanh':
            return dy.tanh(expr)
        if kind == 'relu':
            return dy.rectify(expr)
        if kind == 'sigmoid':
            return dy.logistic(expr)
        # 'linear' (and, as before, any unrecognised value) is the identity
        return expr

    src = embed_sent.as_tensor()
    sent_len = src.dim()[0][1]
    batch_size = src.dim()[1]
    # Floor division: the pad width is used as a tensor dimension and must be
    # an int on Python 3 as well.
    pad_size = (self.window_receptor - 1) // 2  # TODO adapt it also for even window size

    src = dy.concatenate([
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
        src,
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
    ], d=1)
    padded_sent_len = sent_len + 2 * pad_size

    conv1 = dy.parameter(self.pConv1)
    bias1 = dy.parameter(self.pBias1)
    src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1), batch_size=batch_size)
    hidden_shape = (self.internal_dim, sent_len, 1)

    cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])
    hidden_layer = apply_non_linearity(
        dy.reshape(cnn_layer1, hidden_shape, batch_size=batch_size))

    for conv_hid, bias_hid in self.builder_layers:
        hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid),
                                      dy.parameter(bias_hid), stride=[1, 1])
        hidden_layer = apply_non_linearity(
            dy.reshape(hidden_layer, hidden_shape, batch_size=batch_size))

    last_conv = dy.parameter(self.last_conv)
    last_bias = dy.parameter(self.last_bias)
    output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
    output = dy.reshape(output, (sent_len, self.output_dim), batch_size=batch_size)

    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def calc_loss(self, src, db_idx, src_mask=None, trg_mask=None):
    """
    Compute a hinge loss between encoded sources and the database entries at ``db_idx``.

    :param src: source batch, embedded with ``self.src_embedder`` and encoded
        with ``self.src_encoder``
    :param db_idx: indices into ``self.database``; its length is used as the
        number of examples
    :param src_mask: mask forwarded to the source embedder
    :param trg_mask: ignored - it is overwritten by the mask stored in the database
    :return: loss expression
    :raises RuntimeError: if ``self.loss_direction`` is neither "forward" nor
        "bidirectional"
    """
    embedded_src = self.src_embedder.embed_sent(src, mask=src_mask)
    self.src_encoder.set_input(src)
    src_encodings = self.exprseq_pooling(self.src_encoder.transduce(embedded_src))

    trg_batch, trg_mask = self.database[db_idx]
    trg_encodings = self.encode_trg_example(trg_batch, mask=trg_mask)
    trg_shape, trg_batch_size = trg_encodings.dim()
    # fold the batch dimension into the columns of a single matrix
    trg_matrix = dy.reshape(trg_encodings, (trg_shape[0], trg_batch_size))

    scores = dy.transpose(src_encodings) * trg_matrix
    num_examples = len(db_idx)
    gold = list(range(num_examples))

    if self.loss_direction == "forward":
        return dy.sum_batches(dy.hinge_batch(dy.transpose(scores), gold))
    if self.loss_direction == "bidirectional":
        square = dy.reshape(scores, (num_examples, num_examples))
        return dy.sum_elems(
            dy.hinge_dim(square, gold, d=0) + dy.hinge_dim(square, gold, d=1))
    raise RuntimeError("Illegal loss direction {}".format(self.loss_direction))
def decode_loss(self, src_encodings, tgt_seqs):
    """
    Sum the arc and label negative log-likelihoods, divided by the batch size.

    :param src_encodings: passed unchanged to ``self.cal_scores``
    :param tgt_seqs: (tgt_heads, tgt_labels): list (length=batch_size) of (src_len)
    :return: loss expression
    """
    # todo(NOTE): Sentences should start with empty token (as root of dependency tree)!
    gold_heads, gold_labels = tgt_seqs
    batch_size = len(gold_heads)
    src_len = len(gold_heads[0])
    folded_batch = src_len * batch_size

    flat_heads = np.array(gold_heads).flatten()  # (src_len * batch_size)
    flat_labels = np.array(gold_labels).flatten()

    # (src_len, src_len, bs), ([(src_len, src_len, bs)])
    s_arc, s_label = self.cal_scores(src_encodings)
    arc_values = s_arc.npvalue()
    # (src_len * batch_size)
    predicted_heads = np.argmax(arc_values, axis=0).transpose().flatten()

    picked_per_label = []
    for label_score in s_label:
        per_token = dy.reshape(label_score, (src_len,), batch_size=folded_batch)
        picked_per_label.append(dy.pick_batch(per_token, predicted_heads))
    # n_labels, src_len * batch_size
    label_scores = dy.concatenate(picked_per_label, d=0)

    arc_scores = dy.reshape(s_arc, (src_len,), batch_size=folded_batch)
    arc_loss = dy.pickneglogsoftmax_batch(arc_scores, flat_heads)
    label_loss = dy.pickneglogsoftmax_batch(label_scores, flat_labels)
    return dy.sum_batches(arc_loss + label_loss) / batch_size
def flatten_triple(action_scores, location_scores, argument_scores):
    """ Flattens three scores vectors by summing over all possibilities.

    Returns a ``(num_actions * num_locations * num_arguments, 1)`` expression
    holding the sum of one score from each input for every combination.
    """
    n_actions = action_scores.dim()[0][0]
    n_locations = location_scores.dim()[0][0]
    n_arguments = argument_scores.dim()[0][0]

    # Broadcast by multiplying with vectors of ones; both grids are
    # (n_arguments, n_locations).
    argument_grid = dy.reshape(argument_scores, (n_arguments, 1)) \
        * dy.ones((1, n_locations))
    location_grid = dy.ones((n_arguments, 1)) \
        * dy.reshape(location_scores, (1, n_locations))
    pair_scores = location_grid + argument_grid

    # (n_locations * n_arguments, n_actions)
    pair_grid = dy.reshape(pair_scores, (n_locations * n_arguments, 1)) \
        * dy.ones((1, n_actions))
    action_grid = dy.ones((n_arguments * n_locations, 1)) \
        * dy.reshape(action_scores, (1, n_actions))
    triple_scores = pair_grid + action_grid

    return dy.reshape(triple_scores, (n_actions * n_locations * n_arguments, 1))
def transduce(self, es):
    """
    Apply two strided valid convolutions followed by the bidirectional RNN layers.

    :param es: object exposing ``as_tensor()``; the tensor's first dimension
        is read as the sentence length
    :return: list of expressions, one per frame left after the convolutions,
        each the concatenation of the forward and backward states of the
        last RNN layer
    """
    es_expr = es.as_tensor()  # e.g. es_expr.dim() ==((276, 240), 1)
    dims = es_expr.dim()
    sent_len = dims[0][0]
    batch_size = dims[1]

    # convolutions won't work if sent length is too short; pad if necessary
    pad_size = 0
    while True:
        frames = math.ceil(float(sent_len + pad_size - self.filter_size_time + 1)
                           / float(self.stride[0]))
        if not frames < self.filter_size_time:
            break
        pad_size += 1
    if pad_size > 0:
        padding = dy.zeroes((pad_size, self.freq_dim * self.chn_dim),
                            batch_size=es_expr.dim()[1])
        es_expr = dy.concatenate([es_expr, padding])
        sent_len += pad_size

    # convolution layers
    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim),
                        batch_size=batch_size)  # ((276, 80, 3), 1)
    cnn_layer1 = dy.conv2d(es_chn, dy.parameter(self.filters1),
                           stride=self.stride, is_valid=True)  # ((137, 39, 32), 1)
    cnn_layer2 = dy.conv2d(cnn_layer1, dy.parameter(self.filters2),
                           stride=self.stride, is_valid=True)  # ((68, 19, 32), 1)
    conv_shape = cnn_layer2.dim()[0]
    # merge the last two dimensions into one feature vector per frame
    cnn_out = dy.reshape(cnn_layer2, (conv_shape[0], conv_shape[1] * conv_shape[2]),
                         batch_size=batch_size)  # ((68, 608), 1)
    es_list = [cnn_out[i] for i in range(cnn_out.dim()[0][0])]

    # RNN layers
    for forward_builder, backward_builder in self.builder_layers:
        fs = forward_builder.initial_state().transduce(es_list)
        bs = backward_builder.initial_state().transduce(reversed(es_list))
        es_list = [dy.concatenate([f, b]) for f, b in zip(fs, reversed(bs))]
    return es_list
def stitch(self, layer_predictions):
    """
    Combine the predicted states of all layers into one vector using ``self.betas``.

    :param layer_predictions: a list of length num_layers with one predicted
        state per layer
    :return: expression of shape (hidden_dim,) holding the beta-weighted
        combination of the layer states
    """
    assert len(layer_predictions) == self.num_layers
    # NOTE(review): concatenate_cols presumably yields (hidden_dim, num_layers);
    # the reshape below reinterprets that buffer rather than transposing it -
    # confirm this is the intended layout.
    stacked_states = dynet.reshape(
        dynet.concatenate_cols(list(layer_predictions)),
        (self.num_layers, self.hidden_dim))
    betas = dynet.parameter(self.betas)
    # betas are only transposed when there is more than one layer
    weights = dynet.transpose(betas) if self.num_layers > 1 else betas
    return dynet.reshape(weights * stacked_states, (self.hidden_dim, ))
def bilinear(x, W, y, input_size, seq_len, num_outputs=1, bias_x=False, bias_y=False):
    """
    Compute the bilinear product ``y^T (W x)``, optionally with bias rows.

    When ``bias_x`` / ``bias_y`` is set, a row of ones is appended to the
    corresponding input before the product.
    """
    # x,y: (input_size x seq_len) x batch_size

    def append_ones_row(matrix):
        ones_row = dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))
        return dy.concatenate([matrix, ones_row])

    if bias_x:
        x = append_ones_row(x)
    if bias_y:
        y = append_ones_row(y)
    ny = input_size + bias_y
    multiple_outputs = num_outputs > 1

    # W: (num_outputs x ny) x nx
    lin = W * x
    if multiple_outputs:
        lin = dy.reshape(lin, (ny, num_outputs * seq_len))
    blin = dy.transpose(y) * lin
    if multiple_outputs:
        blin = dy.reshape(blin, (seq_len, num_outputs, seq_len))
    # seq_len_y x seq_len_x if output_size == 1
    # seq_len_y x num_outputs x seq_len_x else
    return blin
def stitch(self, layer_predictions):
    """
    Takes as input the predicted states of all the layers of a task-specific
    network and produces a linear combination of them.

    :param layer_predictions: a list of length num_layers containing lists
        of length seq_len of predicted states for each layer
    :return: a list with one (hidden_dim,) expression per time step, each the
        beta-weighted combination of that step's layer states
    """
    assert len(layer_predictions) == self.num_layers

    def combine(layer_states):
        # stack the states of all layers for one time step and reshape to
        # (num_layers, hidden_dim)
        # NOTE(review): this reshape reinterprets the concatenated buffer
        # rather than transposing it - confirm the layout is intended.
        stacked_states = dynet.reshape(
            dynet.concatenate_cols(list(layer_states)),
            (self.num_layers, self.hidden_dim))
        # multiply with (1, num_layers) betas to produce (1, hidden_dim)
        weighted = dynet.transpose(dynet.parameter(self.betas)) * stacked_states
        # reshape to (hidden_dim)
        return dynet.reshape(weighted, (self.hidden_dim, ))

    # zip(*...) yields the tuple of per-layer predictions at every time step
    return [combine(layer_states) for layer_states in zip(*layer_predictions)]
def transform(self, input_expr: dy.Expression, mask: Optional[batchers.Mask]=None):
    """
    Apply batch norm.

    In training mode the statistics are computed from ``input_expr`` and the
    running population statistics are updated; otherwise the stored
    population statistics are used.

    Args:
      input_expr: input
      mask: compute statistics only over unmasked parts of the input expression

    Returns:
      Normalized expression with the same dimensions as ``input_expr``.
    """
    dim_in = input_expr.dim()
    param_bn_gamma = dy.parameter(self.gamma)
    param_bn_beta = dy.parameter(self.beta)
    if self.train:
        num_unmasked = 0
        if mask is not None:
            input_expr = set_masked_to_mean(mask, input_expr, self.time_first)
            num_unmasked = (mask.np_arr.size - np.count_nonzero(mask.np_arr)) \
                * broadcast_factor(mask, input_expr)
        bn_mean = dy.moment_dim(input_expr, self.get_stat_dimensions(), 1, True, num_unmasked)
        # Fix: this used to be -reshape(-bn_mean), which cancels to +mean, so
        # training ADDED the batch mean while inference subtracts the running
        # mean. A single negation makes both modes compute (x - mean).
        neg_bn_mean_reshaped = dy.reshape(-bn_mean, self.get_normalizer_dimensionality())
        # exponential moving average of the batch statistics
        self.population_running_mean += (-BN_MOMENTUM) * self.population_running_mean \
            + BN_MOMENTUM * bn_mean.npvalue()
        bn_std = dy.std_dim(input_expr, self.get_stat_dimensions(), True, num_unmasked)
        self.population_running_std += (-BN_MOMENTUM) * self.population_running_std \
            + BN_MOMENTUM * bn_std.npvalue()
    else:
        neg_bn_mean_reshaped = -dy.reshape(dy.inputVector(self.population_running_mean),
                                           self.get_normalizer_dimensionality())
        bn_std = dy.inputVector(self.population_running_std)
    bn_numerator = input_expr + neg_bn_mean_reshaped
    bn_xhat = dy.cdiv(bn_numerator,
                      dy.reshape(bn_std, self.get_normalizer_dimensionality()) + BN_EPS)
    bn_y = dy.cmult(param_bn_gamma, bn_xhat) + param_bn_beta  # y = gamma * xhat + beta
    dim_out = bn_y.dim()
    self.save_processed_arg("population_running_mean", self.population_running_mean)
    self.save_processed_arg("population_running_std", self.population_running_std)
    assert dim_out == dim_in
    return bn_y
def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1, bias_x=False, bias_y=False):
    """
    Compute the batched bilinear product ``y^T (W x)``, optionally with bias rows.

    When ``bias_x`` / ``bias_y`` is set, a row of ones is appended to the
    corresponding input before the product.
    """
    # adopted from: https://github.com/jcyk/Dynet-Biaffine-dependency-parser/blob/master/lib/utils.py
    # x,y: (input_size x seq_len) x batch_size

    def append_ones_row(matrix):
        ones_row = dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))
        return dy.concatenate([matrix, ones_row])

    if bias_x:
        x = append_ones_row(x)
    if bias_y:
        y = append_ones_row(y)
    ny = input_size + bias_y
    multiple_outputs = num_outputs > 1

    # W: (num_outputs x ny) x nx
    lin = W * x
    if multiple_outputs:
        lin = dy.reshape(lin, (ny, num_outputs * seq_len), batch_size=batch_size)
    blin = dy.transpose(y) * lin
    if multiple_outputs:
        blin = dy.reshape(blin, (seq_len, num_outputs, seq_len), batch_size=batch_size)
    # seq_len_y x seq_len_x if output_size == 1
    # seq_len_y x num_outputs x seq_len_x else
    return blin
def recurrence(self, xt, hmtm1, cmtm1, h_tilde_tm1, dropout_flag):
    """
    recurrence function of LSTM with truncated self-attention
    :param xt: current input, shape: (n_in)
    :param hmtm1: hidden memory [htm1, ..., h1], shape: (n_steps, n_out)
    :param cmtm1: cell memory: (n_steps, n_out)
    :param h_tilde_tm1: previous hidden summary, shape: (n_out, )
    :param dropout_flag: where perform partial dropout
    :return: (updated hidden memory, updated cell memory, current hidden summary)
    """
    n_out = self.n_out
    step_scores = []
    for step in range(self.n_steps):
        pre_activation = self.W_h * hmtm1[step] + self.W_x * xt \
            + self.W_htilde * h_tilde_tm1
        step_scores.append(dy.dot_product(self.u, dy.tanh(pre_activation)))
    # normalize the attention score
    score = dy.softmax(dy.concatenate(step_scores))

    # attention-weighted summaries of the hidden and cell memories
    # shape: (1, n_out)
    h_tilde_t = dy.reshape(dy.transpose(score) * hmtm1, d=(n_out,))
    c_tilde_t = dy.transpose(score) * cmtm1

    Wx = self.W * xt
    if dropout_flag:
        # perform partial dropout over the lstm
        Wx = dy.dropout(Wx, self.dropout_rate)
    Uh = self.U * h_tilde_t
    # shape: (4*n_out)
    gates = Wx + Uh + self.b
    it = dy.logistic(gates[:n_out])
    ft = dy.logistic(gates[n_out:2 * n_out])
    ot = dy.logistic(gates[2 * n_out:3 * n_out])
    c_hat = dy.tanh(gates[3 * n_out:])

    ct = dy.cmult(ft, dy.reshape(c_tilde_t, d=(n_out,))) + dy.cmult(it, c_hat)
    ht = dy.cmult(ot, dy.tanh(ct))

    # drop the first memory row and append the new state as the last row
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, n_out))])
    cmt = dy.concatenate([cmtm1[1:], dy.reshape(ct, (1, n_out))])
    return hmt, cmt, h_tilde_t
def calc_predict_and_activations(wids, tag, words):
    """
    Run the CNN classifier on one example and print its prediction details.

    Prints the tag and words, the per-filter activations (via
    ``display_activations``), the scores with the predicted class, the
    softmax bias and the per-class feature contributions.

    :param wids: list of word ids; it is padded with 0 up to WIN_SIZE on a
        copy, so the caller's list is not modified
    :param tag: integer tag printed next to the sentence
    :param words: list of word strings
    """
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        # Build a new list instead of `+=`, which padded the caller's list in place.
        wids = list(wids) + [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    # position with the highest response for every filter
    activations = filters.argmax(axis=0)

    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)
    scores = (W_sm * pool_out + b_sm).npvalue()

    print ('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print (display_activations(words, activations))
    print ('scores=%s, predict: %d' % (scores, predict))

    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print (' bias=%s' % bias)
    # element-wise product: contribution of every pooled feature to each class
    contributions = W * features
    label_formats = (
        ' very bad (%.4f): %s',
        ' bad (%.4f): %s',
        ' neutral (%.4f): %s',
        ' good (%.4f): %s',
        'very good (%.4f): %s',
    )
    for class_idx, label_format in enumerate(label_formats):
        print (label_format % (scores[class_idx], contributions[class_idx]))
def _attend(self, input_vectors, state, prev_att, prev_att_expr, receptive, compute_attention):
    """
    Return the attention-weighted sum of ``input_vectors`` and the weights used.

    New weights are computed when ``compute_attention`` is set or no previous
    weight expression exists; otherwise ``prev_att_expr`` is reused.
    """
    if not (compute_attention or prev_att_expr is None):
        attention_weights = prev_att_expr
    else:
        w1 = self.att_w1.expr()
        w2 = self.att_w2.expr()
        w3 = self.att_w3.expr()
        w4 = self.att_w4.expr()
        v = self.att_v.expr()

        num_inputs = len(input_vectors)
        # features computed from the previous attention by the attention CNN
        att_cnn = self.cnn_attention.apply(dy.reshape(prev_att, (num_inputs, 1)))
        att_cnn = dy.reshape(att_cnn, (num_inputs, self.config.att_lsa_filters))

        # terms that are the same for every input position
        state_term = w2 * state.h()[-1]
        receptive_term = w4 * receptive
        scores = [
            v * dy.tanh(w1 * input_vector + state_term + w3 * cnn + receptive_term)
            for cnn, input_vector in zip(att_cnn, input_vectors)
        ]
        attention_weights = dy.softmax(dy.concatenate(scores))

    weighted = [vector * weight
                for vector, weight in zip(input_vectors, attention_weights)]
    output_vectors = dy.esum(weighted)
    return output_vectors, attention_weights
def train(self, trainning_set):
    """
    Run one update per example of ``trainning_set``.

    Each example contributes a trigger loss on the attention, a binary label
    loss and one negative log-likelihood term per pattern in its rule; the
    computation graph is renewed after every update.
    """
    for sentence, eid, entity, trigger, label, pos, chars, rule in trainning_set:
        num_words = len(sentence)
        features = self.encode_sentence(sentence, pos, chars)
        entity_embeds = features[entity]
        attention, context = self.self_attend(features)

        # one-hot target marking the trigger position
        trigger_target = dy.vecInput(num_words)
        trigger_target.set([1 if i == trigger else 0 for i in range(num_words)])
        losses = [dy.binary_log_loss(dy.reshape(attention, (num_words,)), trigger_target)]

        h_t = dy.concatenate([context, entity_embeds])
        hidden = dy.tanh(self.lb.expr() * h_t + self.lb_bias.expr())
        out_vector = dy.reshape(
            dy.logistic(self.lb2.expr() * hidden + self.lb2_bias.expr()), (1,))
        gold_label = dy.scalarInput(label)
        losses.append(dy.binary_log_loss(out_vector, gold_label))

        # the decoder sees the patterns emitted so far, starting from [0]
        pres = [0]
        for pattern in rule:
            probs = self.decoder(features, pres)
            losses.append(-dy.log(dy.pick(probs, pattern)))
            pres.append(pattern)

        total_loss = dy.esum(losses)
        total_loss.backward()
        self.trainer.update()
        dy.renew_cg()
def build_graph(self, x):
    """
    Build the logit expression for input ``x``.

    ``x`` is reshaped to (1, n, d), passed through three parallel tanh
    convolutions, each output is max-pooled, and the concatenated pools are
    scored with a dot product against ``W`` plus ``b``.
    """
    conv_1_params = (dy.parameter(self.params['conv_W_1']),
                     dy.parameter(self.params['conv_b_1']))
    conv_2_params = (dy.parameter(self.params['conv_W_2']),
                     dy.parameter(self.params['conv_b_2']))
    conv_3_params = (dy.parameter(self.params['conv_W_3']),
                     dy.parameter(self.params['conv_b_3']))
    W = dy.parameter(self.params['W'])
    b = dy.parameter(self.params['b'])

    (n, d), _ = x.dim()
    x = dy.reshape(x, (1, n, d))

    # 1-D convolutional network
    conv_outputs = [
        dy.tanh(dy.conv2d_bias(x, conv_W, conv_b, (1, 1), is_valid=False))
        for conv_W, conv_b in (conv_1_params, conv_2_params, conv_3_params)
    ]
    channel_counts = (self.options['channel_1'],
                      self.options['channel_2'],
                      self.options['channel_3'])
    pools = [
        dy.max_dim(dy.reshape(conv_out, (n, channels)))
        for conv_out, channels in zip(conv_outputs, channel_counts)
    ]

    # fully connected classification
    pool = dy.concatenate(pools, 0)
    logit = dy.dot_product(pool, W) + b
    return logit
def calc_loss(self, src, trg, loss_calculator):
    """
    Compute the next-token loss of ``src`` and return it under the name "mle".

    Inputs are the sentences without their last token and targets the
    sentences without their first token; ``trg`` and ``loss_calculator`` are
    not used here.
    """
    if not batcher.is_batched(src):
        src = batcher.ListBatch([src])

    if src.mask:
        input_mask = batcher.Mask(src.mask.np_arr[:, :-1])
        target_mask = batcher.Mask(src.mask.np_arr[:, 1:])
    else:
        input_mask = None
        target_mask = None
    src_inputs = batcher.ListBatch([s[:-1] for s in src], mask=input_mask)
    src_targets = batcher.ListBatch([s[1:] for s in src], mask=target_mask)

    self.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src_inputs)
    encodings = self.rnn.transduce(embeddings)
    encodings_tensor = encodings.as_tensor()
    (hidden_dim, seq_len), batch_size = encodings.dim()
    # fold the time dimension into the batch so every step is scored at once
    folded = batch_size * seq_len
    encoding_reshaped = dy.reshape(encodings_tensor, (hidden_dim,), batch_size=folded)
    outputs = self.transform(encoding_reshaped)

    ref_action = np.asarray([sent.words for sent in src_targets]).reshape((seq_len * batch_size,))
    per_step_loss = self.scorer.calc_loss(outputs, batcher.mark_as_batch(ref_action))
    per_step_loss = dy.reshape(per_step_loss, (seq_len,), batch_size=batch_size)
    if src_targets.mask:
        # scale every step's loss by (1 - mask value)
        keep = dy.inputTensor(1.0 - src_targets.mask.np_arr.T, batched=True)
        per_step_loss = dy.cmult(per_step_loss, keep)

    loss_expr = dy.sum_elems(per_step_loss)
    model_loss = loss.FactoredLossExpr()
    model_loss.add_loss("mle", loss_expr)
    return model_loss
def calc_predict_and_activations(wids, tag, words):
    """
    Run the CNN classifier on one example and print its prediction details.

    Prints the tag and words, the per-filter activations (via
    ``display_activations``), the scores with the predicted class, the
    softmax bias and the per-class feature contributions.

    :param wids: list of word ids; it is padded with 0 up to WIN_SIZE on a
        copy, so the caller's list is not modified
    :param tag: integer tag printed next to the sentence
    :param words: list of word strings
    """
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        # Build a new list instead of `+=`, which padded the caller's list in place.
        wids = list(wids) + [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    # position with the highest response for every filter
    activations = filters.argmax(axis=0)

    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE, ))
    pool_out = dy.rectify(pool_out)
    scores = (W_sm * pool_out + b_sm).npvalue()

    print('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print(display_activations(words, activations))
    print('scores=%s, predict: %d' % (scores, predict))

    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print(' bias=%s' % bias)
    # element-wise product: contribution of every pooled feature to each class
    contributions = W * features
    label_formats = (
        ' very bad (%.4f): %s',
        ' bad (%.4f): %s',
        ' neutral (%.4f): %s',
        ' good (%.4f): %s',
        'very good (%.4f): %s',
    )
    for class_idx, label_format in enumerate(label_formats):
        print(label_format % (scores[class_idx], contributions[class_idx]))
def shape_projection(self, x, batch_size):
    """
    Regroup a word-batched projection into per-head matrices.

    ``x`` is read as having ``seq_len * batch_size`` batch elements; it is
    reshaped to (model_dim, seq_len) per sentence, transposed, and reshaped
    again to (seq_len, dim_per_head) with ``batch_size * head_count`` batch
    elements.

    :param x: expression whose batch dimension holds all words of the batch
    :param batch_size: number of sentences in the batch
    :return: reshaped expression
    """
    total_words = x.dim()[1]
    # Floor division: seq_len is used as a tensor dimension, so it must stay
    # an int on Python 3 (true division would produce a float).
    seq_len = total_words // batch_size
    out = dy.reshape(x, (self.model_dim, seq_len), batch_size=batch_size)
    out = dy.transpose(out)
    return dy.reshape(out, (seq_len, self.dim_per_head),
                      batch_size=batch_size * self.head_count)
def transduce(
        self, src: expression_seqs.ExpressionSequence
) -> expression_seqs.ExpressionSequence:
    """
    Apply ``self.transform`` to the whole sequence tensor, optionally downsampling first.

    With ``downsample_by > 1`` consecutive time steps are merged into the
    hidden dimension by a reshape and the mask is subsampled accordingly.

    Args:
      src: input sequence

    Returns:
      Sequence wrapping the transformed tensor; ``self._final_states`` is set
      from its last element.

    Raises:
      ValueError: if downsampling is requested and the sequence length is not
        a multiple of ``downsample_by``.
    """
    src_tensor = src.as_tensor()
    out_mask = src.mask
    if self.downsample_by > 1:
        assert len(src_tensor.dim()[0])==2, \
            f"Downsampling only supported for tensors of order two. Found dims {src_tensor.dim()}"
        (hidden_dim, seq_len), batch_size = src_tensor.dim()
        if seq_len % self.downsample_by != 0:
            raise ValueError(
                "For downsampling, sequence lengths must be multiples of the total reduce factor. "
                "Configure batcher accordingly.")
        src_tensor = dy.reshape(src_tensor,
                                (hidden_dim * self.downsample_by,
                                 seq_len // self.downsample_by),
                                batch_size=batch_size)
        if out_mask:
            out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
    output = self.transform.transform(src_tensor)
    if self.downsample_by == 1:
        src_shape, src_batch_size = src_tensor.dim()
        # Fix: the old test compared len(output.dim()) (an int) with the dim
        # tuple itself, so it was always True. Compare the tensor orders, which
        # is what the "seq length 1" case is about.
        if len(output.dim()[0]) != len(src_shape):  # can happen with seq length 1
            output = dy.reshape(output, src_shape, batch_size=src_batch_size)
    output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def recurrence(self, xt, hmtm1, h_history_tm1, dropout_flag):
    """
    One GRU-style step over an attention summary of the hidden memory.

    :param xt: input vector at the time step t
    :param hmtm1: hidden memories in previous n_steps steps
    :param h_history_tm1: previous history summary, used when scoring the memories
    :param dropout_flag: make a decision for conducting partial dropout
    :return: (updated hidden memory, current history summary)
    """

    def input_matrix(matrix):
        # partial dropout, i.e., dropout is only added over the matrices W_x*
        if dropout_flag:
            return dy.dropout(matrix, self.dropout_rate)
        return matrix

    step_scores = [
        dy.dot_product(self.u, dy.tanh(
            self.W_h * hmtm1[i] + self.W_x * xt + self.W_htilde * h_history_tm1))
        for i in range(self.n_steps)
    ]
    # normalize the attention score
    score = dy.softmax(dy.concatenate(step_scores))
    # shape: (1, n_out), history of [h[t-n_steps-1], ..., h[t-2]]
    h_history_t = dy.reshape(dy.transpose(score) * hmtm1[:-1], d=(self.n_out,))
    htm1 = hmtm1[-1]
    # most recent state plus the rectified history summary
    h_tilde_t = htm1 + dy.rectify(h_history_t)

    rt = dy.logistic(input_matrix(self.W_xr) * xt + self.W_hr * h_tilde_t + self.br)
    zt = dy.logistic(input_matrix(self.W_xz) * xt + self.W_hz * h_tilde_t + self.bz)
    ht_hat = dy.tanh(input_matrix(self.W_xh) * xt
                     + self.W_hh * dy.cmult(rt, h_tilde_t) + self.bh)
    ht = dy.cmult(zt, h_tilde_t) + dy.cmult((1.0 - zt), ht_hat)

    # drop the first memory row and append the new state as the last row
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, self.n_out))])
    return hmt, h_history_t
def scorer(self, q_d_hists, q_idf, bm25_score, overlap_features, p):
    """
    Makes all the calculations and returns a relevance score

    :param q_d_hists: one histogram per query term, each fed through the MLP
    :param q_idf: per-term values used for the term gating
    :param bm25_score: scalar feature, randomly zeroed or scaled by 1/p
    :param overlap_features: extra features concatenated with the gated score
    :param p: probability of keeping the bm25 feature active
    :return: document score expression
    """
    idf_vec = dy.inputVector(q_idf)
    bm25_score = dy.scalarInput(bm25_score)
    overlap_features = dy.inputVector(overlap_features)

    # Pass each query term representation through the MLP
    term_scores = []
    for hist in q_d_hists:
        hist_row = dy.reshape(dy.inputVector(hist), (1, len(hist)))
        hidden = dy.rectify(hist_row * self.W_1 + self.b_1)
        for layer in range(0, self.mlp_layers):
            hidden = dy.rectify(hidden * self.W_n[layer] + self.b_n[layer])
        term_scores.append(hidden * self.W_last + self.b_last)

    # Term Gating
    gating_weights = idf_vec * self.w_g

    bm25_feature = bm25_score * self.W_bm25 + self.b_bm25
    # dropout on the bm25 feature: 0 or 1/p
    drop_out = dy.scalarInput(1)
    drop_num = (np.random.rand(1) < p)/p  # p= probability of keeping a unit active
    drop_out.set(drop_num)
    bm25_feature *= drop_out

    # basic MLPs output
    gating_column = dy.reshape(gating_weights, (len(q_idf), 1))
    drmm_score = dy.transpose(dy.concatenate(term_scores)) * gating_column
    # extra features layer
    combined = dy.concatenate([drmm_score, overlap_features])
    doc_score = dy.transpose(combined) * self.W_scores + self.b_scores
    return doc_score
def encode(input_, train):
    """Add a leading dimension of size 1, apply ``first_layer`` and ``residual``, then drop the leading dimension."""
    original_shape = input_.dim()[0]
    expanded = dy.reshape(input_, tuple([1] + list(original_shape)))
    hidden = residual(first_layer(expanded, train), train)
    hidden_shape = hidden.dim()[0]
    # drop the leading dimension again
    return dy.reshape(hidden, hidden_shape[1:])
def norm(x):
    """Layer Norm only handles a vector in dynet so fold extra dims into the batch."""
    shape, batchsz = x.dim()
    vector_size = shape[0]
    # number of vectors hidden in the trailing dimensions
    extra = np.prod(shape[1:])
    folded = dy.reshape(x, (vector_size, ), batch_size=batchsz * extra)
    normalized = dy.layer_norm(folded, a, b)
    # restore the original shape and batch size
    return dy.reshape(normalized, shape, batch_size=batchsz)
def folded_softmax(x, softmax=dy.softmax):
    """Dynet only allows for softmax on matrices.

    The trailing dimensions are flattened into one, ``softmax`` is applied
    along dimension 0, and the original shape is restored.
    """
    shape, batchsz = x.dim()
    rows = shape[0]
    cols = np.prod(shape[1:])
    as_matrix = dy.reshape(x, (rows, cols), batch_size=batchsz)
    normalized = softmax(as_matrix, d=0)
    return dy.reshape(normalized, shape, batch_size=batchsz)
def norm(x):
    """Layer Norm only handles a vector in dynet so fold extra dims into the batch."""
    shape, batchsz = x.dim()
    vector_size = shape[0]
    # number of vectors hidden in the trailing dimensions
    extra = np.prod(shape[1:])
    folded = dy.reshape(x, (vector_size,), batch_size=batchsz*extra)
    normalized = dy.layer_norm(folded, a, b)
    # restore the original shape and batch size
    return dy.reshape(normalized, shape, batch_size=batchsz)
def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
    """
    Add the child transducer's output to its input and optionally layer-normalize.

    Args:
      seq: input sequence

    Returns:
      Sequence wrapping the (optionally normalized) sum.
    """
    summed = self.child.transduce(seq).as_tensor() + seq.as_tensor()
    if self.layer_norm:
        shape, batch_size = summed.dim()
        # layer_norm works on vectors: fold the second dimension into the batch
        folded = dy.reshape(summed, (shape[0],), batch_size=shape[1] * batch_size)
        folded = dy.layer_norm(folded, self.ln_g, self.ln_b)
        summed = dy.reshape(folded, shape, batch_size=batch_size)
    return ExpressionSequence(expr_tensor=summed)
def do_one_batch(X_batch, Z_batch):
    """
    Build the similarity expression between the convolutional embedding of
    ``X_batch`` and the linear embedding of ``Z_batch``.

    :param X_batch: numpy array fed to the three-layer convolutional network
    :param Z_batch: numpy array fed through ``w_embed`` / ``b_embed``
    :return: rectified similarity expression
    """
    # Flatten the batch into 1-D vector for workaround
    batch_size = X_batch.shape[0]
    if DO_BATCH:
        X_batch_f = X_batch.flatten('F')
        Z_batch_f = Z_batch.flatten('F')
        x = dy.reshape(dy.inputVector(X_batch_f), (nmf, nframes), batch_size=batch_size)
        # Fix: the dimension must be a tuple; `(nvgg)` is just the int nvgg.
        z = dy.reshape(dy.inputVector(Z_batch_f), (nvgg,), batch_size=batch_size)
        scnn.add_input([X_batch[i] for i in range(X_batch.shape[0])])
        vgg.add_input([Z_batch[i] for i in range(X_batch.shape[0])])
    else:
        x = dy.matInput(X_batch.shape[0], X_batch.shape[1])
        x.set(X_batch.flatten('F'))
        z = dy.vecInput(Z_batch.shape[0])
        z.set(Z_batch.flatten('F'))
    x = dy.reshape(dy.transpose(x, [1, 0]), (1, X_batch.shape[1], X_batch.shape[0]))
    print(x.npvalue().shape)

    # three convolution + k-max-pooling stages
    a_h1 = dy.conv2d_bias(x, w_i, b_i, [1, 1], is_valid=False)
    h1 = dy.rectify(a_h1)
    h1_pool = dy.kmax_pooling(h1, D[1], d=1)

    a_h2 = dy.conv2d_bias(h1_pool, w_h1, b_h1, [1, 1], is_valid=False)
    h2 = dy.rectify(a_h2)
    h2_pool = dy.kmax_pooling(h2, D[2], d=1)

    a_h3 = dy.conv2d_bias(h2_pool, w_h2, b_h2, [1, 1], is_valid=False)
    h3 = dy.rectify(a_h3)
    h3_pool = dy.kmax_pooling(h3, D[3], d=1)

    # keep only the maximum along dimension 1 and flatten to a vector
    h4 = dy.kmax_pooling(h3_pool, 1, d=1)
    h4_re = dy.reshape(h4, (J[3], ))
    g = dy.scalarInput(1.)
    zem_sp = dy.weight_norm(h4_re, g)
    zem_vgg = w_embed * z + b_embed

    sa = dy.transpose(zem_sp) * zem_vgg
    s = dy.rectify(sa)

    if PRINT_EMBED:
        print('Vgg embedding vector:', zem_vgg.npvalue().shape)
        print(zem_vgg.value())
        print('Speech embedding vector:', zem_sp.npvalue().shape)
        print(zem_sp.value())
    if PRINT_SIM:
        print('Raw Similarity:', sa.npvalue())
        print(sa.value())
        print('Similarity:', s.npvalue())
        print(s.value())
    return s
def forward(self, state):
    """Map a length-four state through the two layers and return the transposed output."""
    # State should be a length-four matrix
    state_row = dy.reshape(dy.inputTensor(state), (1, 4))
    weights_1 = dy.parameter(self.w_1)
    bias_row = dy.reshape(dy.parameter(self.b_1), (1, 4))
    l1 = state_row * weights_1 + bias_row
    l2 = l1 * dy.parameter(self.w_2)
    return dy.transpose(l2)
def transduce(self, expr_seq: ExpressionSequence) -> ExpressionSequence:
    """
    transduce the sequence with multi-head scaled dot-product self-attention

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """
    Wq, Wk, Wv, Wo = (dy.parameter(p) for p in (self.pWq, self.pWk, self.pWv, self.pWo))
    bq, bk, bv, bo = (dy.parameter(p) for p in (self.pbq, self.pbk, self.pbv, self.pbo))

    # Start with a [(length, model_size) x batch] tensor
    x = expr_seq.as_transposed_tensor()
    x_len = x.dim()[0][0]
    x_batch = x.dim()[1]

    # Get the query key and value vectors
    # TODO: do we need bias broadcasting in DyNet?
    q = bq + x * Wq
    k = bk + x * Wk
    v = bv + x * Wv

    # Split to batches [(length, head_dim) x batch * num_heads] tensor
    head_shape = (x_len, self.head_dim)
    head_batch = x_batch * self.num_heads
    q, k, v = [dy.reshape(projection, head_shape, batch_size=head_batch)
               for projection in (q, k, v)]

    # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
    attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
    if expr_seq.mask is not None:
        # repeat the mask once per head and turn masked entries into large negative scores
        repeated_mask = np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose()
        mask = dy.inputTensor(repeated_mask, batched=True) * -1e10
        attn_score = attn_score + mask
    attn_prob = dy.softmax(attn_score, d=1)

    # Reduce using attention and resize to match [(length, model_size) x batch]
    o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)
    # Final transformation
    o = bo + o * Wo

    output_seq = ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)
    self._final_states = [FinalTransducerState(output_seq[-1], None)]
    return output_seq
def create_network_return_best(self, x):
    """
    Classify ``x`` from the average of its item embeddings.

    Items missing from ``self.corpus`` use index ``len(self.corpus)``.

    :param x: iterable of items to look up
    :return: index of the highest-scoring class
    """
    dy.renew_cg()
    embedded = [self.lookup[self.corpus.get(item, len(self.corpus))] for item in x]
    mean_row = dy.reshape(dy.average(embedded), (1, self.dim))
    hidden = dy.tanh((mean_row * self._pW1) + self._pB1)
    logits = (hidden * self._kW1) + self._kB1
    probabilities = dy.softmax(dy.reshape(logits, (self.numClasses,)))
    return np.argmax(probabilities.npvalue())
def parse(self, words, extwords, tags):
    """
    Run the model with ``False`` as the last ``forward`` argument and return
    arc and relation probabilities.

    Both logit tensors are reshaped so that ``len(words)`` becomes the batch
    size before the softmax is applied.
    """
    arc_logits, rel_logits = self.forward(words, extwords, tags, False)
    seq_len = len(words)

    per_token_arc = dy.reshape(arc_logits, (seq_len, ), seq_len)
    arc_probs = dy.softmax(per_token_arc)

    per_token_rel = dy.reshape(rel_logits, (seq_len, self.rel_size), seq_len)
    rel_probs = dy.softmax(dy.transpose(per_token_rel))
    return arc_probs, rel_probs
def __call__(self, x, dropout=False):
    """
    Compute the logits for input ``x``.

    When ``args.conv`` is set, ``x`` is first reshaped to (28, 28, 1) and
    passed through two convolution + 2x2 max-pooling stages.

    :param x: input expression
    :param dropout: apply dropout with DROPOUT_RATE to the hidden layer
    :return: logits expression
    """
    if args.conv:
        features = dy.reshape(x, (28, 28, 1))
        for filters, bias in ((self.F1, self.b1), (self.F2, self.b2)):
            features = dy.conv2d_bias(features, filters, bias, [1, 1], is_valid=False)
            features = dy.rectify(dy.maxpooling2d(features, [2, 2], [2, 2]))
        # 7x7x64
        x = dy.reshape(features, (7 * 7 * 64,))

    hidden = dy.rectify(self.W1 * x + self.hbias)
    if dropout:
        hidden = dy.dropout(hidden, DROPOUT_RATE)
    return self.W2 * hidden
def word_repr(self, char_seq):
    """
    Obtain the word representation when given its character sequence.

    Parameter expressions for the word length are created on first use and
    cached in ``self.param_exprs``.

    :param char_seq: list of character expressions
    :return: word expression
    """
    wlen = len(char_seq)
    ndims = self.options['ndims']

    if 'rgW%d' % wlen not in self.param_exprs:
        # load the length-specific parameters once per word length
        for expr_prefix, param_name in (('rgW', 'reset_gate_W'),
                                        ('rgb', 'reset_gate_b'),
                                        ('cW', 'com_W'),
                                        ('cb', 'com_b'),
                                        ('ugW', 'update_gate_W'),
                                        ('ugb', 'update_gate_b')):
            self.param_exprs['%s%d' % (expr_prefix, wlen)] = \
                dy.parameter(self.params[param_name][wlen - 1])

    reset_W = self.param_exprs['rgW%d' % wlen]
    reset_b = self.param_exprs['rgb%d' % wlen]
    com_W = self.param_exprs['cW%d' % wlen]
    com_b = self.param_exprs['cb%d' % wlen]
    update_W = self.param_exprs['ugW%d' % wlen]
    update_b = self.param_exprs['ugb%d' % wlen]

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(reset_W * chars + reset_b)
    comb = dy.concatenate([dy.tanh(com_W * dy.cmult(reset_gate, chars) + com_b), chars])
    update_logits = update_W * comb + update_b

    # One softmax per output dimension over its (wlen + 1) candidates.
    # `range` replaces `xrange`, which does not exist on Python 3.
    width = wlen + 1
    update_gate = dy.transpose(dy.concatenate_cols([
        dy.softmax(dy.pickrange(update_logits, i * width, (i + 1) * width))
        for i in range(ndims)
    ]))
    # A faster exp/sum formulation of this softmax was tried here but is not
    # numerically safe.

    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (ndims, width))))
    return word
def conv(input_, _=None):
    """Apply every convolution in ``convs`` to ``input_`` and join the pooled outputs.

    The input is given a leading dimension of size 1 before the
    convolutions run; each result is reduced with ``mot_pool`` and the
    pooled vectors are concatenated.

    :param input_: DyNet expression to convolve.
    :param _: ignored.
    :return: concatenation of the pooled outputs, in the order of ``convs``.
    """
    expanded = dy.reshape(input_, (1,) + tuple(input_.dim()[0]))
    return dy.concatenate([mot_pool(conv_fn(expanded)) for conv_fn in convs])
def unsqueeze(x, dim):
    """Add a dimension of size 1 to `x` at position `dim`.

    A negative `dim` is offset by ``len(shape) + 1``, so ``-1`` appends a
    trailing dimension. The batch size of `x` is preserved.
    """
    dims, batchsz = x.dim()
    if dim < 0:
        dim += len(dims) + 1
    expanded = list(dims)
    expanded[dim:dim] = [1]  # slice assignment inserts exactly like list.insert
    return dy.reshape(x, tuple(expanded), batch_size=batchsz)
def calc_loss(words, labels, heads):
    """Build a fresh graph and return the parser's decoding loss for one sentence.

    Each word is embedded, run through the forward and backward LSTMs, and
    the two outputs per position are concatenated into a
    ``(HID_SIZE * 2, 1)`` column before being handed to
    ``biaffineParser.decode_loss``.

    :param words: word ids of the sentence.
    :param labels: gold labels, passed to the parser wrapped in a list.
    :param heads: gold heads, passed to the parser wrapped in a list.
    :return: whatever ``biaffineParser.decode_loss`` returns.
    """
    dy.renew_cg()
    embedded = [dy.lookup(W_emb, wid) for wid in words]
    forward_out = fwdLSTM.initial_state().transduce(embedded)
    backward_out = bwdLSTM.initial_state().transduce(reversed(embedded))
    src_encodings = []
    # The backward outputs are reversed again so both directions align by position.
    for fwd_vec, bwd_vec in zip(forward_out, reversed(backward_out)):
        joined = dy.concatenate([fwd_vec, bwd_vec])
        src_encodings.append(dy.reshape(joined, (HID_SIZE * 2, 1)))
    return biaffineParser.decode_loss(src_encodings, ([heads], [labels]))
def calc_loss(sents):
    """Compute the masked, summed decoder loss for a minibatch of sentence pairs.

    :param sents: list of ``(source, target)`` pairs of word-id sequences.
        Every source is indexed at each position up to the longest source,
        so presumably all sources in a batch share one length -- TODO confirm
        against the batching code.
    :return: ``(loss, num_words)`` where ``loss`` is the loss expression
        summed over time steps and batch elements, and ``num_words`` is the
        number of non-padding target tokens counted by the masks.
    """
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [pair[0] for pair in sents]
    tgt_sents = [pair[1] for pair in sents]

    max_src_len = max(len(sent) for sent in src_sents)
    src_cws = [[sent[i] for sent in src_sents] for i in range(max_src_len)]

    # Encode the source and keep the output of the last step
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    src_output = init_state_src.add_inputs(
        [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()

    # Decoder: pad short targets with eos_trg and remember which slots are real.
    # The length must come from the *target* sentences; measuring the
    # (source, target) pairs in `sents` gives the pair length for every
    # element and cuts decoding off after the first steps.
    max_tgt_len = max(len(sent) for sent in tgt_sents)
    tgt_cws = []
    masks = []
    num_words = 0
    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    all_losses = []
    # NOTE(review): each step is masked with the mask of the *previous*
    # position (masks[i] pairs with tgt_cws[i + 1]); confirm this is intended.
    for next_words, mask in zip(tgt_cws[1:], masks):
        # Feed the previous words into the decoder and score the next ones
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        # One scalar mask value per batch element
        mask_expr = dy.reshape(dy.inputVector(mask), (1,), len(sents))
        all_losses.append(loss * mask_expr)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_acc(words, labels, heads):
    """Build a fresh graph, decode one sentence and score it against the gold parse.

    The sentence is encoded with the forward and backward LSTMs; the two
    outputs per position are concatenated into a ``(HID_SIZE * 2, 1)``
    column and decoded by ``biaffineParser.decoding``.

    :param words: word ids of the sentence.
    :param labels: gold labels.
    :param heads: gold heads.
    :return: whatever ``biaffineParser.cal_accuracy`` returns for the
        predicted and gold heads/labels.
    """
    dy.renew_cg()
    embedded = [dy.lookup(W_emb, wid) for wid in words]
    forward_out = fwdLSTM.initial_state().transduce(embedded)
    backward_out = bwdLSTM.initial_state().transduce(reversed(embedded))
    # Re-reverse the backward outputs so both directions line up by position.
    paired = zip(forward_out, reversed(backward_out))
    src_encodings = [
        dy.reshape(dy.concatenate([fwd_vec, bwd_vec]), (HID_SIZE * 2, 1))
        for fwd_vec, bwd_vec in paired
    ]
    predicted = biaffineParser.decoding(src_encodings)
    pred_heads, pred_labels = predicted
    return biaffineParser.cal_accuracy(pred_heads, pred_labels, heads, labels)
def batch_matmul(x, y):
    """Matrix product over the first two dimensions of `x` and `y`.

    Any dimensions of `x` after the first two are folded into the batch
    axis for the multiplication and restored on the result.

    Input: ((X, Y, ..), B) and ((Y, Z, ..), B)
    Output: ((X, Z, ..), B)
    """
    x_shape, batchsz = x.dim()
    lhs_dims, trailing = x_shape[:2], x_shape[2:]
    fold = np.prod(trailing)
    y_shape, _ = y.dim()
    rhs_dims = y_shape[:2]
    folded_batch = fold * batchsz
    lhs = dy.reshape(x, lhs_dims, batch_size=folded_batch)
    rhs = dy.reshape(y, rhs_dims, batch_size=folded_batch)
    out_shape = tuple([lhs_dims[0], rhs_dims[1]] + list(trailing))
    return dy.reshape(lhs * rhs, out_shape, batch_size=batchsz)
def squeeze(x, d=-1):
    """Remove size-1 dimensions from `x`, keeping its batch size.

    With the default ``d=-1`` every dimension of size 1 is dropped. Any
    other `d` drops only that dimension, which must have size 1.
    """
    dims, batchsz = x.dim()
    if d != -1:
        assert dims[d] == 1, "Cannot squeeze dimension {} of size {}".format(d, dims[d])
        remaining = list(dims)
        del remaining[d]
        new_shape = tuple(remaining)
    else:
        new_shape = tuple(size for size in dims if size != 1)
    return dy.reshape(x, new_shape, batch_size=batchsz)
def calc_scores(wids):
    """Build a fresh graph and score one sentence with the CNN classifier.

    :param wids: sequence of word ids. Sentences shorter than ``WIN_SIZE``
        are padded with id 0 up to that length; the caller's sequence is
        not modified.
    :return: the expression ``W_sm * pooled + b_sm``.
    """
    dy.renew_cg()
    # Pad a copy. The previous `wids += [...]` extended the caller's list in
    # place, so padding leaked back into the data passed in.
    padded = list(wids) + [0] * max(0, WIN_SIZE - len(wids))
    cnn_in = dy.concatenate([dy.lookup(W_emb, wid) for wid in padded], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    # Max over dimension 1, then flatten to a FILTER_SIZE vector and apply ReLU
    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)
    return W_sm * pool_out + b_sm
def __call__(self, query, key, value, mask=None, train=False):
    """Project the inputs, attend per head, and project the merged result.

    Input: ((H, T), B)
    Output: ((H, T), B)

    Dropout (``self.pdrop``) is passed to the attention function only when
    `train` is true.
    """
    _, batchsz = query.dim()

    def split_heads(projected):
        # Reshape to (d_k, h, T), then swap dimensions 1 and 2.
        steps = projected.dim()[0][1]
        per_head = dy.reshape(projected, (self.d_k, self.h, steps), batch_size=batchsz)
        return transpose(per_head, 1, 2)

    query = split_heads(self.p_Q(query))
    key = split_heads(self.p_K(key))
    value = split_heads(self.p_V(value))

    pdrop = self.pdrop if train else None
    attended = self.attn(query, key, value, mask=mask, dropout=pdrop)

    # Swap the dimensions back and merge the heads into one feature axis.
    attended = transpose(attended, 1, 2)
    steps = attended.dim()[0][2]
    merged = dy.reshape(attended, (self.h * self.d_k, steps), batch_size=batchsz)
    return self.p_O(merged)
def evaluate(self, inputs, train=False):
    """
    Apply all MLP layers to concatenated input
    :param inputs: (key, vector) per feature type
    :param train: are we training now? (enables dropout when self.dropout is set)
    :return: output vector of size self.output_dim
    :raises ValueError: if evaluating a layer raises ValueError; the layer number is added
    """
    input_keys, inputs = list(map(list, zip(*list(inputs))))
    if self.input_keys:
        # "Got" reports the keys received by this call, "expected" the keys
        # recorded on the first call (the two used to be printed swapped).
        assert input_keys == self.input_keys, "Got: %s\nBut expected input keys: %s" % (
            self.input_keys_str(input_keys), self.input_keys_str(self.input_keys))
    else:
        self.input_keys = input_keys  # first call: remember the key order
    if self.gated:
        gates = self.params.get("gates")
        if gates is None:
            # FIXME attention weights should not be just parameters, but based on biaffine product?
            gates = self.params["gates"] = self.model.add_parameters(
                (len(inputs), self.gated), init=dy.UniformInitializer(1))
        input_dims = [i.dim()[0][0] for i in inputs]
        max_dim = max(input_dims)
        # Pad with zeros to get uniform dim
        columns = [dy.concatenate([i, dy.zeroes(max_dim - d)]) if d < max_dim else i
                   for i, d in zip(inputs, input_dims)]
        x = dy.concatenate_cols(columns) * gates
        # Possibly multiple "attention heads" -- concatenate outputs to one vector
        inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))]
    x = dy.concatenate(inputs)
    assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
    dim = x.dim()[0][0]
    if self.input_dim:
        assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim)
    else:
        self.init_params(dim)  # parameters are created once the input size is known
    self.config.print(self, level=4)
    if self.total_layers:
        if self.weights is None:
            # Collect the [W, b] pair of every layer from the parameter dict
            self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")]
                            for i in range(self.total_layers)]
            if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                # W0 is narrower than the input: append the extra columns stored as "W0+"
                self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]])
        for i, (W, b) in enumerate(self.weights):
            self.config.print(lambda: x.npvalue().tolist(), level=4)
            try:
                if train and self.dropout:
                    x = dy.dropout(x, self.dropout)
                x = self.activation()(W * x + b)
            except ValueError as e:
                raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e
    self.config.print(lambda: x.npvalue().tolist(), level=4)
    return x
def BuildLMGraph(self, sents):
    """Build the batched language-model loss graph for a minibatch.

    :param sents: list of token sequences whose tokens are keys of
        ``vocab.w2i``. Positions are taken from ``len(sents[0])`` and the
        masking shortcut looks only at the last sentence, so presumably the
        batch is sorted longest first -- TODO confirm against the caller.
    :return: ``(loss, tot_chars)`` where ``loss`` is the loss expression
        summed over steps and batch elements, and ``tot_chars`` is the number
        of non-padding tokens counted by the masks.
    """
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)
    S = vocab.w2i["<s>"]  # used as the padding id for short sentences

    # get the cids and masks for each step
    tot_chars = 0
    cids = []
    masks = []
    for i in range(len(sents[0])):
        cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_chars += sum(mask)

    # start the rnn with the first token of every sentence
    init_ids = cids[0]
    # Qualified with `dy.` like every other DyNet call in this method; the
    # bare name only resolves if the module also star-imports dynet.
    s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

    losses = []
    # feed char vectors into the RNN and predict the next char
    for cid, mask in zip(cids[1:], masks[1:]):
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, cid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        cemb = dy.lookup_batch(self.lookup, cid)
        s = s.add_input(cemb)
    return dy.sum_batches(dy.esum(losses)), tot_chars
def __call__(self, inputs, dropout=False):
    """Run the two-stage CNN on `inputs` and return unnormalised scores.

    The input tensor goes through conv -> max-pool -> ReLU twice, is
    reshaped to a ``(7*7*64, 1)`` column, and then through a ReLU hidden
    layer and a final linear layer.

    :param inputs: array-like accepted by ``dy.inputTensor``.
    :param dropout: if true, apply dropout with ``DROPOUT_RATE`` to the
        hidden layer.
    :return: the expression ``w2 * h``; softmax is not applied here.
    """
    feats = dy.inputTensor(inputs)
    for p_kernel, p_bias in ((self.pConv1, self.pB1), (self.pConv2, self.pB2)):
        kernel = dy.parameter(p_kernel)
        bias = dy.parameter(p_bias)
        feats = dy.conv2d_bias(feats, kernel, bias, [1, 1], is_valid=False)
        feats = dy.rectify(dy.maxpooling2d(feats, [2, 2], [2, 2]))
    feats = dy.reshape(feats, (7*7*64, 1))

    hidden_w = dy.parameter(self.pW1)
    hidden_b = dy.parameter(self.pB3)
    hidden = dy.rectify(hidden_w*feats+hidden_b)
    if dropout:
        hidden = dy.dropout(hidden, DROPOUT_RATE)

    out_w = dy.parameter(self.pW2)
    return out_w*hidden
def calc_lm_loss(sents):
    """Build the batched language-model loss for a minibatch of sentences.

    :param sents: list of word-id sequences. Positions are taken from
        ``len(sents[0])`` and the masking shortcut inspects only the last
        sentence, so presumably the batch is sorted longest first -- TODO
        confirm against the caller.
    :return: ``(loss, tot_words)`` where ``loss`` is the loss expression
        summed over steps and batch elements, and ``tot_words`` is the number
        of non-padding words counted by the masks.
    """
    dy.renew_cg()
    # Initialise the RNN
    f_init = RNN.initial_state()

    # Per-step word ids (padded with S) and 0/1 masks marking real words
    wids, masks = [], []
    for pos in range(len(sents[0])):
        wids.append([sent[pos] if len(sent) > pos else S for sent in sents])
        masks.append([1 if len(sent) > pos else 0 for sent in sents])
    tot_words = sum(sum(step_mask) for step_mask in masks)

    # Start every sentence from "<s>"
    state = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, [S] * len(sents)))

    losses = []
    for wid, mask in zip(wids, masks):
        # Predict the word at this step from the current state
        score = dy.affine_transform([b_exp, W_exp, state.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # Mask only when the last sentence has already ended at this step
        if mask[-1] != 1:
            mask_expr = dy.reshape(dy.inputVector(mask), (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # Advance the RNN with the word just scored
        state = state.add_input(dy.lookup_batch(WORDS_LOOKUP, wid))
    return dy.sum_batches(dy.esum(losses)), tot_words
def mot_pool(x, strides=(1, 1, 1, 1)):
    """Max-pool `x` over its second dimension and return a flat vector.

    dy.max_dim(x, d=0) is currently slow
    (see https://github.com/clab/dynet/issues/1011), so the max is taken
    with a max-pooling window that spans the whole second dimension.

    :param x: expression with three dimensions; the second is pooled over
        and the third gives the length of the result.
    :param strides: strides passed through to ``dy.maxpooling2d``.
    :return: the pooled expression reshaped to ``(cmotsz,)``.
    """
    shape, _ = x.dim()
    _, seq_len, cmotsz = shape
    window = [1, seq_len, 1]
    return dy.reshape(dy.maxpooling2d(x, window, strides), (cmotsz,))
def calc_loss(sents):
    """Compute the masked, summed loss of the attentional decoder for a minibatch.

    :param sents: list of ``(source, target)`` pairs of word-id sequences.
        Every source is indexed at each position up to the longest source,
        so presumably all sources in a batch share one length -- TODO confirm
        against the batching code.
    :return: ``(loss, num_words)`` where ``loss`` is the loss expression
        summed over time steps and batch elements, and ``num_words`` is the
        number of non-padding target tokens counted by the masks.
    """
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [pair[0] for pair in sents]
    tgt_sents = [pair[1] for pair in sents]

    max_src_len = max(len(sent) for sent in src_sents)
    src_cws = [[sent[i] for sent in src_sents] for i in range(max_src_len)]

    # Get the outputs of the first LSTM: each step yields a pair of states
    # whose outputs are concatenated
    src_outputs = [
        dy.concatenate([x.output(), y.output()])
        for x, y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])
    ]
    src_output = src_outputs[-1]

    # Source-side part of the attention, computed once and reused at every step
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # Decoder: pad short targets with eos_trg and remember which slots are real.
    # The length must come from the *target* sentences; measuring the
    # (source, target) pairs in `sents` gives the pair length for every
    # element and cuts decoding off after the first steps.
    max_tgt_len = max(len(sent) for sent in tgt_sents)
    tgt_cws = []
    masks = []
    num_words = 0
    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    all_losses = []
    # NOTE(review): each step is masked with the mask of the *previous*
    # position (masks[i] pairs with tgt_cws[i + 1]); confirm this is intended.
    for next_words, mask in zip(tgt_cws[1:], masks):
        # Feed the previous words into the decoder
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        # Only the first value returned by calc_attention is used here
        att_output, _ = calc_attention(src_output_matrix, output_embedding,
                                       fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform(
            [b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        # One scalar mask value per batch element
        mask_expr = dy.reshape(dy.inputVector(mask), (1,), len(sents))
        all_losses.append(loss * mask_expr)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words