def build_graph(self, x):
    conv_W_1 = dy.parameter(self.params['conv_W_1'])
    conv_b_1 = dy.parameter(self.params['conv_b_1'])
    conv_W_2 = dy.parameter(self.params['conv_W_2'])
    conv_b_2 = dy.parameter(self.params['conv_b_2'])
    conv_W_3 = dy.parameter(self.params['conv_W_3'])
    conv_b_3 = dy.parameter(self.params['conv_b_3'])
    W = dy.parameter(self.params['W'])
    b = dy.parameter(self.params['b'])

    (n, d), _ = x.dim()
    x = dy.reshape(x, (1, n, d))

    # 1-D convolutional network (implemented as conv2d over a (1, n, d) input)
    conv_1 = dy.tanh(dy.conv2d_bias(x, conv_W_1, conv_b_1, (1, 1), is_valid=False))
    conv_2 = dy.tanh(dy.conv2d_bias(x, conv_W_2, conv_b_2, (1, 1), is_valid=False))
    conv_3 = dy.tanh(dy.conv2d_bias(x, conv_W_3, conv_b_3, (1, 1), is_valid=False))

    pool_1 = dy.max_dim(dy.reshape(conv_1, (n, self.options['channel_1'])))
    pool_2 = dy.max_dim(dy.reshape(conv_2, (n, self.options['channel_2'])))
    pool_3 = dy.max_dim(dy.reshape(conv_3, (n, self.options['channel_3'])))

    # Fully connected classification layer
    pool = dy.concatenate([pool_1, pool_2, pool_3], 0)
    logit = dy.dot_product(pool, W) + b
    return logit

def transduce(self, encodings):
    inp = encodings
    dim = inp.dim()
    if dim[0][1] < self.ngram_size:
        pad = dy.zeros((self.embed_dim, self.ngram_size - dim[0][1]))
        inp = dy.concatenate([inp, pad], d=1)
        dim = inp.dim()
    inp = dy.reshape(inp, (1, dim[0][1], dim[0][0]))
    encodings = dy.rectify(dy.conv2d_bias(inp, dy.parameter(self.filter),
                                          dy.parameter(self.bias),
                                          stride=(1, 1), is_valid=True))
    return dy.max_dim(dy.max_dim(encodings, d=1), d=0)

def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    src = src.as_tensor()

    src_height = src.dim()[0][0]
    src_width = src.dim()[0][1]
    # src_channels = 1
    batch_size = src.dim()[1]

    # convolution and pooling layers
    # src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0] + 3)
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1),
                              stride=[self.stride[0], self.stride[0]],
                              is_valid=True))  # ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, (1, 4), (1, 2), is_valid=True)  # ((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1] + 3)
    l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2),
                              stride=[self.stride[1], self.stride[1]],
                              is_valid=True))  # ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, (1, 4), (1, 2), is_valid=True)  # ((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3),
                              stride=[self.stride[2], self.stride[2]],
                              is_valid=True))  # ((1, 248, 1024), 128)
    pool3 = dy.max_dim(l3, d=1)

    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3, my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)

    return ExpressionSequence(expr_tensor=output)

def calc_scores(words):
    dy.renew_cg()
    W_cnn_express = dy.parameter(W_cnn)
    b_cnn_express = dy.parameter(b_cnn)
    W_sm_express = dy.parameter(W_sm)
    b_sm_express = dy.parameter(b_sm)
    Waux_sm_express = dy.parameter(Waux_sm)
    baux_sm_express = dy.parameter(baux_sm)

    # The window size tells you how many words we're 'looking at' at each step.
    # Here, 1 unit is 1 word. If a sample has fewer words than the window size,
    # pad with index 0 (so we're treating the pad words as UNK (?)).
    if len(words) < WIN_SIZE:
        words += [0] * (WIN_SIZE - len(words))

    # Convolution + pooling layer
    cnn_in = dy.concatenate([W_emb[x] for x in words], d=1)  # concat repr of all words
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False)
    pool_out = dy.max_dim(cnn_out, d=1)  # max pooling over the sequence dimension
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)  # ReLU activation

    # get scores for either task
    scores_main = W_sm_express * pool_out + b_sm_express
    scores_aux = Waux_sm_express * pool_out + baux_sm_express
    return scores_main, scores_aux

def _build_computation_graph(self, words, train_mode=True):
    """
    Builds the computational graph.
    """
    dy.renew_cg()
    # turn parameters into expressions
    softmax_weight_exp = dy.parameter(self.softmax_weight)
    softmax_bias_exp = dy.parameter(self.softmax_bias)

    word_reps = [self._word_rep(word) for word in words]
    embs = dy.concatenate(word_reps, d=1)

    if self.pooling_method == "average":
        average_emb = dy.mean_dim(embs, d=1)
    elif self.pooling_method == "max":
        average_emb = dy.max_dim(embs, d=1)
    else:
        raise NotImplementedError

    average_emb = dy.reshape(average_emb, (self.word_embedding_size,))
    if self.average_dropout is not None:
        # dy.dropout returns a new expression, so assign the result
        average_emb = dy.dropout(average_emb, p=self.average_dropout)

    return softmax_weight_exp * average_emb + softmax_bias_exp

def transduce(self, embeds):
    expr_seq = []
    seq_len = embeds.dim()[0][1]
    for i in range(seq_len):
        expr_seq.append(dy.max_dim(dy.select_cols(embeds, [i]), 1))
    encodings = self.seq_transducer.transduce(ExpressionSequence(expr_seq))
    return self.seq_transducer.get_final_states()[-1].main_expr()

def decode(self, emissions):
    """Viterbi decode to find the best sequence.

    :param emissions: List[dy.Expression]

    Returns:
        List[int], dy.Expression ((1,), B)
    """
    if self.add_ends:
        emissions = CRF._prep_input(emissions)
    backpointers = []
    transitions = self.transitions

    inits = [-1e4] * self.n_tags
    inits[self.start_idx] = 0
    alphas = dy.inputVector(inits)

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transitions), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transitions, self.end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score

def calculateScores(self, instance, vectors, network, scores, isTraining):
    dReprCache = {}
    for dId in range(-1, len(instance.sentence)):
        depReprs = self.__featReprBuilder.extractAndBuildFeatRepr(
            gfeatures.FeatId.DEP, dId, instance.sentence, vectors, isTraining)
        depRepr = dynet.esum(depReprs) if len(depReprs) > 0 else None
        dReprCache[dId] = (depRepr, len(depReprs))

    for hId in range(-1, len(instance.sentence)):
        headReprs = self.__featReprBuilder.extractAndBuildFeatRepr(
            gfeatures.FeatId.HEAD, hId, instance.sentence, vectors, isTraining)
        headRepr = dynet.esum(headReprs) if len(headReprs) > 0 else None

        for dId in range(-1, len(instance.sentence)):
            depRepr, depNr = dReprCache[dId]
            distRepr = self.__featReprBuilder.onlyBuildFeatRepr(
                gfeatures.FeatId.DIST, (hId, dId), isTraining)

            featRepr = [headRepr, depRepr]
            featReprNr = len(headReprs) + depNr
            if distRepr is not None:
                featRepr.append(distRepr)
                featReprNr += 1

            assert featReprNr == self.__featReprBuilder.getNrOfFeatures()
            featRepr = dynet.esum([f for f in featRepr if f is not None])
            netOut = network.buildOutput(featRepr, isTraining=isTraining)

            scores.addOutput(hId, dId, netOut)
            scores.addScore(hId, dId, dynet.max_dim(netOut).scalar_value())

def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    _ = best_path.pop()
    best_path.reverse()
    return best_path, path_score

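# Minimal driver sketch for the standalone viterbi() helper above. The tag
# count, sequence length, and start/end tag indices are made up for
# illustration; `transition` is assumed to be an (n_tags, n_tags) score matrix
# in the orientation the helper expects.
import numpy as np
import dynet as dy

n_tags, seq_len = 5, 7
dy.renew_cg()
transition = dy.inputTensor(np.random.randn(n_tags, n_tags))
emissions = [dy.inputTensor(np.random.randn(n_tags)) for _ in range(seq_len)]
best_path, path_score = viterbi(emissions, transition, start_idx=3, end_idx=4)
print(best_path, path_score.value())
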
def calc_predict_and_activations(wids, tag, words):
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    activations = filters.argmax(axis=0)

    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)

    scores = (W_sm * pool_out + b_sm).npvalue()
    print('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print(display_activations(words, activations))
    print('scores=%s, predict: %d' % (scores, predict))

    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print(' bias=%s' % bias)
    contributions = W * features
    print(' very bad (%.4f): %s' % (scores[0], contributions[0]))
    print('      bad (%.4f): %s' % (scores[1], contributions[1]))
    print('  neutral (%.4f): %s' % (scores[2], contributions[2]))
    print('     good (%.4f): %s' % (scores[3], contributions[3]))
    print('very good (%.4f): %s' % (scores[4], contributions[4]))

def softmax(x):
    """
    Compute the softmax function in tensorflow.

    You might find the tensorflow functions tf.exp, tf.reduce_max, tf.reduce_sum,
    tf.expand_dims useful. (Many solutions are possible, so you may not need to
    use all of these functions). Recall also that many common tensorflow
    operations are sugared (e.g. x * y does a tensor multiplication if x and y
    are both tensors). Make sure to implement the numerical stability fixes as
    in the previous homework!

    Args:
        x: tf.Tensor with shape (n_samples, n_features). Note feature vectors are
           represented by row-vectors. (For simplicity, no need to handle 1-d
           input as in the previous homework)
    Returns:
        out: tf.Tensor with shape (n_sample, n_features). You need to construct
             this tensor in this problem.
    """
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    sum_exp = dy.colwise_add(dy.zeroes(x.dim()[0]), dy.sum_cols(x_exp))
    out = dy.cdiv(x_exp, sum_exp)
    ### END YOUR CODE
    return out

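# Quick sanity check of the DyNet softmax above against numpy, using the
# row-vector-sample convention from the docstring; the 3x4 input is made up
# for illustration.
import numpy as np
import dynet as dy

dy.renew_cg()
m = np.random.randn(3, 4)
out = softmax(dy.inputTensor(m)).npvalue()
ref = np.exp(m - m.max(axis=1, keepdims=True))
ref /= ref.sum(axis=1, keepdims=True)
print(np.allclose(out, ref))
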
def compute_loss(model, prev_state, current_state, action, reward, step_num):
    q = dy.pick(model.forward(prev_state), action)
    v = dy.max_dim(model.forward(current_state))
    expval = v * math.pow(GAMMA, step_num) + reward
    loss = q - expval
    return loss

def exprseq_pooling(self, exprseq):
    # Reduce to vector
    if exprseq.expr_tensor is not None:
        if len(exprseq.expr_tensor.dim()[0]) > 1:
            return dy.max_dim(exprseq.expr_tensor, d=1)
        else:
            return exprseq.expr_tensor
    else:
        return dy.emax(exprseq.expr_list)

def exprseq_pooling(self, exprseq):
    # Reduce to vector
    exprseq = ExpressionSequence(
        expr_tensor=exprseq.mask.add_to_tensor_expr(exprseq.as_tensor(), -1e10),
        mask=exprseq.mask)
    if exprseq.expr_tensor is not None:
        if len(exprseq.expr_tensor.dim()[0]) > 1:
            return dy.max_dim(exprseq.expr_tensor, d=1)
        else:
            return exprseq.expr_tensor
    else:
        return dy.emax(exprseq.expr_list)

def softmax(x):
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    x_sum = dy.sum_cols(x_exp)
    x_tmp = dy.zeroes(x.dim()[0])
    x_tmp = dy.colwise_add(x_tmp, x_sum)
    out = dy.cdiv(x_exp, x_tmp)
    ### END YOUR CODE
    return out

def calc_scores(wids):
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE - len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)
    return W_sm * pool_out + b_sm

def conv(input_):
    """Perform the 1D conv.

    :param input: dy.Expression ((1, T, dsz), B)

    Returns:
        dy.Expression ((cmotsz,), B)
    """
    c = dy.conv2d_bias(input_, weight, bias, strides, is_valid=False)
    activation = dy.rectify(c)
    mot = dy.reshape(dy.max_dim(activation, 1), (cmotsz,))
    return mot

def calc_loss(self, scores, axis, true, importance):
    ret = [i * dy.pickneglogsoftmax(scores, t) for t, i in zip(true, importance)]
    if self.loss == "max_margin":
        ret.append(dy.max_dim(dy.log_softmax(
            scores,
            restrict=list(set(range(self.num_labels[axis])) - set(true)))))
    return ret

def optimize(self, environment, prev_pos, action, next_pos, reward):
    # Get Q(s_t, a_t): prediction for the action taken in the environment at
    # the previous position
    q = dy.pick(self.forward(environment, prev_pos), action)
    # V: max of Q at the next state
    v = dy.max_dim(self.forward(environment, next_pos))
    expval = v * GAMMA + reward
    loss = q - expval
    loss.backward()
    self.trainer.update()

def log_sum_exp_dim_0(x):
    # numerically stable log_sum_exp
    dims = x.dim()
    max_score = dy.max_dim(x, 0)  # (dim_1, batch_size)
    if len(dims[0]) == 1:
        max_score_extend = max_score
    else:
        max_score_reshape = dy.reshape(max_score, (1, dims[0][1]), batch_size=dims[1])
        max_score_extend = dy.concatenate([max_score_reshape] * dims[0][0])

    x = x - max_score_extend
    exp_x = dy.exp(x)
    # (dim_1, batch_size); if there is no dim_1, returns ((1,), batch_size)
    log_sum_exp_x = dy.log(dy.mean_dim(exp_x, d=[0], b=False) * dims[0][0])
    return log_sum_exp_x + max_score

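# Usage sketch for log_sum_exp_dim_0 above, checked against a plain numpy
# computation; the 4x3 input is made up for illustration.
import numpy as np
import dynet as dy

dy.renew_cg()
m = np.random.randn(4, 3)
lse = log_sum_exp_dim_0(dy.inputTensor(m))   # reduces over dim 0
print(np.allclose(lse.npvalue(), np.log(np.exp(m).sum(axis=0))))
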
def encode(self, word, training=False):
    W_cnn = dy.parameter(self.W_cnn)
    b_cnn = dy.parameter(self.b_cnn)
    embs = dy.concatenate([dy.lookup(self.char_embeds, x) for x in word[:45]], d=1)
    if self.dropout > 0 and training:
        embs = dy.dropout(embs, self.dropout)
    # maybe change this? diagram shows padding
    cnn_out = dy.conv2d_bias(embs, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    max_pool = dy.max_dim(cnn_out, d=1)
    rep = dy.reshape(dy.tanh(max_pool), (self.filter_size,))
    return rep

def compose(self, embeds):
    if type(embeds) != list:
        embeds = [dy.pick_batch_elem(embeds, i) for i in range(embeds.dim()[1])]

    if len(embeds) < self.ngram_size:
        embeds.extend([dy.zeros(self.embed_dim)] * (self.ngram_size - len(embeds)))

    embeds = dy.transpose(dy.concatenate([dy.concatenate_cols(embeds)], d=2), [2, 1, 0])
    embeds = dy.conv2d_bias(embeds, self.filter, self.bias, (self.embed_dim, 1))
    embeds = dy.max_dim(dy.pick(embeds, index=0), d=0)

    return self.transform.transform(embeds)

def _build_computation_graph(self, words, train_mode=True):
    """
    Builds the computational graph.
    """
    dy.renew_cg()
    # turn parameters into expressions
    softmax_weight_exp = dy.parameter(self.softmax_weight)
    softmax_bias_exp = dy.parameter(self.softmax_bias)

    # initialize the RNNs
    f_init = self.fwd_word_rnn.initial_state()
    b_init = self.bwd_word_rnn.initial_state()
    # cf_init = self.fwd_char_rnn.initial_state()
    # cb_init = self.bwd_char_rnn.initial_state()

    # only use word-level for now
    word_reps = [self._word_rep(word) for word in words]
    if train_mode and self.add_word_noise:
        word_reps = [dy.noise(word_rep, 0.05) for word_rep in word_reps]

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(word_reps)
    bw_exps = b_init.transduce(reversed(word_reps))

    if self.pooling_method == "last":
        average_lstm = dy.concatenate([fw_exps[-1], bw_exps[-1]])
    else:
        bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]
        bi_exps = dy.concatenate(bi_exps, d=1)
        if self.pooling_method == "average":
            average_lstm = dy.mean_dim(bi_exps, d=1)
        elif self.pooling_method == "max":
            average_lstm = dy.max_dim(bi_exps, d=1)
        else:
            raise NotImplementedError

    if self.average_dropout is not None:
        average_lstm = dy.dropout(average_lstm, p=self.average_dropout)

    return softmax_weight_exp * average_lstm + softmax_bias_exp

def _build_tagging_graph(self, words, train_mode=True):
    """
    Builds the computational graph.
    Model similar to http://aclweb.org/anthology/D/D14/D14-1181.pdf.
    """
    dy.renew_cg()
    # turn parameters into expressions
    mlp_output = dy.parameter(self.pO)
    W_cnn_expressions = []
    b_cnn_expressions = []
    for W_cnn, b_cnn in zip(self.W_cnns, self.b_cnns):
        W_cnn_expressions.append(dy.parameter(W_cnn))
        b_cnn_expressions.append(dy.parameter(b_cnn))

    if len(words) < self._cnn_window_size:
        pad_char = "<*>"
        words += [pad_char] * (self._cnn_window_size - len(words))

    if self._char_level:
        cnn_in = dy.concatenate(self._chars_rep(words), d=1)
    else:
        word_reps = [self._word_rep(word) for word in words]
        cnn_in = dy.concatenate(word_reps, d=1)

    pools_out = []
    for W_cnn_express, b_cnn_express in zip(W_cnn_expressions, b_cnn_expressions):
        cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express,
                                 stride=(1, 1), is_valid=False)
        # max-pooling
        pool_out = dy.max_dim(cnn_out, d=1)
        pool_out = dy.reshape(pool_out, (self._cnn_filter_size,))
        pools_out.append(pool_out)

    pools_concat = dy.concatenate(pools_out)
    return mlp_output * pools_concat

def loss_upper_bound(gold_tags, idx, beam_costs_prev, scores, beam_size):
    beam_size_prev, num_tags = scores.dim()[0]
    next_beam_size = beam_size if idx < len(gold_tags) - 1 else 1

    scores_flat = dy.reshape(scores, (beam_size_prev * num_tags,))
    costs_flat = dynet_compute_costs_flat(gold_tags, idx, beam_costs_prev)

    sigma_star = np.argsort(costs_flat)
    gold_idx = sigma_star[0]
    scores_flat_np = scores_flat.npvalue()
    sigma_hat = np.argsort(scores_flat_np)[::-1]

    scores_delta = scores_flat - scores_flat[gold_idx] + 1.0
    costs_delta = costs_flat - costs_flat[gold_idx]
    # mask those that are inside the beam.
    costs_delta[sigma_star[:next_beam_size]] = 0.0
    deltas = dy.cmult(dy.inputTensor(costs_delta), scores_delta)
    return dy.max_dim(deltas)

def on_calc_additional_loss(self, *args, **kwargs):
    seq_len = len(self.last_output)

    loss_expr = 0
    for pos_i in range(seq_len):
        input_i = self.last_output[pos_i]
        affine = self.linear_layer(input_i)
        softmax_out = dy.softmax(affine)
        if self.mode == "entropy":
            loss_expr = loss_expr - dy.sum_dim(dy.cmult(dy.log(softmax_out), softmax_out), d=[0])
        elif self.mode == "max":
            loss_expr = loss_expr - dy.log(dy.max_dim(softmax_out))
        else:
            raise ValueError(f"unknown mode {self.mode}")

    # loss_expr = loss_expr * (self.scale / seq_len)
    loss_expr = loss_expr * self.scale
    return losses.FactoredLossExpr({"enc_entropy": loss_expr})

def __buildErrorOutputs(self, scores, correctTree, predictedTree):
    result = []
    for tPos in range(correctTree.nrOfTokens()):
        corrHead = correctTree.getHead(tPos)
        predHead = predictedTree.getHead(tPos)

        corrOutputs = scores.getOutput(corrHead, tPos)
        predOutputs = scores.getOutput(predHead, tPos)

        corrLblId = self.__lblDict.getLblId(correctTree.getLabel(tPos))
        predLblId = self.__lblDict.getLblId(predictedTree.getLabel(tPos))

        ### tree errors
        if corrHead != predHead:
            result.append((predOutputs[predLblId], dynet.max_dim(corrOutputs)))

        ### lbl errors
        worstLblId = max((scr, lId) for (lId, scr) in enumerate(corrOutputs.value())
                         if lId != corrLblId)[1]
        result.append((corrOutputs[worstLblId], corrOutputs[corrLblId]))

    return result

def run_classifier(self, common_top_recur, word_inputs, domain_flag):
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]

    cnn_filter = []
    for filt in self.filter:
        cnn_filter.append(dy.parameter(filt))
    cnn_W = dy.parameter(self.class_W)

    cnn_input = dy.reshape(common_top_recur, (1, seq_len, 2 * self.lstm_hiddens), batch_size)
    # print(cnn_input.npvalue().shape)
    cnn_out_list = []
    for i in range(len(cnn_filter)):
        cnn_out = dy.conv2d(cnn_input, cnn_filter[i], [1, 1], is_valid=False)  # len*batch*filter_num
        # print(cnn_out.npvalue().shape)
        pool_out = dy.max_dim(cnn_out, d=1)
        # print(pool_out.npvalue().shape)
        pool_out = dy.reshape(pool_out, (self.filter_size,), batch_size)
        # print(pool_out.npvalue().shape)
        pool_out = dy.rectify(pool_out)
        cnn_out_list.append(pool_out)

    final_out = dy.concatenate(cnn_out_list)
    result = cnn_W * final_out

    predict = np.argmax(result.npvalue(), axis=0)
    # print(predict)
    cor = 0.
    for pre in predict:
        if int(pre) == domain_flag:
            cor += 1
    class_accurate = cor / batch_size

    target = [domain_flag] * batch_size  # [0,0,0,0]
    # print(result.npvalue().shape, np.array(target).shape)
    classes_loss = dy.pickneglogsoftmax_batch(result, target)
    class_loss = dy.sum_batches(classes_loss) / batch_size
    # print(class_loss.npvalue().shape)
    return class_loss, class_accurate

def calc_scores(words):
    dy.renew_cg()
    W_cnn_express = dy.parameter(W_cnn)
    b_cnn_express = dy.parameter(b_cnn)
    W_sm_express = dy.parameter(W_sm)
    b_sm_express = dy.parameter(b_sm)

    # The window size tells you how many words we're 'looking at' at each step.
    # Here, 1 unit is 1 word. If a sample has fewer words than the window size,
    # pad with index 0 (so we're treating the pad words as UNK (?)).
    if len(words) < WIN_SIZE:
        words += [0] * (WIN_SIZE - len(words))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in words], d=1)  # concat repr of all words
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn_express, b_cnn_express, stride=(1, 1), is_valid=False)
    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)
    return W_sm_express * pool_out + b_sm_express

def attend(self, context, x):
    context_cols = dy.concatenate_cols(context)
    context_emb = dy.max_dim(context_cols, 1)
    return context_emb, None

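# The pooling that attend() above relies on can be exercised on its own:
# max_dim over the column-concatenated context gives an element-wise max
# across the context vectors. Sizes below are made up for illustration.
import numpy as np
import dynet as dy

dy.renew_cg()
context = [dy.inputTensor(np.random.randn(8)) for _ in range(5)]  # 5 context vectors of size 8
pooled = dy.max_dim(dy.concatenate_cols(context), 1)              # element-wise max, shape (8,)
print(pooled.npvalue().shape)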