def score(self, features, axis):
    """
    Return log-softmax scores for the labels of one axis.

    :param features: extracted feature values, of size input_size
    :param axis: axis of the label we are predicting
    :return: numpy array with one score per label; all zeros when no update
             has been done yet or the axis has at most one label
    """
    super().score(features, axis)
    label_count = self.num_labels[axis]
    if not (self.updates > 0 and label_count > 1):
        self.config.print(" no updates done yet, returning zero vector.", level=4)
        return np.zeros(label_count)

    allowed = list(range(label_count))
    if dynet_config.gpu():
        # RestrictedLogSoftmax is not implemented for GPU: compute it on CPU ...
        on_cpu = dy.to_device(self.evaluate(features, axis), 'CPU')
        # ... then move the result back ('' selects the default device).
        log_probs = dy.to_device(dy.log_softmax(on_cpu, restrict=allowed), '')
    else:
        log_probs = dy.log_softmax(self.evaluate(features, axis), restrict=allowed)
    return log_probs.npvalue()[:label_count]
def rec(index):
    """
    Recursively encode the subtree rooted at ``index`` and accumulate its loss.

    A node whose entry in ``words`` is -1 is a branch node; any other value is
    used to look up the word embedding of a leaf.

    :param index: node index into ``words`` / ``lchs`` / ``rchs`` / ``scores``
    :return: tuple ``(loss, hidden)`` for the subtree
    """
    gold = int(scores[index])
    if words[index] != -1:
        # Leaf: plain tanh layer over the word embedding.
        # (The LSTM-style gates of the original are disabled.)
        leaf_input = dy.inputTensor(word_embedding[words[index]])
        leaf_hidden = dy.tanh(Wu * leaf_input + bu)
        leaf_log_probs = dy.log_softmax(Why * leaf_hidden + by)
        return (-leaf_log_probs[gold], leaf_hidden)

    # Branch: encode the left child, then the right child, then compose them.
    left_loss, left_hidden = rec(lchs[index])
    right_loss, right_hidden = rec(rchs[index])
    node_hidden = dy.tanh(U0u * left_hidden + U1u * right_hidden + bbu)
    node_log_probs = dy.log_softmax(Why * node_hidden + by)
    node_loss = left_loss + right_loss - node_log_probs[gold]
    return (node_loss, node_hidden)
def compute_logits(self, input):
    """
    Apply the type / beaker-from / beaker-to / amount heads to ``input``.

    Despite the name, each returned value is the output of ``log_softmax``.

    :param input: expression multiplied by each head's weight matrix
    :return: tuple (type, beaker_from, beaker_to, amount)
    """
    head_params = (self.p_W_type, self.p_W_beaker_from,
                   self.p_W_beaker_to, self.p_W_amount)
    weights = [dy.parameter(p) for p in head_params]
    return tuple(dy.log_softmax(w * input) for w in weights)
def compute_logits(self, input):
    """
    Apply the type / first-index / second-index / shape heads to ``input``.

    Despite the name, each returned value is the output of ``log_softmax``.

    :param input: expression multiplied by each head's weight matrix
    :return: tuple (type, first_ix, second_ix, shape)
    """
    head_params = (self.p_W_type, self.p_W_first_ix,
                   self.p_W_second_ix, self.p_W_shape)
    weights = [dy.parameter(p) for p in head_params]
    return tuple(dy.log_softmax(w * input) for w in weights)
def compute_logits(self, input):
    """
    Apply the type / from / to / shirt heads to ``input``.

    Despite the name, each returned value is the output of ``log_softmax``.
    The "hat" head (``p_W_hat``) is currently disabled.

    :param input: expression multiplied by each head's weight matrix
    :return: tuple (type, from, to, shirt)
    """
    head_params = (self.p_W_type, self.p_W_from, self.p_W_to, self.p_W_shirt)
    weights = [dy.parameter(p) for p in head_params]
    return tuple(dy.log_softmax(w * input) for w in weights)
def compute_logits(self, input):
    """
    Split ``input`` into four consecutive chunks and apply one head to each.

    The first three chunks have ``self.p_dim`` elements; the fourth takes
    whatever remains. Despite the name, each returned value is the output of
    ``log_softmax``.

    :param input: expression that is sliced along its first dimension
    :return: tuple (type, first_ix, second_ix, shape)
    """
    W_type = dy.parameter(self.p_W_type)
    W_first_ix = dy.parameter(self.p_W_first_ix)
    W_second_ix = dy.parameter(self.p_W_second_ix)
    W_shape = dy.parameter(self.p_W_shape)

    width = self.p_dim
    type_part = input[:width]
    first_part = input[width:2 * width]
    second_part = input[2 * width:3 * width]
    shape_part = input[3 * width:]

    return (
        dy.log_softmax(W_type * type_part),
        dy.log_softmax(W_first_ix * first_part),
        dy.log_softmax(W_second_ix * second_part),
        dy.log_softmax(W_shape * shape_part),
    )
def compute_logits(self, input):
    """
    Split ``input`` into four consecutive chunks and apply one head to each.

    The chunk width is ``self.input_dim // 4``; the fourth chunk takes whatever
    remains. Despite the name, each returned value is the output of
    ``log_softmax``.

    :param input: expression that is sliced along its first dimension
    :return: tuple (type, beaker_from, beaker_to, amount)
    """
    W_type = dy.parameter(self.p_W_type)
    W_beaker_from = dy.parameter(self.p_W_beaker_from)
    W_beaker_to = dy.parameter(self.p_W_beaker_to)
    W_amount = dy.parameter(self.p_W_amount)

    width = self.input_dim // 4
    type_part = input[:width]
    from_part = input[width:2 * width]
    to_part = input[2 * width:3 * width]
    amount_part = input[3 * width:]

    return (
        dy.log_softmax(W_type * type_part),
        dy.log_softmax(W_beaker_from * from_part),
        dy.log_softmax(W_beaker_to * to_part),
        dy.log_softmax(W_amount * amount_part),
    )
def __call__(self, sent1, sent2):
    """
    Score a sentence pair: embed, cross-attend, compare, aggregate, classify.

    :param sent1: first sentence, passed to ``apply_linear_embed``
    :param sent2: second sentence, passed to ``apply_linear_embed``
    :return: expression holding the log-softmax over the output classes
             (the original note says 3 elements -- confirm against
             ``linear_final``)
    """
    emb1, emb2 = self.apply_linear_embed(sent1, sent2)

    # Attend: pairwise scores and their transpose give the two soft alignments.
    att1, att2 = self.apply_f(emb1, emb2)
    pair_scores = att1 * dy.transpose(att2)
    align_1to2 = dy.softmax(pair_scores)
    align_2to1 = dy.softmax(dy.transpose(pair_scores))

    # Compare: each sentence next to its aligned counterpart.
    combined1 = dy.concatenate_cols([emb1, align_1to2 * emb2])
    combined2 = dy.concatenate_cols([emb2, align_2to1 * emb1])
    cmp1, cmp2 = self.apply_g(combined1, combined2)

    # Aggregate: sum over dimension 0, then join both sentence summaries.
    pooled = dy.concatenate([dy.sum_dim(cmp1, [0]), dy.sum_dim(cmp2, [0])])
    hidden = self.apply_h(dy.transpose(pooled))
    projected = hidden * dy.parameter(self.linear_final)
    return dy.log_softmax(dy.transpose(projected))
def log_softmax_costs(logits, costs=None, valid_actions=None):
    """
    Compute log softmax-margin with arbitrary costs.

    :param logits: score expression, one entry per action
    :param costs: optional per-action costs added to ``logits`` before
                  normalising; typically given without ``valid_actions``
    :param valid_actions: optional indices the softmax is restricted to
    :return: log-softmax expression
    """
    if costs is None:
        return dy.log_softmax(logits, restrict=valid_actions)
    # Each action gets a cost; the higher the overall score the better.
    logits += dy.inputVector(costs)
    return dy.log_softmax(logits, restrict=valid_actions)
def generate(sent):
    """
    Greedily decode a target sentence for ``sent`` with the encoder/decoder LSTMs.

    :param sent: sequence of source word ids (used to index ``LOOKUP_SRC``)
    :return: list of generated target words (via ``i2w_trg``); the
             end-of-sentence token is not included
    """
    dy.renew_cg()

    # Encode: run the source LSTM over the sentence and keep its last output.
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in sent])[-1].output()

    # Seed the decoder from the encoder output.
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    # Generate until the EOS tag is produced or MAX_SENT_SIZE is reached.
    for _ in range(MAX_SENT_SIZE):
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        # Take the argmax of the log-probabilities themselves. Negating them
        # first (as the code used to) selects the LEAST likely word.
        log_probs = dy.log_softmax(s).value()
        next_word = np.argmax(log_probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
def sample_action(self, state, argmax=False, sample_pp=None, predefined_actions=None):
    """
    Choose actions for ``state`` and record them together with the policy.

    :param state: input to ``self.policy_network``
    :param argmax: forwarded to ``sample_from_policy``; when true only a single
                   sample is drawn
    :param sample_pp: optional post-processing callable applied to every sample
    :param predefined_actions: when given, these actions are used instead of
                               sampling
    :return: list of chosen actions
    """
    policy = dy.log_softmax(self.policy_network(state))
    actions = []
    if predefined_actions is None:
        # Sample from the policy.
        for _ in range(self.sample):
            drawn = self.sample_from_policy(policy, argmax=argmax)
            if sample_pp is not None:
                drawn = sample_pp(drawn)
            actions.append(drawn)
            if argmax:
                # Only one sample during argmax.
                break
    else:
        # Use the caller-supplied actions.
        self.sampling_action = self.SamplingAction.PREDEFINED
        actions.extend(predefined_actions)

    # Bookkeeping of the decision that was just made.
    self.policy_lls.append(policy)
    self.actions.append(actions)
    self.states.append(state)
    return actions
def generate(self, context, trg, decorate=False, maxpossible=100):
    """
    Greedily generate output ids with a hand-rolled LSTM-style cell.

    At every step ``context`` is concatenated with the previous cell value, the
    gates are computed, and the argmax of the log-softmax is emitted.
    Generation stops after emitting id 1 (presumably the end-of-sequence id --
    confirm) or after ``maxpossible`` steps. The prediction and ``trg`` are
    printed before returning.

    :param context: expression concatenated with the previous cell each step
    :param trg: reference target; only printed next to the prediction
    :param decorate: when true, the hidden state is stored on ``tree._e``
    :param maxpossible: maximum number of generation steps
    :return: list of generated ids
    """
    # Parameters are step-invariant, so load them once.
    Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
    Uf1 = dy.parameter(self.UFS[0])
    bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
    pre2 = dy.parameter(self.pred)

    prev_out = dy.zeros((self.hdim))
    outputs = []
    for _ in range(maxpossible):
        emb = dy.concatenate([context, prev_out])
        in_gate = dy.logistic(bi + Ui * emb)
        # The output gate uses its own bias `bo` (it used to reuse `bi`).
        out_gate = dy.logistic(bo + Uo * emb)
        forget_gate = dy.logistic(bf + Uf1 * emb)
        update = dy.tanh(bu + Uu * emb)
        c = dy.cmult(in_gate, update) + dy.cmult(forget_gate, prev_out)
        h = dy.cmult(out_gate, dy.tanh(c))
        if decorate:
            # NOTE(review): `tree` is not defined in this function; it must
            # come from an enclosing/global scope -- verify.
            tree._e = h
        prev_out = c
        # Evaluate the expression before taking the argmax in numpy.
        out = np.argmax(dy.log_softmax(pre2 * h).npvalue())
        outputs.append(out)
        if out == 1:
            print(outputs)
            print("-----")
            print(trg)
            return outputs
    print(outputs)
    print("---")
    print(trg)
    return outputs
def decode_to_prediction(self, encoded, max_length):
    """
    Greedily decode a space-separated string from the encoded source states.

    Decoding starts from the embedding of ``self.eos_target`` and stops when
    that id is predicted or after ``max_length`` steps. Ids missing from
    ``self.targetDictionnary`` are rendered as the ``unk_target`` entry.

    :param encoded: list of encoder state expressions (joined column-wise)
    :param max_length: maximum number of decoding steps
    :return: decoded words joined by single spaces
    """
    w = dy.parameter(self.w_softmax)
    b = dy.parameter(self.b_softmax)
    w1 = dy.parameter(self.attention_source)
    encoded_states = dy.concatenate_cols(encoded)
    attentional_component = w1 * encoded_states

    prev_embedding = self.target_lookup[self.eos_target]
    first_input = dy.concatenate([dy.vecInput(self.hidden_size * 2), prev_embedding])
    current_state = self.decoder.initial_state().add_input(first_input)

    tokens = []
    for _ in range(max_length):
        attended = self.attention(encoded_states, current_state, attentional_component)
        current_state = current_state.add_input(dy.concatenate([attended, prev_embedding]))
        s = dy.affine_transform([b, w, current_state.output()])
        next_word = np.argmax(dy.log_softmax(s).value())
        prev_embedding = self.target_lookup[next_word]
        if next_word == self.eos_target:
            break
        if next_word in self.targetDictionnary:
            tokens.append(self.targetDictionnary[next_word])
        else:
            tokens.append(self.targetDictionnary[unk_target])
    return " ".join(tokens)
def sample_segmentation(self, encodings, batch_size):
    """
    Choose segmentation decisions for a batch and return them with the
    per-position log-softmax expressions.

    The sampling strategy (softmax / prior / poisson) is picked from the flags
    computed below; afterwards some decisions are forced to 1.

    :param encodings: iterable of encodings; also provides ``.mask``
    :param batch_size: forwarded to the ``sample_from_*`` helpers
    :return: tuple ``(segment_decisions, segment_logsoftmaxes)``
    """
    lmbd = self.lmbd.value() if self.lmbd is not None else 0
    eps = self.eps.value() if self.eps is not None else None
    segment_logsoftmaxes = [dy.log_softmax(self.segment_transform(fb)) for fb in encodings]
    # Flags
    is_presegment_provided = len(self.src_sent) != 0 and hasattr(self.src_sent[0], "annotation")
    is_warmup = lmbd == 0 or self.is_segmentation_warmup()
    is_epsgreedy_triggered = eps is not None and numpy.random.random() <= eps
    # Sample based on the criterion
    if self.learn_segmentation and not is_warmup and not self.train:
        # During testing always sample from softmax if it is not warmup
        segment_decisions = self.sample_from_softmax(encodings, batch_size, segment_logsoftmaxes)
    elif is_presegment_provided:
        segment_decisions = self.sample_from_prior(encodings, batch_size)
    elif is_warmup or is_epsgreedy_triggered:
        segment_decisions = self.sample_from_poisson(encodings, batch_size)
    else:
        segment_decisions = self.sample_from_softmax(encodings, batch_size, segment_logsoftmaxes)
    # NOTE(review): after the transpose the first axis must match the
    # transposed mask (asserted below) -- presumably (time, batch); confirm.
    segment_decisions = segment_decisions.transpose()
    # The last segment decision of an active components should be equal to 1
    if encodings.mask is not None:
        src = self.src_sent  # NOTE(review): unused local
        # mask[i]: indices that are non-zero in row i of the transposed mask
        mask = [numpy.nonzero(m)[0] for m in encodings.mask.np_arr.transpose()]
        assert len(segment_decisions) == len(mask), \
            "Len(seg)={}, Len(mask)={}".format(len(segment_decisions), len(mask))
        for i in range(len(segment_decisions)):
            if len(mask[i]) != 0:
                # Entries masked at row i get a forced decision at row i-1.
                # NOTE(review): for i == 0 this writes to the last row.
                segment_decisions[i-1][mask[i]] = 1
    # The final row is always set to 1 for every entry.
    segment_decisions[-1][:] = 1
    return segment_decisions, segment_logsoftmaxes
def generate_output(self, decoder, attender, output_embedder, dec_state, src_length=None, forced_trg_ids=None):
    """
    Greedy (or forced) decoding of a single hypothesis.

    Decoding stops once ``Vocab.ES`` has been emitted or ``self.max_len`` words
    have been produced.

    :param decoder: provides ``add_input`` and ``get_scores``
    :param attender: provides ``calc_context`` and ``get_last_attention``
    :param output_embedder: embeds the previously emitted word id
    :param dec_state: initial decoder state
    :param src_length: unused here
    :param forced_trg_ids: when given, these ids are emitted instead of the argmax
    :return: tuple ``(SearchOutput(word_ids, attentions), score)`` where
             ``score`` sums the log-probabilities of the emitted ids
    """
    score = 0.0
    word_ids = []
    attentions = []
    forced = forced_trg_ids is not None
    while len(word_ids) < self.max_len:
        if word_ids and word_ids[-1] == Vocab.ES:
            break
        step = len(word_ids)
        if step > 0:
            # Don't feed in the initial start-of-sentence token.
            prev_id = forced_trg_ids[step - 1] if forced else word_ids[-1]
            dec_state = decoder.add_input(dec_state, output_embedder.embed(prev_id))
        dec_state.context = attender.calc_context(dec_state.rnn_state.output())
        logsoftmax = dy.log_softmax(decoder.get_scores(dec_state)).npvalue()
        cur_id = forced_trg_ids[step] if forced else np.argmax(logsoftmax)
        score += logsoftmax[cur_id]
        word_ids.append(cur_id)
        attentions.append(attender.get_last_attention())
    return SearchOutput(word_ids, attentions), score
def predict(self, batch_dict):
    """
    Decode the best tag sequence for the batch.

    Uses the CRF decoder when ``self.do_crf`` is True, constrained Viterbi when
    a constraint is set, and a per-step argmax otherwise.

    :param batch_dict: raw batch, converted by ``self.make_input``
    :return: list with one array of tag ids per batch row, truncated to the
             row's length
    """
    dy.renew_cg()
    inputs = self.make_input(batch_dict)
    lengths = inputs['lengths']
    unaries = self.compute_unaries(inputs)

    if self.do_crf is True:
        best_path, _ = self.crf.decode(unaries)
    elif self.constraint is not None:
        transitions = dy.log_softmax(dy.inputTensor(self.constraint[1] * -1e4))
        best_path, _ = viterbi(unaries, transitions, Offsets.GO, Offsets.EOS, norm=True)
    else:
        best_path = [np.argmax(x.npvalue(), axis=0) for x in unaries]

    # TODO: RN using autobatching, so none of this is really useful.
    # To support batching here we either loop over the batch or simplify this.
    paths = np.stack(best_path).reshape(-1, 1)  # (T, B)
    paths = paths.transpose(1, 0)
    return [paths[row, :lengths[row]] for row in range(paths.shape[0])]
def predict(self, batch_dict):
    """
    Decode the best tag sequence for the batch.

    Uses the CRF decoder when ``self.do_crf`` is True, constrained Viterbi when
    a constraint is set, and a per-step argmax otherwise.

    :param batch_dict: raw batch, converted by ``self.make_input``
    :return: list with one array of tag ids per batch row, truncated to the
             row's length
    """
    dy.renew_cg()
    inputs = self.make_input(batch_dict)
    lengths = inputs['lengths']
    unaries = self.compute_unaries(inputs)

    if self.do_crf is True:
        best_path, _ = self.crf.decode(unaries)
    elif self.constraint is not None:
        transitions = dy.log_softmax(dy.inputTensor(self.constraint[1] * -1e4))
        best_path, _ = viterbi(unaries, transitions, Offsets.GO, Offsets.EOS, norm=True)
    else:
        best_path = [np.argmax(x.npvalue(), axis=0) for x in unaries]

    # TODO: RN using autobatching, so none of this is really useful.
    # To support batching here we either loop over the batch or simplify this.
    paths = np.stack(best_path).reshape(-1, 1)  # (T, B)
    paths = paths.transpose(1, 0)
    return [paths[row, :lengths[row]] for row in range(paths.shape[0])]
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    """
    Viterbi decoding over DyNet expressions.

    :param emissions: sequence of per-step emission score expressions
    :param transition: transition score expression
    :param start_idx: tag index that receives all initial mass
    :param end_idx: tag index whose transition row is added at the end
    :param norm: when true, the initial scores are passed through log_softmax
    :return: tuple ``(best_path, path_score)``; ``best_path`` is a list of tag
             indices (one per emission), ``path_score`` an expression
    """
    n_tags = emissions[0].dim()[0][0]
    start_scores = [-1e4] * n_tags
    start_scores[start_idx] = 0
    alphas = dy.inputVector(start_scores)
    if norm:
        alphas = dy.log_softmax(alphas)

    backpointers = []
    for emission in emissions:
        candidates = dy.colwise_add(dy.transpose(transition), alphas)
        backpointers.append(np.argmax(candidates.npvalue(), 0))
        alphas = dy.max_dim(candidates, 0) + emission

    terminal = alphas + dy.pick(transition, end_idx)
    tag = np.argmax(terminal.npvalue())
    path_score = dy.pick(terminal, tag)

    # Follow the back-pointers from the final tag to the start.
    best_path = [tag]
    for pointers in reversed(backpointers):
        tag = pointers[tag]
        best_path.append(tag)
    best_path.pop()  # the last entry followed is dropped
    best_path.reverse()
    return best_path, path_score
def train(self, words, lemmas, gold, bad):
    """
    Build the loss for one example: negative summed log-probability of the
    gold candidates under a softmax over all (gold + bad) candidates.

    :param words: forwarded to ``self.extract_feature``
    :param lemmas: forwarded to ``self.extract_feature``
    :param gold: items whose first two entries are (logical form, denotation)
    :param bad: items of the same form that should score low
    :return: loss expression
    """
    dy.renew_cg()
    W = dy.parameter(self.pW)
    b = dy.parameter(self.pb)

    def linear_score(item):
        # Score a single candidate from its feature vector.
        feature = self.extract_feature(words, lemmas, item[0], item[1])
        feature_vec = dy.vecInput(self.nfeatures)
        feature_vec.set(feature)
        return W * feature_vec + b

    gold_scores = [linear_score(item) for item in gold]
    bad_scores = [linear_score(item) for item in bad]

    log_prob = dy.log_softmax(dy.concatenate(gold_scores + bad_scores))
    # Gold candidates occupy the first len(gold_scores) positions.
    gold_log_probs = [dy.pick(log_prob, k) for k in range(len(gold_scores))]
    return -dy.esum(gold_log_probs)
def compute_output_log_probs(self, inputs, possible_actions, state=None, past_states=None, past_actions=None):
    """
    Log-probabilities over actions, with state-context bonuses added to the
    unnormalised scores before the restricted log-softmax.

    :param inputs: forwarded to ``_log_probs_unconstrained_unnormed``
    :param possible_actions: actions from which the softmax support is derived
    :param state: current state; must not be None
    :param past_states: forwarded to ``action_in_state_context_bonuses``
    :param past_actions: forwarded to ``action_in_state_context_bonuses``
    :return: log-softmax expression restricted to the support
    """
    assert state is not None
    W_context_action = dy.parameter(self.p_W_context_action)
    W_action = dy.parameter(self.p_W_action)
    unconstrained, support = self._log_probs_unconstrained_unnormed(
        inputs, possible_actions)
    bonuses = action_in_state_context_bonuses(
        self.corpus, state, inputs, W_context_action, W_action,
        self.predict_invalid, past_states, past_actions)
    unconstrained += bonuses
    return dy.log_softmax(unconstrained, support)
def calc_loss(
        self, x: dy.Expression,
        y: Union[numbers.Integral, List[numbers.Integral]]) -> dy.Expression:
    """
    Negative log-likelihood of ``y``, optionally with label smoothing.

    With smoothing weight ``s`` the loss is
    ``(1 - s) * nll + s * (-mean(log_prob))``.

    :param x: input passed to ``self.calc_scores``
    :param y: gold label, or a batch of gold labels
    :return: loss expression
    """
    scores = self.calc_scores(x)
    smoothing = self.label_smoothing
    if smoothing == 0.0:
        if batchers.is_batched(y):
            return dy.pickneglogsoftmax_batch(scores, y)  # minibatch mode
        return dy.pickneglogsoftmax(scores, y)  # single mode

    log_prob = dy.log_softmax(scores)
    if batchers.is_batched(y):
        pre_loss = -dy.pick_batch(log_prob, y)
    else:
        pre_loss = -dy.pick(log_prob, y)
    ls_loss = -dy.mean_elems(log_prob)
    return ((1 - smoothing) * pre_loss) + (smoothing * ls_loss)
def calc_loss(self, mlp_dec_state, ref_action):
    """
    Negative log-likelihood of ``ref_action``, optionally label-smoothed.

    Label smoothing follows Section 7 of "Rethinking the Inception Architecture
    for Computer Vision" (https://arxiv.org/pdf/1512.00567.pdf): with weight
    ``s`` the loss is ``(1 - s) * nll + s * (-mean(log_prob))``.

    :param mlp_dec_state: input passed to ``self.get_scores``
    :param ref_action: gold action, or a batch of gold actions
    :return: loss expression
    """
    scores = self.get_scores(mlp_dec_state)
    smoothing = self.label_smoothing
    if smoothing == 0.0:
        if xnmt.batcher.is_batched(ref_action):
            return dy.pickneglogsoftmax_batch(scores, ref_action)  # minibatch mode
        return dy.pickneglogsoftmax(scores, ref_action)  # single mode

    log_prob = dy.log_softmax(scores)
    if xnmt.batcher.is_batched(ref_action):
        pre_loss = -dy.pick_batch(log_prob, ref_action)
    else:
        pre_loss = -dy.pick(log_prob, ref_action)
    ls_loss = -dy.mean_elems(log_prob)
    return ((1 - smoothing) * pre_loss) + (smoothing * ls_loss)
def generate(sent):
    """
    Greedily decode a target sentence for ``sent`` with the encoder/decoder LSTMs.

    :param sent: sequence of source word ids (used to index ``LOOKUP_SRC``)
    :return: list of generated target words (via ``i2w_trg``); the
             end-of-sentence token is not included
    """
    dy.renew_cg()

    # Encode: run the source LSTM over the sentence and keep its last output.
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in sent])[-1].output()

    # Seed the decoder from the encoder output.
    current_state = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    # Generate until the EOS tag is produced or MAX_SENT_SIZE is reached.
    for _ in range(MAX_SENT_SIZE):
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        # Take the argmax of the log-probabilities themselves. Negating them
        # first (as the code used to) selects the LEAST likely word.
        log_probs = dy.log_softmax(s).value()
        next_word = np.argmax(log_probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
def get_constit_loss(fws, bws, goldspans):
    """
    Constituent loss: binary "is span (i, j) a gold constituent" classification
    over the scored spans, scaled by ``DELTA``.

    :param fws: forward span representations, indexed ``[i][j]``
    :param bws: backward span representations, indexed ``[i][j]``
    :param goldspans: collection of gold ``(i, j)`` spans
    :return: tuple ``(loss, number_of_spans_scored)``; ``(None, 0)`` when there
             are no gold spans
    :raises Exception: when ``USE_PTB_CONSTITS`` is off
    """
    if not USE_PTB_CONSTITS:
        raise Exception("should not be using the constit loss now!", USE_PTB_CONSTITS)
    if len(goldspans) == 0:
        return None, 0

    span_log_probs = []
    for end in range(len(fws)):
        # Optionally skip spans longer than ALLOWED_SPANLEN.
        if USE_SPAN_CLIP and end > ALLOWED_SPANLEN:
            first_start = max(0, end - ALLOWED_SPANLEN)
        else:
            first_start = 0
        for start in range(first_start, end + 1):
            span_repr = dy.concatenate([fws[start][end], bws[start][end]])
            span_scores = w_c * dy.rectify(w_fb * span_repr + b_fb) + b_c
            label = int((start, end) in goldspans)
            span_log_probs.append(pick(dy.log_softmax(span_scores), label))

    ptbconstitloss = dy.scalarInput(DELTA) * -esum(span_log_probs)
    return ptbconstitloss, len(span_log_probs)
def cross_entropy_loss(self, scores, next_words):
    """
    Batched cross-entropy, label-smoothed when ``self.label_smoothing`` is set.

    :param scores: unnormalised score expression
    :param next_words: gold word ids for the batch
    :return: loss expression
    """
    if not self.label_smoothing:
        return dy.pickneglogsoftmax_batch(scores, next_words)

    log_probs = dy.log_softmax(scores)
    gold_term = -dy.pick_batch(log_probs, next_words) * (1 - self.label_smoothing)
    uniform_term = dy.mean_elems(log_probs) * self.label_smoothing
    return gold_term - uniform_term
def _policy_shape_probs(self, prob_dist):
    """
    Reshape a distribution over (action, location, argument) outputs by
    re-weighting blocks of entries depending on the action, then renormalise.

    :param prob_dist: expression over all outputs; its first dimension is used
                      to size the accumulator
    :return: softmax of ``new_probdist * prob_dist`` (restricted to
             ``self._valid_action_indices`` when ``args.syntax_restricted``)
    """
    # TODO: this is specific to Alchemy
    # Each vocabulary size is reduced by one (presumably to skip a special
    # entry -- confirm).
    num_actions = len(self.output_action_vocabulary) - 1
    num_locations = len(self.output_location_vocabulary) - 1
    num_arguments = len(self.output_argument_vocabulary) - 1
    new_probdist = dy.zeros(prob_dist.dim()[0])
    # One block of zeros / ones covers every (location, argument) pair.
    zeroes = numpy.zeros(num_locations * num_arguments)
    ones = numpy.ones(num_locations * num_arguments)
    eos_prob = prob_dist[self._all_output_vocabulary.lookup_index((EOS, NO_ARG, NO_ARG))]
    action_idx = 0
    for action in self.output_action_vocabulary:
        # NOTE(review): action_idx is never incremented in this loop, so the
        # mask always selects the first block -- looks unintended; confirm
        # where the increment belongs before changing it.
        masks = numpy.concatenate(
            (numpy.repeat(zeroes, action_idx), ones,
             numpy.repeat(zeroes, num_actions - action_idx - 1)))
        actions_masks = dy.reshape(dy.inputTensor(masks),
                                   (num_actions * num_locations * num_arguments, 1))
        if action == EOS:
            new_probdist += dy.cmult(actions_masks, prob_dist) / 2.
        elif action == "push":
            # NOTE(review): 56 is a magic constant -- meaning not visible here.
            new_probdist += dy.cmult(actions_masks, prob_dist) + eos_prob / (2. * 56.)
        elif action == "pop":
            new_probdist += dy.cmult(actions_masks, prob_dist)
    if self.args.syntax_restricted:
        return dy.exp(dy.log_softmax(dy.cmult(new_probdist, prob_dist),
                                     restrict = self._valid_action_indices))
    else:
        return dy.softmax(dy.cmult(new_probdist, prob_dist))
def __call__(self, translator, dec_state, src, trg):
    """
    Build a REINFORCE loss by sampling translations from the decoder and
    scoring them against ``trg`` with ``self.evaluation_metric``.

    Side effects: sets ``self.bs``, ``self.eval_score`` and ``self.true_score``
    and truncates ``trg_i.words`` at the first ``Vocab.ES``.

    :param translator: provides ``attender``, ``decoder`` and ``trg_embedder``
    :param dec_state: initial decoder state
    :param src: unused here
    :param trg: batch of reference targets (each with a ``words`` list)
    :return: ``LossBuilder`` holding "Reinforce" and, with a baseline, "Baseline"
    """
    # TODO: apply trg.mask ?
    samples = []
    logsofts = []
    self.bs = []
    done = [False for _ in range(len(trg))]
    for _ in range(self.sample_length):
        dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
        if self.use_baseline:
            # Baseline prediction from the projected state; nobackprop keeps the
            # baseline from sending gradients into h_t.
            h_t = dy.tanh(translator.decoder.context_projector(dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
            self.bs.append(self.baseline(dy.nobackprop(h_t)))
        logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
        sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
        # Keep track of previously sampled EOS
        sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
        # Appending and feeding in the decoder
        logsoft = dy.pick_batch(logsoft, sample)
        logsofts.append(logsoft)
        samples.append(sample)
        dec_state = translator.decoder.add_input(dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
        # Check if we are done.
        done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
        if all(done):
            break
    # Stack the per-step samples along axis 1: one list of ids per batch entry.
    samples = np.stack(samples, axis=1).tolist()
    self.eval_score = []
    for trg_i, sample_i in zip(trg, samples):
        # Removing EOS
        try:
            idx = sample_i.index(Vocab.ES)
            sample_i = sample_i[:idx]
        except ValueError:
            pass
        try:
            idx = trg_i.words.index(Vocab.ES)
            trg_i.words = trg_i.words[:idx]
        except ValueError:
            pass
        # Calculate the evaluation score
        score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    loss = LossBuilder()
    if self.use_baseline:
        # Weight each step's log-prob by (baseline - reward).
        for i, (score, _) in enumerate(zip(self.bs, logsofts)):
            logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
        loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
    else:
        loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
    if self.use_baseline:
        # Regress each baseline prediction towards the observed reward.
        baseline_loss = []
        for bs in self.bs:
            baseline_loss.append(dy.squared_distance(self.true_score, bs))
        loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
    return loss
def _encodings_to_label_log_probabilities(self, encodings, lmbd=None):
    """
    Score all encodings in one batched pass and return label log-probabilities.

    :param encodings: list of encoding expressions
    :param lmbd: optional expression multiplied element-wise into the scores
    :return: log-softmax of the scores reshaped to
             ``(label_vocab.size, len(encodings))``
    """
    batched_scores = self.f_label(dy.concatenate_to_batch(encodings))
    score_matrix = dy.reshape(batched_scores,
                              (self.label_vocab.size, len(encodings)))
    if lmbd is None:
        return dy.log_softmax(score_matrix)
    return dy.log_softmax(dy.cmult(score_matrix, lmbd))
def compute_output_log_probs(self, inputs, possible_actions, state=None, past_states=None, past_actions=None):
    """
    Log-probabilities over actions, normalised over the possible actions only.

    :param inputs: list holding exactly one input expression
    :param possible_actions: actions allowed in the current state
    :param state: unused here
    :param past_states: unused here
    :param past_actions: unused here
    :return: log-softmax expression restricted to the sorted action indices
    """
    assert len(inputs) == 1
    heads = self.compute_logits(inputs[0])
    type_logits, from_logits, to_logits, shirt_logits = heads
    support = sorted(self.corpus.ACTIONS_TO_INDEX[ac] for ac in possible_actions)
    combined = self.combine_logits(type_logits, from_logits, to_logits, shirt_logits)
    return dy.log_softmax(combined, support)
def _get_transition(self, stack, buffer, empty_buffer, valid_transitions):
    """
    Score the transitions for the current parser state.

    :param stack: non-empty stack; the first element of its top entry
                  provides ``.output()``
    :param buffer: buffer; the first element of its last entry is used
    :param empty_buffer: embedding used when the buffer is empty
    :param valid_transitions: indices the log-softmax is restricted to
    :return: tuple ``(log_probs, hidden)``
    """
    # The stack is not empty, so a transition has to be decided.
    stack_top = stack[-1][0].output()
    if buffer:
        buffer_top = buffer[-1][0]
    else:
        buffer_top = empty_buffer
    state_repr = dy.concatenate([buffer_top, stack_top])
    hidden = dy.rectify(self.s2h * state_repr + self.s2h_b)
    log_probs = dy.log_softmax(self.h2t * hidden + self.h2t_b, valid_transitions)
    return log_probs, hidden
def compute_output_log_probs(self, inputs, possible_actions, state=None, past_states=None, past_actions=None):
    """
    Log-probabilities over actions, normalised over the support only.

    :param inputs: forwarded to ``_log_probs_unconstrained_unnormed``
    :param possible_actions: forwarded to ``_log_probs_unconstrained_unnormed``
    :param state: unused here
    :param past_states: unused here
    :param past_actions: unused here
    :return: log-softmax expression restricted to the returned support
    """
    scores_and_support = self._log_probs_unconstrained_unnormed(
        inputs, possible_actions)
    raw_scores, allowed = scores_and_support
    return dy.log_softmax(raw_scores, allowed)
def __call__(self, query, options, gold, lengths, query_no):
    """
    Score every option for a query and build the loss against the gold set.

    Behaviour is configured by the module-level ``args`` (word vectors,
    features, dropout and the nonlinearity).

    :param query: word ids of the query
    :param options: list of ``(option_word_ids, features)`` pairs
    :param gold: indices of the correct options
    :param lengths: unused here
    :param query_no: unused here
    :return: tuple ``(loss, predicted_link)``; ``(None, 0)`` when there is only
             one option
    """
    if len(options) == 1:
        return None, 0

    def activate(expr):
        # Apply the configured nonlinearity; "linear" (and any name not listed
        # here) leaves the expression unchanged.
        kind = args.nonlin
        if kind == "tanh":
            return dy.tanh(expr)
        if kind == "cube":
            return dy.cube(expr)
        if kind == "logistic":
            return dy.logistic(expr)
        if kind == "relu":
            return dy.rectify(expr)
        if kind == "elu":
            return dy.elu(expr)
        if kind == "selu":
            return dy.selu(expr)
        if kind == "softsign":
            return dy.softsign(expr)
        if kind == "swish":
            return dy.cmult(expr, dy.logistic(expr))
        return expr

    if args.word_vectors:
        qvecs = [dy.lookup(self.pEmbedding, w) for w in query]
        qvec_max = dy.emax(qvecs)
        qvec_mean = dy.average(qvecs)

    option_scores = []
    for otext, features in options:
        if not args.no_features:
            inputs = dy.inputTensor(features)
        if args.word_vectors:
            ovecs = [dy.lookup(self.pEmbedding, w) for w in otext]
            ovec_max = dy.emax(ovecs)
            ovec_mean = dy.average(ovecs)
            pooled = [qvec_max, qvec_mean, ovec_max, ovec_mean]
            if args.no_features:
                inputs = dy.concatenate(pooled)
            else:
                inputs = dy.concatenate([inputs] + pooled)
        if args.drop > 0:
            inputs = dy.dropout(inputs, args.drop)

        h = inputs
        for pH, pB in zip(self.hidden, self.bias):
            h = activate(dy.affine_transform([pB, pH, h]))
        option_scores.append(dy.sum_dim(h, [0]))

    final = dy.concatenate(option_scores)
    nll = -dy.log_softmax(final)
    # Uniform target distribution over the gold options.
    dense_gold = [1.0 / len(gold) if i in gold else 0.0
                  for i in range(len(options))]
    loss = dy.transpose(dy.inputTensor(dense_gold)) * nll
    predicted_link = np.argmax(final.npvalue())
    return loss, predicted_link
def score(self, features, axis):
    """
    Return log-softmax scores for the labels of one axis.

    :param features: extracted feature values, of size input_size
    :param axis: axis of the label we are predicting
    :return: numpy array with one score per label; all zeros when no update
             has been done yet or the axis has at most one label
    """
    super().score(features, axis)
    label_count = self.num_labels[axis]
    if not (self.updates > 0 and label_count > 1):
        self.config.print(" no updates done yet, returning zero vector.", level=4)
        return np.zeros(label_count)

    allowed = list(range(label_count))
    if dynet_config.gpu():
        # RestrictedLogSoftmax is not implemented for GPU: compute it on CPU ...
        on_cpu = dy.to_device(self.evaluate(features, axis), 'CPU')
        # ... then move the result back ('' selects the default device).
        log_probs = dy.to_device(dy.log_softmax(on_cpu, restrict=allowed), '')
    else:
        log_probs = dy.log_softmax(self.evaluate(features, axis), restrict=allowed)
    return log_probs.npvalue()[:label_count]
def generate(sent):
    """
    Greedily decode a target sentence for ``sent`` with attention.

    :param sent: sequence of source word ids (used to index ``LOOKUP_SRC``)
    :return: tuple ``(trg_sent, attention)``: the generated target words (via
             ``i2w_trg``, without the end-of-sentence token) and the value of
             the per-step alignments concatenated column-wise
    """
    dy.renew_cg()

    # Encode: ``LSTM_SRC.add_inputs`` yields a pair of states per word whose
    # outputs are concatenated (presumably forward/backward -- confirm).
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in sent])]
    src_output = src_outputs[-1]

    # The source side of the attention score is fixed for the whole decode.
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    # Generate until the EOS tag is produced or MAX_SENT_SIZE is reached.
    for _ in range(MAX_SENT_SIZE):
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(
            src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform(
            [b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        # Take the argmax of the log-probabilities themselves. Negating them
        # first (as the code used to) selects the LEAST likely word.
        log_probs = dy.log_softmax(s).value()
        next_word = np.argmax(log_probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
def calc_loss(self, scores, axis, true, importance):
    """
    Build the list of loss terms for one prediction.

    :param scores: unnormalised score expression
    :param axis: axis whose label count bounds the candidate labels
    :param true: gold label indices
    :param importance: weight for each gold label (paired with ``true``)
    :return: list of loss expressions: one weighted NLL per gold label, plus,
             for the "max_margin" loss, the max log-probability among the
             non-gold labels
    """
    terms = []
    for label, weight in zip(true, importance):
        terms.append(weight * dy.pickneglogsoftmax(scores, label))
    if self.loss == "max_margin":
        wrong_labels = list(set(range(self.num_labels[axis])) - set(true))
        wrong_log_probs = dy.log_softmax(scores, restrict=wrong_labels)
        terms.append(dy.max_dim(wrong_log_probs))
    return terms
def prediction(self, x):
    """
    Return the log-softmax of every output produced for ``x``.

    :param x: input passed to ``self.output``
    :return: list of log-softmax expressions, one per output
    """
    log_probs = []
    for raw_output in self.output(x):
        log_probs.append(dy.log_softmax(raw_output))
    return log_probs
def parse(self, t, oracle_actions=None):
    """
    Run the shift-reduce parser over a token sequence.

    With ``oracle_actions`` the oracle is followed and a loss is built; without
    it the model's best valid action is taken at each step and the resulting
    head --> modifier arcs are printed.

    :param t: sequence of token ids
    :param oracle_actions: optional sequence of gold actions
    :return: negated sum of the collected log-probabilities, or None when no
             loss term was collected
    """
    dy.renew_cg()
    self.NULL_REP = self.WORDS_LOOKUP[self.nwords-1]
    # Both sequences are reversed so that pop() yields items in original order.
    if oracle_actions:
        oracle_actions = list(oracle_actions)
        oracle_actions.reverse()
    toks = list(t)
    toks.reverse()
    stack = []
    buffer = []
    W1 = dy.parameter(self.pW1)
    b1 = dy.parameter(self.pb1)
    W_act = dy.parameter(self.pW_act)
    b_act = dy.parameter(self.pb_act)
    losses = []
    for tok in toks:
        tok_embedding = self.WORDS_LOOKUP[tok]
        buffer.append(Head(self.vocab.i2w[tok], tok_embedding))
    # Parse until a single item is left on the stack and the buffer is empty.
    while not (len(stack) == 1 and len(buffer) == 0):
        # based on parser state, get valid actions
        valid_actions = []
        if len(buffer) > 0:  # can only shift if elements in buffer
            valid_actions += [SHIFT]
        if len(stack) >= 2:  # can only reduce if 2 elements on stack
            valid_actions += [REDUCE_L, REDUCE_R]
        # compute probability of each of the actions and choose an action
        # either from the oracle or if there is no oracle, based on the model
        action = valid_actions[0]
        log_probs = None
        # With a single valid action there is nothing to score.
        if len(valid_actions) > 1:
            representations = self.extract_features(stack, buffer)
            h = dy.cube(W1*dy.concatenate(representations) + b1)
            logits = W_act * h + b_act
            log_probs = dy.log_softmax(logits, valid_actions)
            if oracle_actions is None:
                action = max(enumerate(log_probs.vec_value()), key=itemgetter(1))[0]
        if oracle_actions is not None:
            action = oracle_actions.pop()
            if log_probs is not None:
                # append the action-specific loss
                losses.append(dy.pick(log_probs, action))
        # execute the action to update the parser state
        if action == SHIFT:
            token = buffer.pop()
            stack.append(token)
        else:  # one of the reduce actions
            right = stack.pop()
            left = stack.pop()
            head, modifier = (left, right) if action == REDUCE_R else (right, left)
            # add the tokens and their embeddings into the children list
            if action == REDUCE_R:
                head.add_child(modifier, 'right')
            else:
                head.add_child(modifier, 'left')
            stack.append(head)
            if oracle_actions is None:
                print('{0} --> {1}'.format(head.word, modifier.word))
    # the head of the tree that remains at the top of the stack is now the root
    if oracle_actions is None:
        head = stack.pop().word
        print('ROOT --> {0}'.format(head))
    return -dy.esum(losses) if losses else None