def learn(self, characters, target_mgc, guided_att=True):
    num_mgc = target_mgc.shape[0]
    dy.renew_cg()
    output_mgc, output_stop, output_attention = self._predict(characters, target_mgc)
    losses = []
    index = 0
    for mgc, real_mgc in zip(output_mgc, target_mgc):
        t_mgc = dy.inputVector(real_mgc)
        # losses.append(self._compute_binary_divergence(mgc, t_mgc))
        losses.append(dy.l1_distance(mgc, t_mgc))
        # attention and stop predictions are produced once per 3 mgc frames
        if index % 3 == 0:
            # attention loss
            if guided_att:
                att = output_attention[index // 3]
                losses.append(self._compute_guided_attention(
                    att, index // 3, len(characters) + 2, num_mgc // 3))
            # EOS loss: push the stop flag towards -0.8 on the final frames, 0.8 elsewhere
            stop = output_stop[index // 3]
            if index >= num_mgc - 6:
                losses.append(dy.l1_distance(stop, dy.scalarInput(-0.8)))
            else:
                losses.append(dy.l1_distance(stop, dy.scalarInput(0.8)))
        index += 1
    loss = dy.esum(losses)
    loss_val = loss.value() / num_mgc
    loss.backward()
    self.trainer.update()
    return loss_val
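# Hypothetical sketch (not the author's implementation) of the
# _compute_guided_attention helper referenced above, assuming the
# guided-attention loss of Tachibana et al. (2017): attention mass placed far
# from the diagonal is penalized with weight w[n] = 1 - exp(-(n/N - t/T)^2 / (2g^2)).
# `att` is assumed to be a vector expression over input positions for decoder
# step `output_pos`; g=0.2 is a common default. Names here are illustrative.
import math
import dynet as dy

def guided_attention_loss(att, output_pos, input_len, output_len, g=0.2):
    # penalty mask for this decoder step: near zero on the diagonal, ~1 far away
    w = [1.0 - math.exp(-((n / input_len - output_pos / output_len) ** 2)
                        / (2.0 * g * g))
         for n in range(input_len)]
    # expected penalty under the attention distribution
    return dy.sum_elems(dy.cmult(att, dy.inputVector(w)))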
e = dy.noise(e1, stddev)  # add noise to each element from a gaussian with standard-dev = stddev
e = dy.dropout(e1, p)     # apply dropout with probability p

# functions over lists of expressions
e = dy.esum([e1, e2, ...])              # sum
e = dy.average([e1, e2, ...])           # average
e = dy.concatenate_cols([e1, e2, ...])  # e1, e2,.. are column vectors. return a matrix. (sim to np.hstack([e1,e2,...]))
e = dy.concatenate([e1, e2, ...])       # concatenate
e = dy.affine_transform([e0, e1, e2, ...])  # e = e0 + ((e1*e2) + (e3*e4) ...)

## Loss functions
e = dy.squared_distance(e1, e2)
e = dy.l1_distance(e1, e2)
e = dy.huber_distance(e1, e2, c=1.345)

# e1 must be a scalar that is a value between 0 and 1
# e2 (ty) must be a scalar that is a value between 0 and 1
# e = -(ty * log(e1) + (1 - ty) * log(1 - e1))
e = dy.binary_log_loss(e1, e2)

# e1 is row vector or scalar
# e2 is row vector or scalar
# m is number
# e = max(0, m - (e1 - e2))
e = dy.pairwise_rank_loss(e1, e2, m=1.0)

# Convolutions
# e1 \in R^{d x s} (input)
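# Illustrative usage of the loss functions above (not part of the cheatsheet):
# a minimal logistic-regression step wiring together add_parameters,
# affine_transform, logistic, and binary_log_loss. All names here
# (model, trainer, pW, pb) are placeholders.
import dynet as dy

model = dy.ParameterCollection()
pW = model.add_parameters((1, 4))
pb = model.add_parameters((1,))
trainer = dy.SimpleSGDTrainer(model)

dy.renew_cg()
W, b = dy.parameter(pW), dy.parameter(pb)
x = dy.inputVector([0.5, -1.2, 3.0, 0.7])
ty = dy.scalarInput(1.0)                            # gold label in [0, 1]
pred = dy.logistic(dy.affine_transform([b, W, x]))  # e0 + e1*e2, squashed to (0, 1)
loss = dy.binary_log_loss(pred, ty)
print(loss.value())
loss.backward()
trainer.update()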
import dynet

# model, config, word_dim, embeddings, and dep_layers are assumed defined earlier

# define trainable projection layer from word dim to phrase dim
# this simplifies concatenation and allows us to treat the recursive base case as a phrase of its own
word_to_phrase_projection = model.add_parameters((config.sent_dim, word_dim))

# define graph building operation
def generate_graph(parse):
    parse_graph = parse.to_tree()
    return graph_gen_helper(parse_graph)

def graph_gen_helper(node):
    # base case: project the node's word embedding into phrase space
    node_value = word_to_phrase_projection * embeddings[node.data.form]
    for child in node:
        child_subtree = graph_gen_helper(child)
        # concatenate the node so far with the subtree, select layer according to dep reln
        node_value = dep_layers[child.data.deprel] * dynet.concatenate(
            [node_value, child_subtree])
    return node_value

# run training
for parse, y in zip(parse_train, y_train):
    dynet.renew_cg()
    y_pred = generate_graph(parse)
    # compare the norm of the predicted phrase vector against the gold vector's norm
    loss = dynet.l1_distance(dynet.l2_norm(y_pred), dynet.l2_norm(dynet.inputVector(y)))
    loss.backward()
    trainer.update()  # trainer (e.g. dynet.AdamTrainer(model)) assumed defined earlier

# run eval
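# Hedged sketch of the eval loop implied by the comment above; parse_dev and
# y_dev are hypothetical names for held-out parses and gold phrase vectors.
total_dist = 0.0
for parse, y in zip(parse_dev, y_dev):
    dynet.renew_cg()
    y_pred = generate_graph(parse)
    # same distance measure as in training, but with no parameter updates
    dist = dynet.l1_distance(dynet.l2_norm(y_pred), dynet.l2_norm(dynet.inputVector(y)))
    total_dist += dist.value()
print("mean dev distance: %.4f" % (total_dist / len(parse_dev)))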