def get_normalized_reps(self, embs, forward_lstm, backward_lstm, encode=False):
    # Concatenate the final forward and backward LSTM states for each word and
    # L2-normalize; with encode=True, return numeric values instead of DyNet expressions.
    word_reps = [dy.concatenate([forward_lstm.initial_state().transduce(emb)[-1],
                                 backward_lstm.initial_state().transduce(reversed(emb))[-1]])
                 for emb in embs]
    if not encode:
        return [dy.cdiv(rep, dy.l2_norm(rep)) for rep in word_reps]
    else:
        return [dy.cdiv(rep, dy.l2_norm(rep)).value() for rep in word_reps]
def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if batchers.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False))
                             for _ in range(batch_size)]
    emb_e = dy.parameter(self.embeddings)
    # single mode
    if not batchers.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = dy.pick(emb_e, index=x)
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = dy.pick_batch(emb_e, x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim
                                                        for i in range(x.batch_size())]), batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def embed(self, x):
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = x.batch_size() if xnmt.batcher.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False))
                             for _ in range(batch_size)]
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = self.embeddings[x]
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = self.embeddings.batch(x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
            dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim
                                                        for i in range(x.batch_size())]), batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def __cosine_loss(self, pred, gold):
    sn1 = dy.l2_norm(pred)
    sn2 = dy.l2_norm(gold)
    mult = dy.cmult(sn1, sn2)
    dot = dy.dot_product(pred, gold)
    div = dy.cdiv(dot, mult)
    vec_y = dy.scalarInput(2)
    res = dy.cdiv(1 - div, vec_y)
    return res
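A minimal standalone sketch (not from the source above) of the same quantity: the cosine distance (1 - cos(pred, gold)) / 2, which is 0 for vectors pointing in the same direction and 1 for opposite ones. The function and variable names here are illustrative only.

import dynet as dy

def cosine_distance(pred, gold):
    # cos(pred, gold) = <pred, gold> / (||pred|| * ||gold||)
    cos = dy.cdiv(dy.dot_product(pred, gold),
                  dy.cmult(dy.l2_norm(pred), dy.l2_norm(gold)))
    # scale (1 - cos) into [0, 1]
    return dy.cdiv(dy.scalarInput(1.0) - cos, dy.scalarInput(2.0))

dy.renew_cg()
v1 = dy.inputVector([1.0, 2.0, 3.0])
v2 = dy.inputVector([2.0, 4.0, 6.0])    # same direction as v1
print(cosine_distance(v1, v2).value())  # ~0.0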
def word_assoc_score(self, source_idx, target_idx, relation):
    """
    NOTE THAT DROPOUT IS BEING APPLIED HERE
    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # dy.dropout is not in-place, so keep the returned expression
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]

    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    src = src.as_tensor()
    src_height = src.dim()[0][0]
    src_width = src.dim()[0][1]
    # src_channels = 1
    batch_size = src.dim()[1]

    # convolution and pooling layers
    # src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0]+3)
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1),
                              stride=[self.stride[0], self.stride[0]], is_valid=True))  # ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, (1, 4), (1, 2), is_valid=True)  # ((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1]+3)
    l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2),
                              stride=[self.stride[1], self.stride[1]], is_valid=True))  # ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, (1, 4), (1, 2), is_valid=True)  # ((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3),
                              stride=[self.stride[2], self.stride[2]], is_valid=True))  # ((1, 248, 1024), 128)
    pool3 = dy.max_dim(l3, d=1)

    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3, my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)
    return ExpressionSequence(expr_tensor=output)
def calculate_loss(self, src_file, tgt_file):
    # Renew the computation graph
    dy.renew_cg()

    # Initialize LSTMs
    enc_init_state_fwd = self.enc_lstm_fwd_builder.initial_state()
    enc_init_state_bwd = self.enc_lstm_bwd_builder.initial_state()

    # MLP to predict the duration
    W_d = dy.parameter(self.W_duration)
    b_d = dy.parameter(self.b_duration)

    # MLP to predict f0
    W_f0 = dy.parameter(self.W_f0)
    b_f0 = dy.parameter(self.b_f0)

    input_frames = dy.inputTensor(np.loadtxt(src_file))
    output_frames = dy.inputTensor(np.loadtxt(tgt_file))
    len_tgt = len(np.loadtxt(tgt_file))
    input_frames_reverse = dy.inputTensor(np.flipud(np.loadtxt(src_file)))

    # Get the LSTM embeddings
    fwd_output = enc_init_state_fwd.add_inputs([frame for frame in input_frames])[-1].output()
    bwd_output = enc_init_state_bwd.add_inputs([frame for frame in input_frames_reverse])[-1].output()

    # Concatenate
    bilstm_embeddings = dy.concatenate([fwd_output, bwd_output])

    # Predict durations
    target_duration = self.mlp(bilstm_embeddings, W_d, b_d)
    duration_loss = dy.l2_norm(target_duration - len_tgt)

    # Initialize decoder LSTM
    dec_init_state = self.dec_lstm_builder.initial_state().add_inputs(bilstm_embeddings)[-1].output()

    # Generate target frames
    prediction_loss = []
    for k in range(len_tgt):
        predicted_frame = self.mlp(dec_init_state, W_f0, b_f0)
        prediction_loss.append(dy.l2_norm(predicted_frame - output_frames[k]))

    return duration_loss, dy.esum(prediction_loss)
def train_network(params, ntags, train_data, dev_set):
    global telemetry_file, randstring, MIN_ACC
    prev_acc = 0
    m = params[0]
    t0 = time.clock()

    # train the network
    trainer = dy.SimpleSGDTrainer(m)
    total_loss = 0
    seen_instances = 0
    train_good = 0
    for train_x, train_y in train_data:
        dy.renew_cg()
        output = build_network(params, train_x)
        # l2 regularization did not look promising at all, so it's commented out
        loss = -dy.log(output[train_y]) + REG_LAMBDA * sum([dy.l2_norm(p) for p in params[2:]])
        if train_y == np.argmax(output.npvalue()):
            train_good += 1
        seen_instances += 1
        total_loss += loss.value()
        loss.backward()
        trainer.update()

        if seen_instances % 20000 == 0:
            # measure elapsed seconds
            secs = time.clock() - t0
            t0 = time.clock()
            good = case = 0
            max_dev_instances = 70 * 1000
            dev_instances = 0
            for x_tuple, dev_y in dev_set:
                output = build_network(params, x_tuple)
                if np.argmax(output.npvalue()) == dev_y:
                    good += 1
                case += 1
                dev_instances += 1
                if dev_instances >= max_dev_instances:
                    break
            acc = float(good) / case
            print("iterations: {}. train_accuracy: {} accuracy: {} avg loss: {} secs per 1000:{}"
                  .format(seen_instances, float(train_good) / 20000, acc, total_loss / (seen_instances + 1), secs / 20))
            train_good = 0
            if acc > MIN_ACC and acc > prev_acc:
                print("saving.")
                dy.save("params_" + randstring, list(params)[1:])
                prev_acc = acc
            telemetry_file.write("{}\t{}\t{}\t{}\n".format(seen_instances, acc, total_loss / (seen_instances + 1), secs / 20))

    MIN_ACC = max(prev_acc, MIN_ACC)
def learn(self, src, dst):
    softmax_list, aux_list = self._predict(src, dst=dst, num_predictions=len(dst) + 1, runtime=False)
    for softmax, aux, entry in zip(softmax_list, aux_list, dst):
        word = entry.word.decode('utf-8').lower()
        if word in self.output_encodings.word2int:
            w_index = self.output_encodings.word2int[word]
        else:
            w_index = self.output_encodings.word2int["<UNK>"]

        w_emb, found = self.dst_we.get_word_embeddings(entry.word.decode('utf-8'))
        self.losses.append(-dy.log(dy.pick(softmax, w_index)))

        if found:
            vec1 = aux
            vec2 = dy.inputVector(w_emb)
            cosine = dy.dot_product(vec1, vec2) * dy.pow(dy.l2_norm(vec1) * dy.l2_norm(vec2), dy.scalarInput(-1))
            self.losses.append(dy.squared_distance(cosine, dy.scalarInput(1.0)))

    self.losses.append(-dy.log(dy.pick(softmax_list[-1], self.EOS)))
def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
    """
    Embed a single word in a sentence.

    :param x: A word id.
    :return: Embedded word.
    """
    ret = self._embed_word(x, batchers.is_batched(x))
    # Apply fixed-norm normalization
    if self.fix_norm is not None:
        ret = dy.cdiv(ret, dy.l2_norm(ret)) * self.fix_norm
    # Add weight noise only when training
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def calculate_loss(self, input, output):
    # Forward pass through the MLP; the loss is the L2 norm of the prediction error.
    # dy.renew_cg()
    weight_matrix_array = []
    biases_array = []
    for (W, b) in zip(self.weight_matrix_array, self.biases_array):
        weight_matrix_array.append(dy.parameter(W))
        biases_array.append(dy.parameter(b))
    acts = self.act
    w = weight_matrix_array[0]
    b = biases_array[0]
    act = acts[0]
    intermediate = act(dy.affine_transform([b, w, input]))
    activations = [intermediate]
    for (W, b, g) in zip(weight_matrix_array[1:], biases_array[1:], acts[1:]):
        pred = g(dy.affine_transform([b, W, activations[-1]]))
        activations.append(pred)
    losses = output - pred
    return dy.l2_norm(losses)
def l2_norm(self, with_embeddings=True):
    # specify regularization term: sum of Frobenius/L2-normalized weights
    # assume that we add to a computation graph
    reg = []
    # RNN weight matrices
    for rnn in (self.fbuffRNN, self.bbuffRNN, self.wordRNN):
        for exp in (e for layer in rnn.get_parameter_expressions() for e in layer):
            if len(exp.dim()[0]) != 1:  # this is not a bias term
                reg.append(dy.l2_norm(exp))
    # classifier weight matrices
    reg.append(dy.l2_norm(self.pW_act.expr()))
    if self.MLP_DIM:
        reg.append(dy.l2_norm(self.pW_s2h.expr()))
    if with_embeddings:
        # add embedding params
        reg.append(dy.l2_norm(self.FEAT_LOOKUP.expr()))
        reg.append(dy.l2_norm(self.CHAR_LOOKUP.expr()))
        if not self.param_tying:
            reg.append(dy.l2_norm(self.ACT_LOOKUP.expr()))
    return 0.5 * dy.esum(reg)
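A self-contained sketch (not from the source above) of the same pattern used for the RNN weights: collect L2 norms of a DyNet RNN builder's non-bias parameter expressions (bias vectors have a one-dimensional shape) and sum them into a single regularization term. The builder type and dimensions below are arbitrary choices for illustration.

import dynet as dy

pc = dy.ParameterCollection()
lstm = dy.VanillaLSTMBuilder(1, 10, 20, pc)  # layers, input dim, hidden dim

dy.renew_cg()
lstm.initial_state()  # make the builder's parameter expressions available in this graph
reg = [dy.l2_norm(exp)
       for layer in lstm.get_parameter_expressions()
       for exp in layer
       if len(exp.dim()[0]) != 1]  # skip bias vectors
reg_term = 0.5 * dy.esum(reg)
print(reg_term.value())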
def macro_node_iteration(opts, multi_graph, assoc_cache, trainer, log_file, synsets, rel, src_i, use_assoc):
    """
    One node-relation iteration in a macro-level pass over the multigraph
    :param opts: parameter dictionary from calling model
    :param multi_graph: trained data structure
    :param assoc_cache: cache for association model
    :param trainer: dynet training module
    :param log_file: log file location
    :param synsets: synset name dictionary for reporting
    :param rel: relation type for iteration
    :param src_i: source node ID for iteration
    :param use_assoc: use association score model
    :return: state of cache after iteration
    """
    g = multi_graph.graphs[rel]
    N = multi_graph.vocab_size

    # set up iteration
    if opts.debug:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    else:
        dy.renew_cg()

    # keep existing score for all deltas
    multi_graph.rescore()
    score_with_all = multi_graph.dy_score

    # report progress
    perform_verbosity_steps = opts.v > 1 or (opts.v > 0 and src_i > 0 and src_i % 10 == 0)
    if perform_verbosity_steps:
        timeprint('iterating on node {}, {}, current score = {:.6f}'
                  .format(src_i, synsets[src_i], score_with_all.scalar_value()))

    # true targets scoring
    true_targets = targets(g, src_i)
    if len(true_targets) == 0:
        # don't perform negative sampling without true targets
        return assoc_cache

    # compute log likelihood on targets
    # each used to be multiplied by multi_graph.a_scale
    target_assoc_scores = {t: multi_graph.word_assoc_score(src_i, t, rel) for t in true_targets}
    if opts.no_assoc_bp:
        # turn into values to detach from computation graph
        target_assoc_scores = {t: t_as.value() for t, t_as in list(target_assoc_scores.items())}
    target_scores = {t: score_with_all + t_as for t, t_as in list(target_assoc_scores.items())}

    # false targets scoring - importance sampling
    # compute softmax over all false targets based on bilinear scores
    if use_assoc:
        assoc_sc = multi_graph.score_from_source_cache(assoc_cache, src_i)
        neg_assocs = {j: s for j, s in enumerate(assoc_sc) if j not in true_targets and j != src_i}
    else:
        neg_assocs = {j: 1.0 for j in range(N) if j not in true_targets and j != src_i}
    neg_probs = softmaxify(neg_assocs)

    # collect negative samples
    # TODO see if searchsorted can work here too (issue in dynet repo)
    neg_samples = {t: [dy.np.random.choice(range(len(neg_assocs)), p=neg_probs)
                       for _ in range(opts.neg_samp)]
                   for t in true_targets}  # sample without return?

    # for reporting
    if perform_verbosity_steps:
        neg_sample_idcs = []
        for negs in list(neg_samples.values()):
            neg_sample_idcs.extend([list(neg_assocs.keys())[j] for j in negs])

    # compute neg log likelihood on negative samples
    margins = []
    for t in true_targets:
        t_score = target_scores[t]
        negs = [list(neg_assocs.keys())[j] for j in neg_samples[t]]
        # each used to be multiplied by multi_graph.a_scale
        neg_assoc_scores = [multi_graph.word_assoc_score(src_i, j, rel) for j in negs]
        if opts.no_assoc_bp:
            # turn into values to detach from computation graph
            neg_assoc_scores = [s.value() for s in neg_assoc_scores]

        # prepare graph for pass
        multi_graph.remove_edge(src_i, t, rel, permanent=True)
        t_cache = (copy.deepcopy(multi_graph.cache), copy.deepcopy(multi_graph.feature_vals))

        for jas, j, origj in zip(neg_assoc_scores, negs, neg_samples[t]):
            q_norm = 1.0 / neg_probs[origj]
            g_score = multi_graph.add_edge(src_i, j, rel, caches=t_cache, report_feat_diff=opts.v > 1)
            margins.append(dy.rectify(g_score + jas + MARGIN - t_score) * q_norm)
            log_file.write('{}\t{}\t{}\t{}\t{:.2e}\t{:.2e}\t{:.2e}\n'
                           .format(rel, src_i, t, j, t_score.scalar_value(), g_score.scalar_value(),
                                   jas if type(jas) == float else jas.value()))

        # revert graph for next margin iteration
        multi_graph.add_edge(src_i, t, rel, permanent=True)

    node_loss = dy.esum(margins)

    # backprop and recompute score
    if perform_verbosity_steps:
        timeprint('selected nodes {} with probabilities {}'
                  .format(neg_sample_idcs, ['{:.2e}'.format(neg_probs[n]) for n in neg_samples]))
        timeprint('overall {} loss = {:.6f}'
                  .format('margin' if opts.margin_loss else 'neg log', node_loss.scalar_value()))
        # record state for later reporting
        pre_weights = multi_graph.ergm_weights.as_array()
        pre_assoc = multi_graph.word_assoc_weights[rel].as_array()

    # add regularization
    if multi_graph.regularize > 0.0:
        node_loss += multi_graph.regularize * dy.l2_norm(dy.parameter(multi_graph.ergm_weights))

    # perform actual learning
    node_loss.backward()
    trainer.update()

    if perform_verbosity_steps:
        post_weights = multi_graph.ergm_weights.as_array()
        post_assoc = multi_graph.word_assoc_weights[rel].as_array()
        w_diff = post_weights - pre_weights
        a_diff = post_assoc - pre_assoc
        timeprint('changed weights = {}'.format(len(w_diff.nonzero()[0])))
        timeprint('changed pre_assoc = {}, norm {}'
                  .format(len(a_diff.nonzero()[0]), np.linalg.norm(a_diff)))

    # recompute assoc_cache columns for src_i and participating targets
    if use_assoc and not opts.no_assoc_bp:
        # TODO normalize embeddings?
        return multi_graph.source_ranker_cache(rel)

    return assoc_cache
def Cosine(self, v1, v2):
    return dy.cdiv(dy.dot_product(v1, v2), dy.l2_norm(v1) * dy.l2_norm(v2))
def regularization_loss(self, coef=0.001):
    losses = [dy.l2_norm(p)**2 for p in self.model.parameters_list()
              if p.name().startswith('/linearW')]
    return (coef / 2) * dy.esum(losses)
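For context, a minimal self-contained sketch (with made-up parameter names and dimensions, not taken from the source above) of how such a squared-L2 term over name-filtered parameters is typically combined with a task loss:

import dynet as dy

model = dy.ParameterCollection()
W = model.add_parameters((4, 3), name="linearW")
b = model.add_parameters((4,), name="linearb")
trainer = dy.SimpleSGDTrainer(model)

dy.renew_cg()
x = dy.inputVector([1.0, 2.0, 3.0])
task_loss = dy.l2_norm(dy.parameter(W) * x + dy.parameter(b))  # dummy task loss
reg = (0.001 / 2) * dy.esum([dy.l2_norm(dy.parameter(p)) ** 2
                             for p in model.parameters_list()
                             if p.name().startswith("/linearW")])  # only the weight matrix
loss = task_loss + reg
loss.backward()
trainer.update()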
def calculate_loss(self, input, output, tgtspk):
    # Initial layer
    weight_matrix_array = []
    biases_array = []
    acts = []
    if debug:
        print "The number of generic biases: ", len(self.biases_array)
        print "The number of generic acts: ", len(self.act_generic)

    # Generic layers
    for (W, b, a) in zip(self.weight_matrix_array, self.biases_array, self.act_generic):
        weight_matrix_array.append(dy.parameter(W))
        biases_array.append(dy.parameter(b))
        acts.append(a)

    # Specific layers
    length = len(self.postspecificlayers)
    start_index = (tgtspk - 1) * length
    idx = 0
    if debug:
        print "The number of specific biases: ", len(self.biases_array[start_index:start_index + length])
        print "The number of specific acts: ", len(self.act_postspecific)
    for (W, b, a) in zip(self.specific_weights_array[start_index:start_index + length],
                         self.specific_biases_array[start_index:start_index + length],
                         self.act_postspecific):
        weight_matrix_array.append(dy.parameter(W))
        biases_array.append(dy.parameter(b))
        acts.append(a)

    # Final Layer
    weight_matrix_array.append(dy.parameter(self.W_final))
    biases_array.append(dy.parameter(self.b_final))
    acts.append(self.act_final)

    w = weight_matrix_array[0]
    b = biases_array[0]
    act = acts[0]
    intermediate = act(dy.affine_transform([b, w, input]))
    if debug:
        print "Here are the dimensions of the biases: ", [len(k.value()) for k in biases_array]
        print "Here are the acts: ", [k for k in acts]
        print "Dimensions of the intermediate: "
        print len(intermediate.value())

    activations = [intermediate]
    count = 1
    for (W, b, g) in zip(weight_matrix_array[1:], biases_array[1:], acts[1:]):
        if debug:
            print "Adding to the layer number: ", count
            print "Total layers: ", self.number_of_layers
        if count == self.number_of_layers - 1:
            t = dy.concatenate([activations[-1], input])
            pred = g(dy.affine_transform([b, W, t]))
        else:
            pred = g(dy.affine_transform([b, W, activations[-1]]))
        activations.append(pred)
        count += 1

    if debug:
        print "Activation dimensions are: ", [len(k.value()) for k in activations]
        print "Output dimensions are: ", len(output.value())

    losses = output - pred
    return dy.l2_norm(losses)
# define trainable projection layer from word dim to phrase dim
# this simplifies concatenation and allows us to treat the recursive base case as a phrase of its own
word_to_phrase_projection = model.add_parameters((config.sent_dim, word_dim))

# define graph building operation
def generate_graph(parse):
    parse_graph = parse.to_tree()
    return graph_gen_helper(parse_graph)

def graph_gen_helper(node):
    node_value = word_to_phrase_projection * embeddings[node.data.form]
    for child in node:
        child_subtree = graph_gen_helper(child)
        # concatenate the node so far with the subtree, select layer according to dep reln
        node_value = dep_layers[child.data.deprel] * dynet.concatenate([node_value, child_subtree])
    return node_value

# run training
for parse, y in zip(parse_train, y_preds):
    y_pred = generate_graph(parse)
    loss = dynet.l1_distance(dynet.l2_norm(y_pred), dynet.l2_norm(y))

# run eval
def regularization_loss(self, coef=1e-4):
    losses = [dy.l2_norm(p)**2 for p in self.model.parameters_list()]
    return (coef / 2) * dy.esum(losses)