def get_normalized_reps(self, embs, forward_lstm, backward_lstm, encode=False):
     word_reps = [dy.concatenate([forward_lstm.initial_state().transduce(emb)[-1],
                                  backward_lstm.initial_state().transduce(reversed(emb))[-1]]) for emb in embs]
     if not encode:
         return [dy.cdiv(rep, dy.l2_norm(rep)) for rep in word_reps]
     else:
         return [dy.cdiv(rep, dy.l2_norm(rep)).value() for rep in word_reps]
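Several of the examples below share the same idiom: dividing an expression by its own L2 norm to obtain a unit-length vector. A minimal, self-contained sketch of that pattern (the input vector here is made up purely for illustration):

import dynet as dy

dy.renew_cg()
v = dy.inputVector([3.0, 4.0])
unit_v = dy.cdiv(v, dy.l2_norm(v))    # divide by ||v|| = 5.0
print(unit_v.value())                 # approximately [0.6, 0.8]
print(dy.l2_norm(unit_v).value())     # approximately 1.0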
Example #2
 def embed(self, x: Union[batchers.Batch, numbers.Integral]) -> dy.Expression:
   if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
     batch_size = x.batch_size() if batchers.is_batched(x) else 1
     self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False)) for _ in range(batch_size)]
   emb_e = dy.parameter(self.embeddings)
   # single mode
   if not batchers.is_batched(x):
     if self.train and self.word_id_mask and x in self.word_id_mask[0]:
       ret = dy.zeros((self.emb_dim,))
     else:
       ret = dy.pick(emb_e, index=x)
       if self.fix_norm is not None:
         ret = dy.cdiv(ret, dy.l2_norm(ret))
         if self.fix_norm != 1:
           ret *= self.fix_norm
   # minibatch mode
   else:
     ret = dy.pick_batch(emb_e, x)
     if self.fix_norm is not None:
       ret = dy.cdiv(ret, dy.l2_norm(ret))
       if self.fix_norm != 1:
         ret *= self.fix_norm
     if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
       dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim for i in range(x.batch_size())]), batched=True)
       ret = dy.cmult(ret, dropout_mask)
   if self.train and self.weight_noise > 0.0:
     ret = dy.noise(ret, self.weight_noise)
   return ret
Example #3
 def embed(self, x):
   if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
     batch_size = x.batch_size() if xnmt.batcher.is_batched(x) else 1
     self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False)) for _ in range(batch_size)]
   # single mode
   if not xnmt.batcher.is_batched(x):
     if self.train and self.word_id_mask and x in self.word_id_mask[0]:
       ret = dy.zeros((self.emb_dim,))
     else:
       ret = self.embeddings[x]
       if self.fix_norm is not None:
         ret = dy.cdiv(ret, dy.l2_norm(ret))
         if self.fix_norm != 1:
           ret *= self.fix_norm
   # minibatch mode
   else:
     ret = self.embeddings.batch(x)
     if self.fix_norm is not None:
       ret = dy.cdiv(ret, dy.l2_norm(ret))
       if self.fix_norm != 1:
         ret *= self.fix_norm
     if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(x.batch_size())):
       dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim for i in range(x.batch_size())]), batched=True)
       ret = dy.cmult(ret, dropout_mask)
   if self.train and self.weight_noise > 0.0:
     ret = dy.noise(ret, self.weight_noise)
   return ret
Example #4
 def __cosine_loss(self, pred, gold):
     sn1 = dy.l2_norm(pred)
     sn2 = dy.l2_norm(gold)
     mult = dy.cmult(sn1, sn2)
     dot = dy.dot_product(pred, gold)
     div = dy.cdiv(dot, mult)
     vec_y = dy.scalarInput(2)
     res = dy.cdiv(1 - div, vec_y)
     return res
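The loss above maps cosine similarity from [-1, 1] to [0, 1]: identical directions give 0, opposite directions give 1. A self-contained sketch of the same computation on plain vectors (the helper name and inputs are illustrative, not taken from the example):

import dynet as dy

def cosine_loss(pred, gold):
    dot = dy.dot_product(pred, gold)
    norms = dy.cmult(dy.l2_norm(pred), dy.l2_norm(gold))
    return dy.cdiv(1 - dy.cdiv(dot, norms), dy.scalarInput(2))

dy.renew_cg()
v1 = dy.inputVector([1.0, 2.0, 3.0])
v2 = dy.inputVector([1.0, 2.0, 3.0])
print(cosine_loss(v1, v2).value())    # close to 0.0 for identical vectors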
Example #5
 def word_assoc_score(self, source_idx, target_idx, relation):
     """
     NOTE THAT DROPOUT IS BEING APPLIED HERE
     :param source_idx: embedding index of source atom
     :param target_idx: embedding index of target atom
     :param relation: relation type
     :return: score
     """
     # prepare
     s = self.embeddings[source_idx]
     if self.no_assoc:
         A = dy.const_parameter(self.word_assoc_weights[relation])
     else:
         A = dy.parameter(self.word_assoc_weights[relation])
      A = dy.dropout(A, self.dropout)
     t = self.embeddings[target_idx]
     
     # compute
     if self.mode == BILINEAR_MODE:
         return dy.transpose(s) * A * t
     elif self.mode == DIAG_RANK1_MODE:
         diag_A = dyagonalize(A[0])
         rank1_BC = A[1] * dy.transpose(A[2])
         ABC = diag_A + rank1_BC
         return dy.transpose(s) * ABC * t
     elif self.mode == TRANSLATIONAL_EMBED_MODE:
         return -dy.l2_norm(s - t + A)
     elif self.mode == DISTMULT:
         return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
Example #7
  def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    src = src.as_tensor()

    src_height = src.dim()[0][0]
    src_width = src.dim()[0][1]
    # src_channels = 1
    batch_size = src.dim()[1]

    # convolution and pooling layers
    # src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0]+3)
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1), stride = [self.stride[0], self.stride[0]], is_valid = True)) # ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, (1, 4), (1,2), is_valid = True) #((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1]+3)
    l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2), stride = [self.stride[1], self.stride[1]], is_valid = True))# ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, (1, 4), (1,2), is_valid = True)#((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3), stride = [self.stride[2], self.stride[2]], is_valid = True))# ((1, 248, 1024), 128)
    pool3 = dy.max_dim(l3, d = 1)

    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3,my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size = batch_size)

    return ExpressionSequence(expr_tensor=output)
Example #8
    def calculate_loss(self, src_file, tgt_file):

        # Renew the computation graph
        dy.renew_cg()

        # Initialize LSTMs
        enc_init_state_fwd = self.enc_lstm_fwd_builder.initial_state()
        enc_init_state_bwd = self.enc_lstm_bwd_builder.initial_state()

        # MLP to predict the duration
        W_d = dy.parameter(self.W_duration)
        b_d = dy.parameter(self.b_duration)

        # MLP to predict f0
        W_f0 = dy.parameter(self.W_f0)
        b_f0 = dy.parameter(self.b_f0)  # assumes a b_f0 bias parameter paired with W_f0

        input_frames = dy.inputTensor(np.loadtxt(src_file))
        output_frames = dy.inputTensor(np.loadtxt(tgt_file))
        len_tgt = len(np.loadtxt(tgt_file))
        input_frames_reverse = dy.inputTensor(np.flipud(np.loadtxt(src_file)))

        # Get the LSTM embeddings
        fwd_output = enc_init_state_fwd.add_inputs(
            [frame for frame in input_frames])[-1].output()
        bwd_output = enc_init_state_bwd.add_inputs(
            [frame for frame in input_frames_reverse])[-1].output()

        # Concatenate
        bilstm_embeddings = dy.concatenate([fwd_output, bwd_output])

        # Predict durations
        target_duration = self.mlp(bilstm_embeddings, W_d, b_d)
        duration_loss = dy.l2_norm(target_duration - len_tgt)

        # initialize decoder LSTM
        dec_init_state = self.dec_lstm_builder.initial_state().add_inputs(
            bilstm_embeddings)[-1].output()

        # Generate target frames
        prediction_loss = []
        for k in range(len_tgt):
            predicted_frame = self.mlp(dec_init_state, W_f0, b_f0)
            prediction_loss.append(
                dy.l2_norm(predicted_frame - output_frames[k]))
        return duration_loss, dy.esum(prediction_loss)
Example #9
def train_network(params, ntags, train_data, dev_set):
    global telemetry_file, randstring, MIN_ACC
    prev_acc = 0
    m = params[0]
    t0 = time.perf_counter()
    # train the network
    trainer = dy.SimpleSGDTrainer(m)
    total_loss = 0
    seen_instances = 0
    train_good = 0
    for train_x, train_y in train_data:
        dy.renew_cg()
        output = build_network(params, train_x)
        # l2 regularization over params[2:], scaled by REG_LAMBDA (the author found it unpromising)
        loss = -dy.log(output[train_y]) + REG_LAMBDA * sum(
            [dy.l2_norm(p) for p in params[2:]])
        if train_y == np.argmax(output.npvalue()):
            train_good += 1
        seen_instances += 1
        total_loss += loss.value()
        loss.backward()
        trainer.update()

        if seen_instances % 20000 == 0:
            # measure elapsed seconds
            secs = time.perf_counter() - t0
            t0 = time.perf_counter()
            good = case = 0
            max_dev_instances = 70 * 1000
            dev_instances = 0
            for x_tuple, dev_y in dev_set:
                output = build_network(params, x_tuple)
                if np.argmax(output.npvalue()) == dev_y:
                    good += 1
                case += 1
                dev_instances += 1
                if dev_instances >= max_dev_instances:
                    break
            acc = float(good) / case
            print(
                "iterations: {}. train_accuracy: {} accuracy: {} avg loss: {} secs per 1000:{}"
                .format(seen_instances,
                        float(train_good) / 20000, acc,
                        total_loss / (seen_instances + 1), secs / 20))
            train_good = 0
            if acc > MIN_ACC and acc > prev_acc:
                print("saving.")
                dy.save("params_" + randstring, list(params)[1:])
                prev_acc = acc

            telemetry_file.write("{}\t{}\t{}\t{}\n".format(
                seen_instances, acc, total_loss / (seen_instances + 1),
                secs / 20))
    MIN_ACC = max(prev_acc, MIN_ACC)
Example #10
    def learn(self, src, dst):
        softmax_list, aux_list = self._predict(src, dst=dst,  num_predictions=len(dst) + 1, runtime=False)
        for softmax, aux, entry in zip(softmax_list, aux_list, dst):
            word = entry.word.decode('utf-8').lower()
            if word in self.output_encodings.word2int:
                w_index = self.output_encodings.word2int[word]
            else:
                w_index = self.output_encodings.word2int["<UNK>"]

            w_emb, found = self.dst_we.get_word_embeddings(entry.word.decode('utf-8'))
            self.losses.append(-dy.log(dy.pick(softmax, w_index)))
            if found:
                vec1=aux
                vec2=dy.inputVector(w_emb)
                cosine = dy.dot_product(vec1, vec2) * dy.pow(dy.l2_norm(vec1) * dy.l2_norm(vec2),
                                                                       dy.scalarInput(-1))
                self.losses.append(dy.squared_distance(cosine, dy.scalarInput(1.0)))


        self.losses.append(-dy.log(dy.pick(softmax_list[-1], self.EOS)))
Example #11
 def embed(self, x: Union[batchers.Batch,
                          numbers.Integral]) -> dy.Expression:
     """
 Embed a single word in a sentence.
 :param x: A word id.
 :return: Embedded word.
 """
     ret = self._embed_word(x, batchers.is_batched(x))
      # apply fix_norm normalization if configured
     if self.fix_norm is not None:
         ret = dy.cdiv(ret, dy.l2_norm(ret)) * self.fix_norm
      # add weight noise only during training
     if self.train and self.weight_noise > 0.0:
         ret = dy.noise(ret, self.weight_noise)
     return ret
Example #12
 def calculate_loss(self, input, output):
   #dy.renew_cg()
   weight_matrix_array = []
   biases_array = []
   for (W,b) in zip(self.weight_matrix_array, self.biases_array):
        weight_matrix_array.append(dy.parameter(W))
        biases_array.append(dy.parameter(b)) 
   acts = self.act
   w = weight_matrix_array[0]
   b = biases_array[0]
   act = acts[0]
   intermediate = act(dy.affine_transform([b, w, input]))
   activations = [intermediate]
   for (W,b,g) in zip(weight_matrix_array[1:], biases_array[1:], acts[1:]):
       pred = g(dy.affine_transform([b, W, activations[-1]]))
       activations.append(pred)  
   losses = output - pred
   return dy.l2_norm(losses)
Example #13
 def l2_norm(self, with_embeddings=True):
     # specify regularization term: sum of Frobenius/L2-normalized weights
     # assume that we add to a computation graph
     reg = []
     # RNN weight matrices
     for rnn in (self.fbuffRNN, self.bbuffRNN, self.wordRNN):
         for exp in (e for layer in rnn.get_parameter_expressions() for e in layer):
             if len(exp.dim()[0]) != 1:
                 # this is not a bias term
                 reg.append(dy.l2_norm(exp))
      # classifier weight matrices
     reg.append(dy.l2_norm(self.pW_act.expr()))
     if self.MLP_DIM:
         reg.append(dy.l2_norm(self.pW_s2h.expr()))
     if with_embeddings:
         # add embedding params
         reg.append(dy.l2_norm(self.FEAT_LOOKUP.expr()))
         reg.append(dy.l2_norm(self.CHAR_LOOKUP.expr()))
         if not self.param_tying:
             reg.append(dy.l2_norm(self.ACT_LOOKUP.expr()))
     return 0.5 * dy.esum(reg)
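A minimal sketch of how a sum-of-norms regularizer like the one above is typically folded into a training objective; the toy model, data, and the 1e-4 coefficient below are illustrative assumptions, not part of the example:

import dynet as dy

model = dy.ParameterCollection()
pW = model.add_parameters((4, 3))
pb = model.add_parameters((4,))
trainer = dy.SimpleSGDTrainer(model)

dy.renew_cg()
W, b = dy.parameter(pW), dy.parameter(pb)
x = dy.inputVector([0.5, -1.0, 2.0])
y = dy.inputVector([1.0, 0.0, 0.0, 0.0])
pred = dy.logistic(W * x + b)
task_loss = dy.squared_distance(pred, y)

# regularize only the weight matrix, skipping the 1-d bias term,
# mirroring the dim()-based check in the example above
reg = 0.5 * dy.esum([dy.l2_norm(e) for e in (W, b) if len(e.dim()[0]) != 1])
loss = task_loss + 1e-4 * reg
loss.value()
loss.backward()
trainer.update()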
Example #14
def macro_node_iteration(opts, multi_graph, assoc_cache, trainer, log_file,
                         synsets, rel, src_i, use_assoc):
    """
    One node-relation iteration in a macro-level pass over the multigraph
    :param opts: parameter dictionary from calling model
    :param multi_graph: trained data structure
    :param assoc_cache: cache for association model
    :param trainer: dynet training module
    :param log_file: log file location
    :param synsets: synset name dictionary for reporting
    :param rel: relation type for iteration
    :param src_i: source node ID for iteration
    :param use_assoc: use association score model
    :return: state of cache after iteration
    """

    g = multi_graph.graphs[rel]
    N = multi_graph.vocab_size

    # set up iteration
    if opts.debug:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    else:
        dy.renew_cg()

    # keep existing score for all deltas
    multi_graph.rescore()
    score_with_all = multi_graph.dy_score

    # report progress
    perform_verbosity_steps = opts.v > 1 or (opts.v > 0 and src_i > 0
                                             and src_i % 10 == 0)
    if perform_verbosity_steps:
        timeprint('iterating on node {}, {}, current score = {:.6f}'\
                  .format(src_i, synsets[src_i], score_with_all.scalar_value()))

    # true targets scoring

    true_targets = targets(g, src_i)

    if len(true_targets) == 0:
        # don't perform negative sampling without true targets
        return assoc_cache

    # compute log likelihood on targets
    # each used to be multiplied by multi_graph.a_scale
    target_assoc_scores = {
        t: multi_graph.word_assoc_score(src_i, t, rel)
        for t in true_targets
    }
    if opts.no_assoc_bp:
        # turn into values to detach from computation graph
        target_assoc_scores = {
            t: t_as.value()
            for t, t_as in list(target_assoc_scores.items())
        }
    target_scores = {
        t: score_with_all + t_as
        for t, t_as in list(target_assoc_scores.items())
    }

    # false targets scoring - importance sampling

    # compute softmax over all false targets based on bilinear scores
    if use_assoc:
        assoc_sc = multi_graph.score_from_source_cache(assoc_cache, src_i)
        neg_assocs = {
            j: s
            for j, s in enumerate(assoc_sc)
            if j not in true_targets and j != src_i
        }
    else:
        neg_assocs = {
            j: 1.0
            for j in range(N) if j not in true_targets and j != src_i
        }
    neg_probs = softmaxify(neg_assocs)

    # collect negative samples
    # TODO see if searchsorted can work here too (issue in dynet repo)
    neg_samples = {t: [dy.np.random.choice(range(len(neg_assocs)), p=neg_probs)\
                      for _ in range(opts.neg_samp)]\
                   for t in true_targets} # sample without return?

    # for reporting
    if perform_verbosity_steps:
        neg_sample_idcs = []
        for negs in list(neg_samples.values()):
            neg_sample_idcs.extend([list(neg_assocs.keys())[j] for j in negs])

    # compute neg log likelihood on negative samples
    margins = []
    for t in true_targets:
        t_score = target_scores[t]
        negs = [list(neg_assocs.keys())[j] for j in neg_samples[t]]
        # each used to be multiplied by multi_graph.a_scale
        neg_assoc_scores = [
            multi_graph.word_assoc_score(src_i, j, rel) for j in negs
        ]
        if opts.no_assoc_bp:
            # turn into values to detach from computation graph
            neg_assoc_scores = [s.value() for s in neg_assoc_scores]
        # prepare graph for pass
        multi_graph.remove_edge(src_i, t, rel, permanent=True)
        t_cache = (copy.deepcopy(multi_graph.cache),
                   copy.deepcopy(multi_graph.feature_vals))
        for jas, j, origj in zip(neg_assoc_scores, negs, neg_samples[t]):
            q_norm = 1.0 / neg_probs[origj]
            g_score = multi_graph.add_edge(src_i,
                                           j,
                                           rel,
                                           caches=t_cache,
                                           report_feat_diff=opts.v > 1)
            margins.append(
                dy.rectify(g_score + jas + MARGIN - t_score) * q_norm)
            log_file.write('{}\t{}\t{}\t{}\t{:.2e}\t{:.2e}\t{:.2e}\n'\
                         .format(rel, src_i, t, j, t_score.scalar_value(),
                                 g_score.scalar_value(), jas if type(jas) == float else jas.value()))
        # revert graph for next margin iteration
        multi_graph.add_edge(src_i, t, rel, permanent=True)
    node_loss = dy.esum(margins)

    # backprop and recompute score
    if perform_verbosity_steps:
        timeprint('selected nodes {} with probabilities {}'\
                  .format(neg_sample_idcs, ['{:.2e}'.format(neg_probs[n]) for n in neg_samples]))
        timeprint('overall {} loss = {:.6f}'\
                  .format('margin' if opts.margin_loss else 'neg log', node_loss.scalar_value()))

        # record state for later reporting
        pre_weights = multi_graph.ergm_weights.as_array()
        pre_assoc = multi_graph.word_assoc_weights[rel].as_array()

    # add regularization
    if multi_graph.regularize > 0.0:
        node_loss += multi_graph.regularize * dy.l2_norm(
            dy.parameter(multi_graph.ergm_weights))

    # perform actual learning
    node_loss.backward()
    trainer.update()

    if perform_verbosity_steps:
        post_weights = multi_graph.ergm_weights.as_array()
        post_assoc = multi_graph.word_assoc_weights[rel].as_array()
        w_diff = post_weights - pre_weights
        a_diff = post_assoc - pre_assoc
        timeprint('changed weights = {}'.format(len(w_diff.nonzero()[0])))
        timeprint('changed pre_assoc = {}, norm {}'\
                  .format(len(a_diff.nonzero()[0]), np.linalg.norm(a_diff)))

    # recompute assoc_cache columns for src_i and participating targets
    if use_assoc and not opts.no_assoc_bp:
        # TODO normalize embeddings?
        return multi_graph.source_ranker_cache(rel)
    return assoc_cache
Example #15
 def Cosine(self, v1, v2):
   return dy.cdiv(dy.dot_product(v1, v2),
                  dy.l2_norm(v1) * dy.l2_norm(v2))
Example #16
 def regularization_loss(self, coef=0.001):
     losses = [
         dy.l2_norm(p)**2 for p in self.model.parameters_list()
         if p.name().startswith('/linearW')
     ]
     return (coef / 2) * dy.esum(losses)
Example #17
    def calculate_loss(self, input, output, tgtspk):
        # Initial layer
        weight_matrix_array = []
        biases_array = []
        acts = []
        if debug:
            print "The number of generic biases: ", len(self.biases_array)
            print "The number of generic acts: ", len(self.act_generic)
        # Generic layers
        for (W, b, a) in zip(self.weight_matrix_array, self.biases_array,
                             self.act_generic):
            weight_matrix_array.append(dy.parameter(W))
            biases_array.append(dy.parameter(b))
            acts.append(a)
        # Specific layers
        length = len(self.postspecificlayers)
        start_index = (tgtspk - 1) * length
        idx = 0
        if debug:
            print "The number of specific biases: ", len(
                self.biases_array[start_index:start_index + length])
            print "The number of specific acts: ", len(self.act_postspecific)
        for (W, b, a) in zip(
                self.specific_weights_array[start_index:start_index + length],
                self.specific_biases_array[start_index:start_index + length],
                self.act_postspecific):
            weight_matrix_array.append(dy.parameter(W))
            biases_array.append(dy.parameter(b))
            acts.append(a)
        # Final Layer
        weight_matrix_array.append(dy.parameter(self.W_final))
        biases_array.append(dy.parameter(self.b_final))
        acts.append(self.act_final)

        w = weight_matrix_array[0]
        b = biases_array[0]
        act = acts[0]
        intermediate = act(dy.affine_transform([b, w, input]))
        if debug:
            print "Here are the dimensions of the biases: ", [
                len(k.value()) for k in biases_array
            ]
            print "Here are the acts: ", [k for k in acts]
            print "Dimensions of the intermediate: "
            print len(intermediate.value())
        activations = [intermediate]
        count = 1
        for (W, b, g) in zip(weight_matrix_array[1:], biases_array[1:],
                             acts[1:]):
            if debug:
                print "Adding to the layer number: ", count
                print "Total layers: ", self.number_of_layers
            if count == self.number_of_layers - 1:
                t = dy.concatenate([activations[-1], input])
                pred = g(dy.affine_transform([b, W, t]))
            else:
                pred = g(dy.affine_transform([b, W, activations[-1]]))
            activations.append(pred)
            count += 1
        if debug:
            print "Activation dimensions are : ", [
                len(k.value()) for k in activations
            ]
            print "Output dimensions are: ", len(output.value())
        losses = output - pred
        return dy.l2_norm(losses)
Example #18
# define trainable projection layer from word dim to phrase dim
# this simplifies concatenation and allows us to treat the recursive base case as a phrase of its own
word_to_phrase_projection = model.add_parameters((config.sent_dim, word_dim))


# define graph building operation
def generate_graph(parse):
    parse_graph = parse.to_tree()
    return graph_gen_helper(parse_graph)


def graph_gen_helper(node):
    node_value = word_to_phrase_projection * embeddings[node.data.form]

    for child in node:
        child_subtree = graph_gen_helper(child)

        # concatenate the node so far with the subtree, select layer according to dep reln
        node_value = dep_layers[child.data.deprel] * dynet.concatenate(
            [node_value, child_subtree])

    return node_value


# run training
for parse, y in zip(parse_train, y_preds):
    y_pred = generate_graph(parse)
    loss = dynet.l1_distance(dynet.l2_norm(y_pred), dynet.l2_norm(y))

# run eval
Example #19
 def regularization_loss(self, coef=1e-4):
     losses = [dy.l2_norm(p)**2 for p in self.model.parameters_list()]
     return (coef / 2) * dy.esum(losses)