Example #1
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
Example #2
def calc_scores(words):
  # Create a computation graph, and add parameters
  dy.renew_cg()
  # Take the sum of all the embedding vectors for each word
  score = dy.esum([dy.lookup(W, x) for x in words])
  # Add the bias vector and return
  return score + b
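calc_scores above relies on globals that are not shown here. A minimal sketch of a setup and training step that would be consistent with it, assuming a DyNet version where Parameters can be used directly in expressions (all names and sizes below are assumptions, not taken from the original project):

import dynet as dy

nwords, ntags = 10000, 5                            # assumed vocabulary and tag-set sizes
model = dy.ParameterCollection()
W = model.add_lookup_parameters((nwords, ntags))    # one score vector per word
b = model.add_parameters((ntags,))                  # shared bias over tags
trainer = dy.SimpleSGDTrainer(model)

def train_example(words, tag):
    # calc_scores() renews the graph, sums the per-word score vectors and adds b
    loss = dy.pickneglogsoftmax(calc_scores(words), tag)
    loss_value = loss.value()                       # forward pass
    loss.backward()
    trainer.update()
    return loss_value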
def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # iterate through the input states to compute attention scores
    # alternative scoring: scores = [W_a * pc.concatenate([h_t, h_input]) for h_input in blstm_outputs]
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # normalize the scores to alphas using a softmax
    alphas = pc.softmax(pc.concatenate(scores))
    # compute the context vector c as the alpha-weighted sum of the input states
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # compute the output state h~ from c and the decoder state h_t
    # (global attention variant of Luong et al., 2015)
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))

    return h_output, alphas, W__a.value()
Example #4
 def finalize(self, finished_epoch=False, **kwargs):
     """
     Fit this model on collected samples
     :return self
     """
     super().finalize(finished_epoch=finished_epoch, **kwargs)
     assert self.model, "Cannot finalize a model without initializing it first"
     if self.losses:
         loss = dy.esum(self.losses)
         loss.forward()
         self.config.print(lambda: "Total loss from %d time steps: %g" % (self.steps, loss.value()), level=4)
         loss.backward()
         try:
             self.trainer.update()
         except RuntimeError as e:
             Config().log("Error in update(): %s\n" % e)
         self.init_cg()
         self.losses = []
         self.steps = 0
         self.updates += 1
     if finished_epoch:
         self.trainer.learning_rate /= (1 - self.learning_rate_decay)
     if self.config.args.verbose > 2:
         self.trainer.status()
     return self
Example #5
def calc_sent_loss(sent):
  # Create a computation graph
  dy.renew_cg()
  
  # add padding to the sentence equal to the size of the window
  # as we also need to predict the eos, the future window at that point extends N positions past the end
  padded_sent = [S] * N + sent + [S] * N
  padded_emb = [W_c_p[x] for x in padded_sent]

  # Step through the sentence
  all_losses = [] 
  for i in range(N,len(sent)+N):
    c = dy.esum(padded_emb[i-N:i] + padded_emb[i+1:i+N+1])
    s = W_w * c
    all_losses.append(dy.pickneglogsoftmax(s, padded_sent[i]))
  return dy.esum(all_losses)
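calc_sent_loss above reads several globals (the window size N, the padding symbol S, the context embeddings W_c_p and the output matrix W_w). A hedged sketch of definitions consistent with that usage (all sizes and the trainer are assumptions):

import dynet as dy

nwords = 10000                                            # assumed vocabulary size
N = 2                                                     # window size
EMB_SIZE = 128
model = dy.ParameterCollection()
W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE))   # context word embeddings
W_w = model.add_parameters((nwords, EMB_SIZE))            # output layer: scores over the vocabulary
S = 0                                                     # id of the sentence-boundary symbol (assumed)
trainer = dy.SimpleSGDTrainer(model)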
Example #6
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    #now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        #feed the previous word's embedding into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word
    return dy.esum(all_losses)
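The encoder-decoder loss above also leans on module-level state. A sketch of parameter definitions that would be consistent with the names it uses (layer count and all sizes are assumptions):

import dynet as dy

nwords_src, nwords_trg = 10000, 10000          # assumed vocabulary sizes
EMB_SIZE, HID_SIZE = 64, 128                   # assumed dimensions
model = dy.ParameterCollection()
LOOKUP_SRC = model.add_lookup_parameters((nwords_src, EMB_SIZE))
LOOKUP_TRG = model.add_lookup_parameters((nwords_trg, EMB_SIZE))
LSTM_SRC_BUILDER = dy.VanillaLSTMBuilder(1, EMB_SIZE, HID_SIZE, model)
LSTM_TRG_BUILDER = dy.VanillaLSTMBuilder(1, EMB_SIZE, HID_SIZE, model)
# with one decoder layer, set_s() expects [c, h]; both are HID_SIZE vectors,
# which matches set_s([src_output, dy.tanh(src_output)]) above
W_sm_p = model.add_parameters((nwords_trg, HID_SIZE))
b_sm_p = model.add_parameters((nwords_trg,))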
Example #7
def sent_loss(words, tags):
    vecs = build_tagging_graph(words)
    errs = []
    for v,t in zip(vecs,tags):
        tid = vt.w2i[t]
        err = dy.pickneglogsoftmax(v, tid)
        errs.append(err)
    return dy.esum(errs)
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the previous words' embeddings into the decoder LSTM
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
Example #9
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    #calculate the probability distribution 
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    probs = [dy.softmax(score).npvalue() for score in scores]

    #then take samples from the probability distribution
    samples = [np.random.choice(range(len(x)), p=x) for x in probs]

    #calculate accuracy=reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct))/len(correct)
    r = dy.constant((1), r_i)
    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]

    #we use nobackprop to break the computation graph here,
    #as the baseline is trained separately and is not backpropagated through when training the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    #the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]

    #then calculate the reinforce scores using reinforce
    reinforce_scores = [r_s*score for r_s, score in zip(rewards_over_baseline, scores)]

    #we want the first len(sent)-delta scores from xent then delta scores from reinforce
    #for mixer
    if len(scores) > delta:
        mixer_scores = scores[:len(scores)-delta] + reinforce_scores[len(scores)-delta:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
Example #10
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal of the log-covariance matrix from the encoder
    # (treating it as the log variance makes the implementation easier)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(u(x), sigma(x)) || N(0, I)]
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word's embedding into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
Example #11
    def loss(self, preds, y):
        if self.do_crf is True:
            return self.crf.neg_log_loss(preds, y.squeeze(0))
        else:
            element_loss = dy.pickneglogsoftmax
            errs = []

            for pred, y_i in zip(preds, y.T):
                err = element_loss(pred, y_i)
                errs.append(err)
            return dy.esum(errs)
Example #12
def transduce(seq,Y):
    seq = [E[i] for i in seq]
    fw = fwR.initial_state().transduce(seq)

    # this UNUSED part affects strategy 2
    XXX = fwR2.initial_state().transduce([E[3],E[5]])

    W = W_.expr()
    outs = [W*z for z in fw]
    losses = [dy.pickneglogsoftmax(o,y) for o,y in zip(outs,Y)]
    s = dy.esum(losses)
    return s
Example #13
def calc_sent_loss(sent):
  # Create a computation graph
  dy.renew_cg()
  # The initial history is equal to end of sentence symbols
  hist = [S] * N
  # Step through the sentence, including the end of sentence token
  all_losses = []
  for next_word in sent + [S]:
    s = calc_score_of_history(hist)
    all_losses.append(dy.pickneglogsoftmax(s, next_word))
    hist = hist[1:] + [next_word]
  return dy.esum(all_losses)
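calc_score_of_history is not shown on this page. A hypothetical version consistent with the usage above, i.e. a feed-forward n-gram language model (all parameter names and sizes are assumptions):

# Concatenate the N history embeddings, apply one hidden layer, and return
# unnormalized scores over the vocabulary.
def calc_score_of_history(hist):
  emb = dy.concatenate([W_emb[x] for x in hist])   # W_emb: assumed lookup parameters
  h = dy.tanh(W_h * emb + b_h)                     # W_h, b_h: assumed hidden-layer parameters
  return W_out * h + b_out                         # W_out, b_out: assumed output-layer parameters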
Example #14
 def sent_lm_loss(self, sent):
   rnn_cur = self.rnn.initial_state()
   losses = []
   prev_word = self.start
   for word in sent:
     x_t = self.embeddings[prev_word]
     rnn_cur = rnn_cur.add_input(x_t)
     logits = dy.affine_transform([self.lb,
                                   self.h2l,
                                   rnn_cur.output()])
     losses.append(dy.pickneglogsoftmax(logits, word))
     prev_word = word
   return dy.esum(losses)
Example #15
    def BuildLMGraph(self, sent):
        dy.renew_cg()
        init_state = self.builder.initial_state()

        errs = [] # will hold expressions
        es=[]
        state = init_state
        inputs = [self.lookup[int(cw)] for cw in sent[:-1]]
        expected_outputs = [int(nw) for nw in sent[1:]]
        outputs = state.transduce(inputs)
        r_ts = ((self.bias + (self.R * y_t)) for y_t in outputs)
        errs = [dy.pickneglogsoftmax(r_t, eo) for r_t, eo in zip(r_ts, expected_outputs)]
        nerr = dy.esum(errs)
        return nerr
Example #16
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2*dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v*dy.tanh(w1*input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))
    output_vectors = dy.esum([vector*attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
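attend above reads three global attention parameters. Shapes that would be consistent with its use here, assuming the encoder vectors have size STATE_SIZE*2, a single-layer decoder LSTM, and a parameter collection named model (all assumptions):

ATTENTION_SIZE = 32                                                        # assumed
attention_w1 = model.add_parameters((ATTENTION_SIZE, STATE_SIZE * 2))      # projects each encoder vector
attention_w2 = model.add_parameters((ATTENTION_SIZE, STATE_SIZE * 2))      # projects concatenate(state.s()) for a 1-layer LSTM
attention_v = model.add_parameters((1, ATTENTION_SIZE))                    # turns each projection into a scalar score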
def attend2(blstm_outputs, s_prev, y_feedback, v_a, W_a, U_a, U_o, V_o, C_o):

    # attention mechanism - Bahdanau style
    # iterate through the input states to compute attention scores, then normalize them to alphas

    # W_a: hidden x hidden, U_a: hidden x 2 hidden, v_a: hidden, each score: scalar
    scores = [v_a * pc.tanh(W_a * s_prev + U_a * h_j) for h_j in blstm_outputs]
    alphas = pc.softmax(pc.concatenate(scores))

    # c_i: 2 hidden
    c_i = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # U_o = 2l x hidden, V_o = 2l x input, C_o = 2l x 2 hidden
    attention_output_vector = U_o * s_prev + V_o * y_feedback + C_o * c_i

    return attention_output_vector, alphas
Example #18
    def build_lm_graph(self, sent):
        dy.renew_cg()
        init_state = self.builder.initial_state()

        errs = [] # will hold expressions
        es=[]
        state = init_state
        for (cw,nw) in zip(sent,sent[1:]):
            # assume word is already a word-id
            x_t = dy.lookup(self.lookup, int(cw))
            state = state.add_input(x_t)
            y_t = state.output()
            r_t = self.bias + (self.R * y_t)
            err = dy.pickneglogsoftmax(r_t, int(nw))
            errs.append(err)
        nerr = dy.esum(errs)
        return nerr
def calc_sent_loss(sent):
  # Create a computation graph
  dy.renew_cg()

  
  # Get embeddings for the sentence
  emb = [W_w_p[x] for x in sent]

  # Step through the sentence and calculate binary prediction losses
  all_losses = [] 
  for i, my_emb in enumerate(emb):
    scores = dy.logistic(W_c * my_emb)
    pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] +
                 [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)])
    word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words]
    word_repr = [dy.inputVector(x) for x in word_repr]
    all_losses.extend([dy.binary_log_loss(scores, x) for x in word_repr])
  return dy.esum(all_losses)
Example #20
    def BuildLMGraph(self, sents):
        dy.renew_cg()
        # initialize the RNN
        init_state = self.builder.initial_state()
        # parameters -> expressions
        R = dy.parameter(self.R)
        bias = dy.parameter(self.bias)

        S = vocab.w2i["<s>"]
        # get the cids and masks for each step
        tot_chars = 0
        cids = []
        masks = []

        for i in range(len(sents[0])):
            cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
            mask = [(1 if len(sent)>i else 0) for sent in sents]
            masks.append(mask)
            tot_chars += sum(mask)

        # start the rnn with "<s>"
        init_ids = cids[0]
        s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

        losses = []

        # feed char vectors into the RNN and predict the next char
        for cid, mask in zip(cids[1:], masks[1:]):
            score = dy.affine_transform([bias, R, s.output()])
            loss = dy.pickneglogsoftmax_batch(score, cid)
            # mask the loss if at least one sentence is shorter
            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1,), len(sents))
                loss = loss * mask_expr

            losses.append(loss)
            # update the state of the RNN
            cemb = dy.lookup_batch(self.lookup, cid)
            s = s.add_input(cemb)

        return dy.sum_batches(dy.esum(losses)), tot_chars
Example #21
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
Example #22
def calc_sent_loss(sent):
  # Create a computation graph
  dy.renew_cg()
  
  # Get embeddings for the sentence
  emb = [W_w_p[x] for x in sent]

  # Sample K negative words for each predicted word at each position
  all_neg_words = np.random.choice(nwords, size=2*N*K*len(emb), replace=True, p=word_probabilities)

  # W_w = dy.parameter(W_w_p)
  # Step through the sentence and calculate the negative and positive losses
  all_losses = [] 
  for i, my_emb in enumerate(emb):
    neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N]
    pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] +
                 [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)])
    neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
    pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
    all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))
  return dy.esum(all_losses)
Example #23
def calc_lm_loss(sents):
    dy.renew_cg()

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
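calc_lm_loss above iterates over range(len(sents[0])), so it implicitly assumes the first sentence of each batch is the longest one. A hedged sketch of a driver loop that satisfies this (MB_SIZE, trainer and the training data are assumptions):

import random

MB_SIZE = 16                                     # assumed minibatch size
train.sort(key=lambda s: -len(s))                # longest first, so sents[0] is the longest in each batch
train_order = list(range(0, len(train), MB_SIZE))
random.shuffle(train_order)
for sid in train_order:
    batch = train[sid:sid + MB_SIZE]
    loss_exp, batch_words = calc_lm_loss(batch)
    loss_exp.value()                             # forward
    loss_exp.backward()
    trainer.update()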
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": []}

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)

            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_lang_lstm_state = prev_lang_lstm.initial_state().add_input(lang_enc('*BOS*'))

    all_losses = []
    lang_prec = 0.0
    lang_items = 0

    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):

        # create the mlp input, a concatenate of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate([bilstm_output, prev_lang_lstm_state.output()])

        # run through the class mlp
        lang_mlp_output = lang_mlp(mlp_input)
        predicted_word_lang = lang_vocab.getItem(np.argmax(lang_mlp_output.npvalue()))
        confidence = np.max(lang_mlp_output.npvalue()) / np.sum(lang_mlp_output.npvalue())
        lang_prec += 1 if predicted_word_lang == gold_word_lang else 0
        lang_items += 1


        tagged_daf["words"].append(
            {"word": word, "predicted_lang": predicted_word_lang, "confidence": confidence})
        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            all_losses.append(-dy.log(dy.pick(lang_mlp_output, lang_vocab[gold_word_lang])))
            word_pos_ans = gold_word_lang
        # otherwise, set the answer to be the argmax
        elif not fRunning and fValidation:
            lang_conf_matrix(lang_vocab[predicted_word_lang], lang_vocab[gold_word_lang])
            word_pos_ans = predicted_word_lang
        else:
            continue

        # run through the prev-pos-mlp
        prev_lang_lstm_state = prev_lang_lstm_state.add_input(lang_enc(word_pos_ans))

        # prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))



    lang_prec = lang_prec / lang_items if lang_items > 0 else None
    # class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return lang_prec, tagged_daf

    if fRunning:
        return tagged_daf

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, lang_prec
Example #25
all_time = 0
sents = 0  # sentence counter used below
for ITER in range(100):
    random.shuffle(train)
    closs = 0.0
    cwords = 0
    start = time.time()
    batch = []
    for i, tree in enumerate(train, 1):
        sents += 1
        W = dy.parameter(W_)
        h, c = builder.expr_for_tree(tree, True)
        nodes = tree.nonterms()
        losses = [
            dy.pickneglogsoftmax(W * nt._e, l2i[nt.label]) for nt in nodes
        ]
        loss = dy.esum(losses)
        batch.append(loss)
        if len(batch) == 50:
            loss = dy.esum(batch)
            closs += loss.value()
            cwords += len(nodes)
            loss.backward()
            trainer.update()
            batch = []
            dy.renew_cg()
        if sents % 1000 == 0:
            trainer.status()
            print(closs / cwords, file=sys.stderr)
            closs = 0.0
            cwords = 0
    all_time += time.time() - start
    def em_example(self, example):
        encoder_input = example[0]
        ground_labels = example[1]

        goal_vector = encoder_input[0]
        encoder_input = encoder_input[1]

        num_utterances = len(encoder_input)

        logits = self.MLP(goal_vector)
        sentence_initial_state = self.sentence_encoder.initial_state()
        pzs = []
        for sentence in encoder_input:
            embedded_sentence = [self.embeddings[word] for word in sentence]
            final_state = sentence_initial_state.transduce(
                embedded_sentence)[-1]
            final_state = dy.concatenate([final_state, logits])
            # Stochastic node:
            pzs.append(self.prob_z(final_state))

        context_initial_state = self.context_encoder.initial_state()
        context_state = context_initial_state
        z_list = []
        # Expectation
        e_time = time.time()
        for idx in range(num_utterances):
            pz = dy.nobackprop(pzs[idx])
            max_prob = dy.scalarInput(-999999999)
            z_star = -999999999
            z_star_onehot = self.onehotzs[0]
            for z in range(self.num_clusters):
                one_hot_z = self.onehotzs[z]
                state = context_state.add_input(one_hot_z).h()[-1]
                log_papx = dy.nobackprop(
                    self.log_prob_papx(example, idx, state))
                log_pz = dy.nobackprop(dy.log(dy.pick(pz, z)))
                # print("PZ: {}".format(log_pz.npvalue()))
                # print("PAPX: {}".format(log_papx.npvalue()))
                log_prob = dy.esum([log_papx, log_pz])
                # print("PROB: {}".format(log_prob.npvalue()))
                if log_prob.value() > max_prob.value():
                    max_prob = log_prob
                    z_star = z
                    z_star_onehot = one_hot_z
            # print(z_star)
            # print(max_prob.npvalue())
            # print(z_star)
            context_state = context_state.add_input(z_star_onehot)
            z_list.append(z_star)
        self.e_time += (time.time() - e_time)

        # Maximization
        m_time = time.time()
        context_state = context_initial_state
        probs = []
        for idx in range(num_utterances):
            pz = pzs[idx]
            z = z_list[idx]
            log_pz = dy.log(dy.pick(pz, z))

            one_hot_z = self.onehotzs[z]
            state = context_state.add_input(one_hot_z).h()[-1]
            # NO BACKPROP:
            log_papx = dy.nobackprop(self.log_prob_papx(example, idx, state))

            log_prob = dy.esum([log_papx, log_pz])
            probs.append(log_prob)
            # probs.append(log_papx)
        probs = dy.esum(probs)
        self.m_time += (time.time() - m_time)
        # TODO: check this:
        return (-probs, z_list)
Example #27
def identify_frames(builders,
                    tokens,
                    postags,
                    lexunit,
                    targetpositions,
                    goldframe=None):
    renew_cg()
    trainmode = (goldframe is not None)

    sentlen = len(tokens) - 1
    emb_x = [v_x[tok] for tok in tokens]
    pos_x = [p_x[pos] for pos in postags]

    emb2_xi = []
    for i in range(sentlen + 1):
        if tokens[i] in pretrained_embeddings_map:
            # If update set to False, prevents pretrained embeddings from being updated.
            emb_without_backprop = lookup(e_x, tokens[i], update=True)
            features_at_i = concatenate(
                [emb_x[i], pos_x[i], emb_without_backprop])
        else:
            features_at_i = concatenate([emb_x[i], pos_x[i], u_x])
        emb2_xi.append(w_e * features_at_i + b_e)

    emb2_x = [rectify(emb2_xi[i]) for i in range(sentlen + 1)]

    # initializing the two LSTMs
    if USE_DROPOUT and trainmode:
        builders[0].set_dropout(DROPOUT_RATE)
        builders[1].set_dropout(DROPOUT_RATE)
    f_init, b_init = [i.initial_state() for i in builders]

    fw_x = f_init.transduce(emb2_x)
    bw_x = b_init.transduce(reversed(emb2_x))

    # only using the first target position - summing them hurts :(
    targetembs = [
        concatenate([fw_x[targetidx], bw_x[sentlen - targetidx - 1]])
        for targetidx in targetpositions
    ]
    targinit = tlstm.initial_state()
    target_vec = targinit.transduce(targetembs)[-1]

    valid_frames = list(lufrmmap[lexunit.id])
    chosenframe = valid_frames[0]
    logloss = None
    if len(valid_frames) > 1:
        if USE_HIER and lexunit.id in relatedlus:
            lu_vec = esum([lu_x[luid] for luid in relatedlus[lexunit.id]])
        else:
            lu_vec = lu_x[lexunit.id]
        fbemb_i = concatenate([target_vec, lu_vec, lp_x[lexunit.posid]])
        # TODO(swabha): Add more Baidu-style features here.
        f_i = w_f * rectify(w_z * fbemb_i + b_z) + b_f
        if trainmode and USE_DROPOUT:
            f_i = dropout(f_i, DROPOUT_RATE)

        logloss = log_softmax(f_i, valid_frames)

        if not trainmode:
            chosenframe = np.argmax(logloss.npvalue())

    if trainmode:
        chosenframe = goldframe.id

    losses = []
    if logloss is not None:
        losses.append(pick(logloss, chosenframe))

    prediction = {
        tidx: (lexunit, Frame(chosenframe))
        for tidx in targetpositions
    }

    objective = -esum(losses) if losses else None
    return objective, prediction
Example #28
    def Train(self, trainData, options):
        mloss = 0.0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ninf = -float('inf')

        beg = time.time()
        start = time.time()

        random.shuffle(
            trainData
        )  # in certain cases the data will already have been shuffled after being read from file or while creating dev data
        print "Length of training data: ", len(trainData)

        errs = []

        self.feature_extractor.Init(options)

        for iSentence, sentence in enumerate(trainData, 1):
            if iSentence % 100 == 0:
                loss_message = 'Processing sentence number: %d'%iSentence + \
                ' Loss: %.3f'%(eloss / etotal)+ \
                ' Errors: %.3f'%((float(eerrors)) / etotal)+\
                ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
                ' Time: %.2gs'%(time.time()-start)
                print loss_message
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0
                lerrors = 0

            sentence = deepcopy(
                sentence
            )  # ensures we are working with a clean copy of sentence and allows memory to be recycled each time round the loop

            conll_sentence = [
                entry for entry in sentence
                if isinstance(entry, utils.ConllEntry)
            ]
            conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
            self.feature_extractor.getWordEmbeddings(conll_sentence, True,
                                                     options)
            stack = ParseForest([])
            buf = ParseForest(conll_sentence)
            hoffset = 1 if self.headFlag else 0

            for root in conll_sentence:
                root.lstms = [root.vec] if self.headFlag else []
                root.lstms += [
                    self.feature_extractor.paddingVec
                    for _ in range(self.nnvecs - hoffset)
                ]
                root.relation = root.relation if root.relation in self.irels else 'runk'

            while not (len(buf) == 1 and len(stack) == 0):
                scores = self.__evaluate(stack, buf, True)

                #to ensure that we have at least one wrong operation
                scores.append([(None, 4, ninf, None)])

                stack_ids = [sitem.id for sitem in stack.roots]

                s1 = [stack.roots[-2]] if len(stack) > 1 else []
                s0 = [stack.roots[-1]] if len(stack) > 0 else []
                b = [buf.roots[0]] if len(buf) > 0 else []
                beta = buf.roots[1:] if len(buf) > 1 else []

                costs, shift_case = self.calculate_cost(
                    scores, s0, s1, b, beta, stack_ids)

                bestValid = list(
                    (s for s in chain(*scores)
                     if costs[s[1]] == 0 and (s[1] == SHIFT or s[1] == SWAP
                                              or s[0] == s0[0].relation)))

                bestValid = max(bestValid, key=itemgetter(2))
                bestWrong = max(
                    (s for s in chain(*scores)
                     if costs[s[1]] != 0 or (s[1] != SHIFT and s[1] != SWAP
                                             and s[0] != s0[0].relation)),
                    key=itemgetter(2))

                #force swap
                if costs[SWAP] == 0:
                    best = bestValid
                else:
                    #select a transition to follow
                    # + aggressive exploration
                    #1: might want to experiment with that parameter
                    if bestWrong[1] == SWAP:
                        best = bestValid
                    else:
                        best = bestValid if (
                            (not self.oracle) or
                            (bestValid[2] - bestWrong[2] > 1.0) or
                            (bestValid[2] > bestWrong[2]
                             and random.random() > 0.1)) else bestWrong

                if best[1] == LEFT_ARC or best[1] == RIGHT_ARC:
                    child = s0[0]

                #updates for the dynamic oracle
                if self.oracle:
                    self.oracle_updates(best, b, s0, stack_ids, shift_case)

                self.apply_transition(best, stack, buf, hoffset)

                if bestValid[2] < bestWrong[2] + 1.0:
                    loss = bestWrong[3] - bestValid[3]
                    mloss += 1.0 + bestWrong[2] - bestValid[2]
                    eloss += 1.0 + bestWrong[2] - bestValid[2]
                    errs.append(loss)

                #labeled errors
                if best[1] == LEFT_ARC or best[1] == RIGHT_ARC:
                    if (child.pred_parent_id != child.parent_id
                            or child.pred_relation != child.relation):
                        lerrors += 1
                        #attachment error
                        if child.pred_parent_id != child.parent_id:
                            eerrors += 1

                #??? when did this happen and why?
                if best[1] == 0 or best[1] == 2:
                    etotal += 1

            #footnote 8 in Eli's original paper
            if len(errs) > 50:  # or True:
                eerrs = dy.esum(errs)
                scalar_loss = eerrs.scalar_value()  #forward
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []

                dy.renew_cg()
                self.feature_extractor.Init(options)

        if len(errs) > 0:
            eerrs = (dy.esum(errs))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            dy.renew_cg()

        self.trainer.update()
        print "Loss: ", mloss / iSentence
        print "Total Training Time: %.2gs" % (time.time() - beg)
    def train(self, train_data, dev_data, num_epochs=150, batch_size=10):

        for I in range(num_epochs):

            print("EPOCH NUMBER {}".format(I))

            avg_loss = 0.
            random.shuffle(train_data)
            good, bad = 0., 0.
            avg_edit_distance = 0.
            q = 0.
            losses = []

            preds = []

            for i, (x, y) in enumerate(train_data):

                if i % batch_size == 0 and i > 0:

                    loss_sum = dy.esum(losses)
                    loss_sum.forward()
                    loss_sum.backward()
                    self.trainer.update()
                    losses = []

                    # evaluate trainset accuracy

                    for (word_probs, y_true) in preds:

                        generated_string = ""
                        for char_probs in word_probs:

                            generated_string += self.I2C[np.argmax(
                                char_probs.npvalue())]

                        if generated_string == y_true:

                            good += 1
                        else:
                            bad += 1

                    preds = []
                    dy.renew_cg()

                encoded_state, encoded_x = self.encode(x, y, train=True)

                loss, probs = self.decode(encoded_state,
                                          y,
                                          encoded_x,
                                          train=True)
                preds.append((probs, y))

                losses.append(loss)

                if i % 2000 == 0 and i > 0:
                    print(i)
                    #print (avg_loss)
                    avg_loss = 0.
                    #self.test(dev_data)

            #print ('DROPOUT = 0.5')
            #self.embedding_collector.collect()
            print("training accuracy: {}".format(good / (good + bad)))
            acc, edit_dis = self.evaluate(dev_data)
            self.accs.append(acc)

            patience = 20

            if I > 8 and abs(
                    min(self.accs[-patience:]) -
                    max(self.accs[-patience:])) < 0.01:

                return 0

            if acc > self.best_acc:
                self.best_acc = acc
                self.model.save("preds-orto-no-diac-embs-cyclic.m")

            #self.embedding_collector.collect()

        return 0
    def train(self, train_file, epochs, validation_file):
        plot_on = True
        # matplotlib config
        loss_values = []
        validation_data = pickle.load(open(validation_file, 'rb'))
        validation_accs, train_accs = [], []

        train_data_original = pickle.load(open(train_file, "rb"))

        for i in range(epochs):
            print('started epoch', (i+1))
            losses = []
            train_data = pickle.load(open(train_file, "rb" ))

            # shuffle the training data.
            random.shuffle(train_data)

            step = 0
            for fl in train_data:
                features, label = fl[:-1], fl[-1]
                gold_label = self.vocab.tag2id(label)
                result = self.build_graph(features)

                # getting loss with respect to negative log softmax function and the gold label
                loss = dynet.pickneglogsoftmax(result, gold_label)

                # appending to the minibatch losses
                losses.append(loss)
                step += 1

                if len(losses) >= self.properties.minibatch_size:
                    # now we have enough loss values to get loss for minibatch
                    minibatch_loss = dynet.esum(losses) / len(losses)

                    # calling dynet to run forward computation for all minibatch items
                    minibatch_loss.forward()

                    # getting float value of the loss for current minibatch
                    minibatch_loss_value = minibatch_loss.value()

                    # printing info and plotting
                    loss_values.append((len(loss_values), minibatch_loss_value))
                    if len(loss_values)%10==0:
                        progress = round(100 * float(step) / len(train_data), 2)
                        print('current minibatch loss', minibatch_loss_value, 'progress:', progress, '%')

                    # calling dynet to run backpropagation
                    minibatch_loss.backward()

                    # calling dynet to change parameter values with respect to current backpropagation
                    self.updater.update()

                    # empty the loss vector
                    losses = []

                    # refresh the memory of dynet
                    dynet.renew_cg()
                    
                    # get validation set accuracy
                    if len(loss_values)%100==0: 
                        validation_accs.append((len(loss_values), self.calc_acc(validation_data)))
                        train_accs.append((len(loss_values), self.calc_acc(train_data_original)))

            # there are still some minibatch items in the memory but they are smaller than the minibatch size
            # so we ask dynet to forget them
            dynet.renew_cg()
            
        # return these values just for plotting
        return loss_values, validation_accs, train_accs
Example #31
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.
    """
    
    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0: return 0.0
    
    if opts.debug:
        dy.renew_cg(immediate_compute = True, check_validity = True)
    else:
        dy.renew_cg()
    
    # compute association score as dynet expression (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'\
                         .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)


    # false targets likelihood - negative sampling (uniform)
    # collect negative samples
    if opts.nll:
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []
    neg_samples = [np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))]
    # remove source and true targets if applicable
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))
    for (i,ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'\
                         .format(node, side, ns, ns_assoc_score.scalar_value()))
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))


    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum([dy.pickneglogsoftmax(dy.concatenate(scrs), 0) for scrs in sample_scores])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)
    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'\
                         .format(node, sc_loss))
                         
    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'\
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss
Example #32
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": [], "file": daf["file"]}
    daf = daf["words"]

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word
                                     for word, _, _, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate(
                [char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)

            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' +
                    str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_pos_lstm_state = prev_pos_lstm.initial_state().add_input(
        pos_enc('*BOS*'))

    all_losses = []
    pos_prec = 0.0
    rough_pos_prec = 0.0
    pos_items = 0
    class_prec = 0.0
    class_items = 0.0
    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_class, gold_word_pos,
         gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1

        # create the mlp input, a concatenate of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate(
            [bilstm_output, prev_pos_lstm_state.output()])

        # run through the class mlp
        class_mlp_output = class_mlp(mlp_input)

        predicted_word_class = np.argmax(class_mlp_output.npvalue())
        confidence = np.max(class_mlp_output.npvalue()) / np.sum(
            class_mlp_output.npvalue())

        # prec
        if should_backprop:
            class_prec += 1 if predicted_word_class == gold_word_class else 0
            class_items += 1

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            if should_backprop:
                all_losses.append(
                    -dy.log(dy.pick(class_mlp_output, gold_word_class)))
            word_class_ans = gold_word_class
        # otherwise, set the answer to be the argmax
        else:
            word_class_ans = predicted_word_class

        # if the word_class answer is 1, do the pos!
        # alternatively, if validating and it's Aramaic, do the pos!
        if word_class_ans or (fValidation
                              and gold_word_lang) or (fRunning
                                                      and gold_word_lang):
            # run the pos mlp output
            pos_mlp_output = pos_mlp(mlp_input)
            try:
                temp_pos_array = pos_mlp_output.npvalue()
                possible_pos_array = np.zeros(temp_pos_array.shape)
                pos_list = pos_hashtable[word]
                # pos_list.add('') #concat 'unknown' as possible pos
                possible_pos_indices = [
                    pos_vocab[temp_pos] for temp_pos in pos_list
                ]
                possible_pos_array[possible_pos_indices] = temp_pos_array[
                    possible_pos_indices]
            except KeyError:
                possible_pos_array = pos_mlp_output.npvalue()
                # if fValidation:
                #    possible_pos_array[pos_vocab['']] = 0.0 # don't allow validation to guess UNK b/c it never trained against that TODO this makes sense, right?

            predicted_word_pos = pos_vocab.getItem(
                np.argmax(possible_pos_array))
            confidence = np.max(possible_pos_array) / np.sum(
                possible_pos_array)
            # prec
            if should_backprop:
                pos_prec += 1 if predicted_word_pos == gold_word_pos else 0
                rough_pos_prec += 1 if predicted_word_pos[0] == gold_word_pos[
                    0] else 0  # you got at least the rough pos right
                pos_items += 1

            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop:
                    all_losses.append(-dy.log(
                        dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                word_pos_ans = gold_word_pos
            # otherwise, set the answer to be the argmax
            elif not fRunning and fValidation:
                if should_backprop:
                    pos_conf_matrix(pos_vocab[predicted_word_pos],
                                    pos_vocab[gold_word_pos])
                word_pos_ans = predicted_word_pos
            else:
                word_pos_ans = predicted_word_pos

            # run through the prev-pos-mlp
            predicted = predicted_word_pos
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(
                pos_enc(word_pos_ans))
        # if the answer is 0, put a '' through the prev-pos lstm
        else:
            predicted = 'UNK'
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

        tagged_daf["words"].append({
            "word": word,
            "gold_pos": gold_word_pos,
            "gold_class": gold_word_class,
            "predicted": predicted,
            "confidence": confidence,
            "lang": gold_word_lang
        })

    if fRunning:
        return tagged_daf

    pos_prec = pos_prec / pos_items if pos_items > 0 else None
    rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
    class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return class_prec, pos_prec, tagged_daf, rough_pos_prec

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, class_prec, pos_prec, rough_pos_prec
Example #33
def soft_average(buffer, word_weights):
    """soft attention"""
    return dy.esum([
        vector * attention_weight
        for vector, attention_weight in zip(buffer, word_weights)
    ])
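soft_average above expects word_weights to be an iterable of scalar expressions aligned with buffer. A hedged sketch of producing such weights with a softmax and feeding them in (the scoring scheme, W_att and query are assumptions):

# Score each buffer vector against a query, normalize with a softmax,
# then blend the buffer with the resulting weights.
def soft_attend(buffer, query, W_att):
    scores = dy.concatenate([dy.dot_product(W_att * h, query) for h in buffer])
    alphas = dy.softmax(scores)
    word_weights = [dy.pick(alphas, j) for j in range(len(buffer))]
    return soft_average(buffer, word_weights)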
Example #34
        else:
            with open(os.curdir + save_dir + 'parsed.txt', 'w') as f:
                pass

        for i in ids:
            # Prepare a triple of the source word's character ids,
            # the target word's character ids and morphosyntactic features.
            d = data[step][i]
            triple = ([vocab._char_dict.x2i[c] for c in d[0]], [
                vocab._char_dict.x2i[c] for c in d[1]
            ], [vocab._feat_dicts[idx].x2i[c] for idx, c in enumerate(d[2])])
            pred_word_indices, loss = mdl.run(triple, isTrain)
            losses.extend(loss)
            if isTrain:
                if len(losses) >= config.batch_size:
                    sum_loss = dy.esum(losses)
                    tot_loss += sum_loss.value()
                    sum_loss.backward()
                    mdl.update_parameters()
                    mdl._global_step += 1
                    losses = []
                    dy.renew_cg()

            else:
                pred_word = ''.join(
                    [vocab._char_dict.i2x[c] for c in pred_word_indices[:-1]])
                if pred_word == d[1]:
                    tot_cor += 1
                with open(os.curdir + save_dir + 'parsed.txt', 'a') as f:
                    f.write(d[0] + '\t' + pred_word + '\n')
Example #35
def calculate_scores_vector_for_list_of_words(word_idxs):
    dy.renew_cg()
    b = dy.parameter(mb)
    score_vector = dy.esum([dy.lookup(mW, x) for x in word_idxs])
    return b + score_vector
Example #37
 def pool(input_, _):
     return dy.esum(input_) / len(input_)
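For reference, a quick check of the mean-pooling helper above, treating pool as a free function here and assuming a fresh computation graph (the second argument is unused):

import dynet as dy

dy.renew_cg()
vecs = [dy.inputVector([1.0, 2.0]), dy.inputVector([3.0, 4.0])]
print(pool(vecs, None).npvalue())   # elementwise mean -> [2. 3.]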
Example #38
    def train(self, train_file, epochs):
        # matplotlib config
        loss_values = []
        plt.ion()
        ax = plt.gca()
        ax.set_xlim([0, 10])
        ax.set_ylim([0, 3])
        plt.title("Loss over time")
        plt.xlabel("Minibatch")
        plt.ylabel("Loss")

        for i in range(epochs):
            print('started epoch', (i+1))
            losses = []
            train_data = open(train_file, 'r').read().strip().split('\n')

            # shuffle the training data.
            random.shuffle(train_data)

            step = 0
            for line in train_data:

                fields = line.strip().split()
                features, label = fields[:-1], fields[-1]
                gold_label = self.vocab.action2id(label)
                result = self.build_graph(features)

                # getting loss with respect to negative log softmax function and the gold label.
                loss = dynet.pickneglogsoftmax(result, gold_label)

                # appending to the minibatch losses
                losses.append(loss)
                step += 1

                if len(losses) >= self.properties.minibatch_size:
                    # now we have enough loss values to get loss for minibatch
                    minibatch_loss = dynet.esum(losses) / len(losses)

                    # calling dynet to run forward computation for all minibatch items
                    minibatch_loss.forward()

                    # getting float value of the loss for current minibatch
                    minibatch_loss_value = minibatch_loss.value()

                    # printing info and plotting
                    loss_values.append(minibatch_loss_value)
                    if len(loss_values)%10==0:
                        ax.set_xlim([0, len(loss_values)+10])
                        ax.plot(loss_values)
                        plt.draw()
                        plt.pause(0.0001)
                        progress = round(100 * float(step) / len(train_data), 2)
                        print('current minibatch loss', minibatch_loss_value, 'progress:', progress, '%')

                    # calling dynet to run backpropagation
                    minibatch_loss.backward()

                    # calling dynet to change parameter values with respect to current backpropagation
                    self.updater.update()

                    # empty the loss vector
                    losses = []

                    # refresh the memory of dynet
                    dynet.renew_cg()

            # there are still some minibatch items in the memory but they are smaller than the minibatch size
            # so we ask dynet to forget them
            dynet.renew_cg()
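The loop above is DyNet's standard accumulate-then-update minibatching recipe: collect per-example losses on one computation graph, esum and average them, run forward and backward once, update the trainer, then renew the graph. A stripped-down sketch of the same recipe, assuming a build_graph(features) function and an updater like the ones used in this class:

def train_minibatched(train_data, build_graph, updater, minibatch_size):
    losses = []
    for features, gold_label in train_data:
        # per-example negative log-likelihood, accumulated on the current graph
        losses.append(dynet.pickneglogsoftmax(build_graph(features), gold_label))
        if len(losses) >= minibatch_size:
            minibatch_loss = dynet.esum(losses) / len(losses)
            minibatch_loss.forward()
            minibatch_loss.backward()
            updater.update()
            losses = []
            dynet.renew_cg()  # fresh graph for the next minibatch
    dynet.renew_cg()          # drop any leftover partial minibatch, as above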
Example #39
0
File: mdl.py Project: we1l1n/nsp
    def _trans_loss(self, superv_acts, superv_terms, buffer, stack_tail,
                    act_tail):
        stack = []
        loss_lst = []

        reduction_flag = False
        reducable_flag = False
        while not (len(stack) == 1 and reduction_flag != False):
            reduction_flag = False
            act_choices = self._legal_acts(stack, reducable_flag)

            w_weights = None
            act = self._act_dict[superv_acts.pop(0)]

            # Accumulate loss in action prediction
            if len(stack) > 0 and act_choices[0] != self._ACT_RED:
                stack_emb = stack[-1][0].output()
                act_emb = act_tail.output()
                w_weights = self._atten(stack_emb, buffer)
                buf_emb, _ = nnunits.attention_output(buffer, w_weights,
                                                      'soft_average')
                # buf_emb=self._atten.output(buffer,w_weights)

                for i in range(len(stack)):
                    re_idx = len(stack) - 1 - i
                    if stack[re_idx][1] == 'nl':
                        nl_emb = stack[re_idx][2]
                        # Find the raw embedding of the root of subtree
                        # for the leaves.
                        break

                trans_state = dy.concatenate(
                    [buf_emb, stack_emb, nl_emb, act_emb])
                out = self._mlp_layer(trans_state)

                if self._dropout > 0:
                    out = dy.dropout(out, self._dropout)

                if len(act_choices):
                    log_probs_act = dy.log_softmax(self._act_pred_layer(out),
                                                   act_choices)
                    assert act in act_choices, 'illegal action'
                    loss_lst.append(-dy.pick(log_probs_act, act))

            act_emb = self._act_in_layer(self._lookup_act[act])
            act_tail = act_tail.add_input(act_emb)

            # Accumulate loss in term prediction
            if act == self._ACT_NT:
                idx_nt = self._nt_dict[superv_terms.pop(0)]
                if w_weights is not None:
                    buf_emb, _ = nnunits.attention_output(
                        buffer, w_weights, 'soft_average')
                    # buf_emb = self._atten.output(buffer, w_weights)
                    log_probs_nt = dy.log_softmax(self._nt_pred_layer(buf_emb))
                    loss_lst.append(-dy.pick(log_probs_nt, idx_nt))

                stack_state, label, _ = stack[-1] if stack else (stack_tail,
                                                                 'ROOT',
                                                                 stack_tail)
                nt_emb = self._nt_in_layer(self._lookup_nt[idx_nt])
                # Here it is called 'raw embedding'

                stack_state = stack_state.add_input(nt_emb)
                stack.append((stack_state, 'nl', nt_emb))
                # 'nl' label represents the non-leaf nodes

            elif act in self._ACT_NT_dg:
                idx_nt = self._nt_dict[superv_terms.pop(0)]
                # There are no terms (operands) for this action

                stack_state, label, _ = stack[-1] if stack else (stack_tail,
                                                                 'ROOT',
                                                                 stack_tail)
                nt_emb = self._nt_in_layer(self._lookup_nt[idx_nt])
                # Here it is called 'raw embedding'

                stack_state = stack_state.add_input(nt_emb)
                stack.append((stack_state, 'nl', nt_emb))
                # 'nl' label represents the non-leaf nodes

            elif act == self._ACT_TER:
                idx_ter = self._ter_dict[superv_terms.pop(0)]
                if buf_emb is not None:
                    log_probs_ter = dy.log_softmax(
                        self._ter_pred_layer(buf_emb))
                    loss_lst.append(-dy.pick(log_probs_ter, idx_ter))

                stack_state, label, _ = stack[-1] if stack else (stack_tail,
                                                                 'ROOT',
                                                                 stack_tail)
                ter_emb = self._nt_in_layer(self._lookup_ter[idx_ter])
                # Here it is called 'raw embedding'

                stack_state = stack_state.add_input(ter_emb)
                stack.append((stack_state, 'l', ter_emb))
                # 'l' label represents the leaf (terminal) nodes

            else:
                leaf_raw_reps = []
                while stack[-1][1] == 'l':
                    top = stack.pop()
                    rep, _, raw_rep = top
                    leaf_raw_reps.append(raw_rep)

                nl_raw_rep = stack.pop()[2]
                subtree_rep = self._red_in_layer(
                    dy.concatenate([dy.average(leaf_raw_reps), nl_raw_rep]))

                # Append the new reduced node
                stack_state, _, _ = stack[-1] if stack else (stack_tail,
                                                             'ROOT',
                                                             stack_tail)
                stack_state = stack_state.add_input(subtree_rep)
                stack.append((stack_state, 'l', subtree_rep))
                reduction_flag = True

            reducable_flag = stack[-1][1] != 'nl'

        # for loss in loss_lst:
        #     print loss.vec_value()

        return dy.esum(loss_lst)
Example #40
0
    def train(self, conll_path):
        # pylint: disable=invalid-name
        # pylint: disable=missing-docstring
        eloss = 0.0
        mloss = 0.0
        eerrors = 0
        etotal = 0
        start = time.time()

        shuffled_data = list(read_conll(conll_path))
        random.shuffle(shuffled_data)
        errs = []
        lerrs = []
        i_sentence = 0

        for sentence in shuffled_data:
            if i_sentence % 100 == 0 and i_sentence != 0:
                print('Processing sentence number:', i_sentence, 'Loss:',
                      eloss / etotal, 'Errors:', (float(eerrors)) / etotal,
                      'Time',
                      time.time() - start)
                start = time.time()
                eerrors = 0
                eloss = 0.0
                etotal = 0

            conll_sentence = [
                entry for entry in sentence if isinstance(entry, ConllEntry)
            ]

            for entry in conll_sentence:
                c = float(self.words_count.get(entry.norm, 0))
                drop_flag = (random.random() < (c / (0.25 + c)))
                wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if drop_flag else 0] \
                    if self.wdims > 0 else None
                posvec = self.plookup[int(
                    self.pos[entry.pos])] if self.pdims > 0 else None

                entry.vec = concatenate(
                    [_f for _f in [wordvec, posvec, None] if _f])

                entry.lstms = [entry.vec, entry.vec]
                entry.headfov = None
                entry.modfov = None

                entry.rheadfov = None
                entry.rmodfov = None

            if self.blstm_flag:
                lstm_forward = self.builders[0].initial_state()
                lstm_backward = self.builders[1].initial_state()

                for entry, rentry in zip(conll_sentence,
                                         reversed(conll_sentence)):
                    lstm_forward = lstm_forward.add_input(entry.vec)
                    lstm_backward = lstm_backward.add_input(rentry.vec)

                    entry.lstms[1] = lstm_forward.output()
                    rentry.lstms[0] = lstm_backward.output()

                if self.bibi_flag:
                    for entry in conll_sentence:
                        entry.vec = concatenate(entry.lstms)

                    blstm_forward = self.bbuilders[0].initial_state()
                    blstm_backward = self.bbuilders[1].initial_state()

                    for entry, rentry in zip(conll_sentence,
                                             reversed(conll_sentence)):
                        blstm_forward = blstm_forward.add_input(entry.vec)
                        blstm_backward = blstm_backward.add_input(rentry.vec)

                        entry.lstms[1] = blstm_forward.output()
                        rentry.lstms[0] = blstm_backward.output()

            scores, exprs = self._evaluate(conll_sentence)
            gold = [entry.parent_id for entry in conll_sentence]
            heads = decoder.parse_proj(scores,
                                       gold if self.costaug_flag else None)

            if self.labels_flag:
                for modifier, head in enumerate(gold[1:]):
                    rscores, rexprs = self._evaluate_label(
                        conll_sentence, head, modifier + 1)
                    gold_label_ind = self.rels[conll_sentence[modifier +
                                                              1].relation]
                    wrong_label_ind = max(((label, scr)
                                           for label, scr in enumerate(rscores)
                                           if label != gold_label_ind),
                                          key=itemgetter(1))[0]
                    if rscores[gold_label_ind] < rscores[wrong_label_ind] + 1:
                        lerrs.append(rexprs[wrong_label_ind] -
                                     rexprs[gold_label_ind])

            e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
            eerrors += e
            if e > 0:
                loss = [(exprs[h][i] - exprs[g][i])
                        for i, (h, g) in enumerate(zip(heads, gold))
                        if h != g]  # * (1.0/float(e))
                eloss += e
                mloss += e
                errs.extend(loss)

            etotal += len(conll_sentence)

            if i_sentence % 1 == 0 or len(errs) > 0 or lerrs:
                if errs or lerrs:
                    eerrs = (esum(errs + lerrs))  # * (1.0/(float(len(errs))))
                    eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                renew_cg()

            i_sentence += 1

        if errs:
            eerrs = (esum(errs + lerrs))  # * (1.0/(float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            renew_cg()

        self.trainer.update()
        print("Loss: ", mloss / i_sentence)
Example #41
0
def get_score_for_hist(hist):
    scores = [dy.parameter(b_m)]
    for h_, W_ in zip(hist, W_m):
        scores.append(dy.lookup(W_, h_))
    return dy.esum(scores)
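Used as a log-linear language model, the summed score vector above is typically fed straight into a softmax cross-entropy against the next word. A hedged usage sketch; hist, next_word, and trainer are assumed to exist:

dy.renew_cg()
loss = dy.pickneglogsoftmax(get_score_for_hist(hist), next_word)
loss.backward()
trainer.update()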
Example #42
0
def calc_scores(words):
    dy.renew_cg()
    h = dy.esum([dy.lookup(W_emb, x) for x in words])
    for W_h_i, b_h_i in zip(W_h, b_h):
        h = dy.tanh(W_h_i * h + b_h_i)
    return W_sm * h + b_sm
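Since calc_scores renews the graph itself, each example builds its own computation graph and can be trained one at a time. A minimal training-step sketch around it; train_data and trainer are assumed:

for words, tag in train_data:
    my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)
    my_loss.value()   # forward pass to compute the loss value
    my_loss.backward()
    trainer.update()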
Example #43
0
 good = 0.0
 bad = 0.0
 errors = []
 # batching
 for i, (s1, s2, label) in enumerate(train.data):
     prob = model(s1, s2)
     softmax = dy.softmax(prob).npvalue()
     pred = np.argmax(softmax)
     error = dy.pickneglogsoftmax(prob, label)
     errors.append(error)
     if pred == label:
         good += 1
     else:
         bad += 1
     if i % batch_size == 0 and i > 0:
         sum_errors = dy.esum(errors)
         loss += sum_errors.value()
         sum_errors.backward()
         model.trainer.update()
         checked += batch_size
         dy.renew_cg()
         errors = []
     if i % (batch_size * 5) == 0 and i > 0:
         avgLoss = loss / checked
         losses.append(avgLoss)
         print "-" * 20
         print "Time: " + str(passed_time(start_time))
         print "Epoch: " + str(epoch + 1) + ", Iteration: " + str(
             i) + " Average loss: " + str(avgLoss)
         loss = 0
         checked = 0
Example #44
0
    def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
        dy.renew_cg()
        tagged_daf = {"words":[],"file":daf["file"]}
        daf = daf["words"]

        # add a bos before and after
        seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']

        # get all the char encodings for the daf
        char_embeds = [let_enc(let) for let in seq]

        # run it through the bilstm
        char_bilstm_outputs = bilstm(char_embeds)

        # now iterate and get all the separate word representations by concatenating the bilstm output
        # before and after the word
        word_bilstm_outputs = []
        iLet_start = 0
        for iLet, char in enumerate(seq):
            # if it is a bos, check if it's at the end of the sequence
            if char == '*BOS*':
                if iLet + 1 == len(seq):
                    char = ' '
                else:
                    continue
            # if we are at a space, take this bilstm output and the one at the letter start
            if char == ' ':
                cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
                # add it in
                word_bilstm_outputs.append(cur_word_bilstm_output)

                # set the iLet_start counter to here
                iLet_start = iLet

        # safe-check, make sure word bilstm outputs length is the same as the daf
        if len(word_bilstm_outputs) != len(daf):
            log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

        s_0 = prev_pos_lstm.initial_state()

        beam = [(['*BOS*'],1.0,s_0,[],0.0,0.0,0.0,0.0,0.0,[])] # seq, prob, lstm_state, losses, class_prec, class_items, pos_prec, rough_pos_prec, pos_items, confidences
        i = 0
        for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
            should_backprop = gold_word_class == 1
            new_hypos = []
            for hypo in beam:
                seq, hyp_prob, hyp_state, losses, class_prec, class_items, pos_prec, rough_pos_prec, pos_items, confidences = hypo
                new_seq = seq[:]
                new_losses = losses[:]
                new_confidences = confidences[:]

                last_pos = seq[-1]

                next_hyp_state = hyp_state.add_input(pos_enc(last_pos))
                # create the mlp input, a concatenate of the bilstm output and of the prev pos output
                mlp_input = dy.concatenate([bilstm_output, next_hyp_state.output()])

                # run through the class mlp
                class_mlp_output = class_mlp(mlp_input)

                predicted_word_class = np.argmax(class_mlp_output.npvalue())

                new_confidences.append(np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue()))


                # prec
                if should_backprop:
                    class_prec += 1 if predicted_word_class == gold_word_class else 0
                    class_items += 1

                # if we aren't doing validation, calculate the loss
                if not fValidation and not fRunning:
                    if should_backprop: new_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
                    word_class_ans = gold_word_class
                # otherwise, set the answer to be the argmax
                else:
                    word_class_ans = predicted_word_class

                # if the word_class answer is 1, do the pos!
                # alternatively, if validating and it's Aramaic, do the pos!
                if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
                    # run the pos mlp output
                    pos_mlp_output = pos_mlp(mlp_input)
                    try:
                        temp_pos_array = pos_mlp_output.npvalue()
                        possible_pos_array = np.zeros(temp_pos_array.shape)
                        pos_list = pos_hashtable[word]
                        # pos_list.add('') #concat 'unknown' as possible pos
                        possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                        possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
                    except KeyError:
                        possible_pos_array = pos_mlp_output.npvalue()
                        # if fValidation:
                        #    possible_pos_array[pos_vocab['']] = 0.0 # don't allow validation to guess UNK b/c it never trained against that TODO this makes sense, right?

                    poss_pos_sum = np.sum(possible_pos_array)


                    for iprob, prob in enumerate(possible_pos_array):
                        new_seq = seq[:]
                        temp_picked_pos = pos_vocab.getItem(iprob)
                        temp_confidence = possible_pos_array[iprob] / poss_pos_sum
                        new_confidences[-1] = temp_confidence # overwrite class confidence
                        new_pos_prec = pos_prec
                        new_pos_items = pos_items
                        new_rough_pos_prec = rough_pos_prec
                        if should_backprop:
                            new_pos_prec += 1 if temp_picked_pos == gold_word_pos else 0
                            new_rough_pos_prec += 1 if len(temp_picked_pos) > 0 and temp_picked_pos[0] == gold_word_pos[
                                0] else 0  # you got at least the rough pos right
                            new_pos_items += 1

                        if not fValidation and not fRunning:
                            if should_backprop: new_losses.append(
                                -dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                        new_seq += [temp_picked_pos]
                        new_prob = hyp_prob + math.log(prob) if prob != 0 else hyp_prob + math.log(1E-10)  # which is log(0.00000001) or something like that
                        new_hypos += [(new_seq, new_prob, next_hyp_state, new_losses, class_prec, class_items, new_pos_prec,
                                      new_rough_pos_prec, new_pos_items, new_confidences)]
                else:
                    # assume prob is 1. It's really good at predicting Hebrew / Aramaic
                    new_seq = seq[:]
                    new_seq += ['']
                    new_prob = hyp_prob
                    new_hypos += [(new_seq, new_prob, next_hyp_state, new_losses, class_prec, class_items, pos_prec,
                                  rough_pos_prec, pos_items, new_confidences)]

            # pick the best hypos
            new_probs = [p for (s, p, r, l, cp, ci, pp, rpp, pi, c) in new_hypos]
            argmax_indices = util.argmax(new_probs, n=beam_width)
            if type(argmax_indices) == int:
                argmax_indices = [argmax_indices]
            beam = [new_hypos[l] for l in argmax_indices]

            i += 1


            correct_answer_in_beam = False
            for max_ind in argmax_indices:
                if new_hypos[max_ind][0][-1] == gold_word_pos:
                    correct_answer_in_beam = True
                    break
            if not correct_answer_in_beam and not fValidation and not fRunning and with_early_stop:
                # early stop
                break



        final_probs = [p for (s, p, r, l, cp, ci, pp, rpp, pi, c) in beam]
        argmax_index = util.argmax(final_probs)
        final_seq, prob, lstm_state, all_losses, class_prec, class_items, pos_prec, rough_pos_prec, pos_items, confidences = beam[argmax_index]
        for (word, gold_word_class, gold_word_pos, gold_word_lang), pred, conf in zip(daf, final_seq[1:], confidences): # VERY IMPORTANT. final_seq is off-by-one b/c we inited it with BOS
            tagged_daf['words'].append({"word":word,"gold_pos":gold_word_pos,"gold_class":gold_word_class,"predicted":pred,"confidence":conf,"lang":gold_word_lang})
            should_backprop = gold_word_class == 1
            if should_backprop: pos_conf_matrix(pos_vocab[pred], pos_vocab[gold_word_pos])




        if fRunning:
            return tagged_daf

        pos_prec = pos_prec / pos_items if pos_items > 0 else None
        rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
        class_prec = class_prec / class_items if class_items > 0 else None

        if fValidation:
            return class_prec, pos_prec,tagged_daf, rough_pos_prec

        total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
        return total_loss, class_prec, pos_prec, rough_pos_prec
Example #45
0
        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss


for ITER in range(100):
    # Perform training
    random.shuffle(train)
    train_words, train_loss, train_kl_loss, train_reconstruct_loss = 0, 0.0, 0.0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(train):
        kl_loss, softmax_loss = calc_loss(sent)
        total_loss = dy.esum([kl_loss, softmax_loss])
        train_loss += total_loss.value()

        # Record the KL loss and reconstruction loss separately to help monitor training.
        train_kl_loss += kl_loss.value()
        train_reconstruct_loss += softmax_loss.value()

        train_words += len(sent)
        total_loss.backward()
        trainer.update()
        if (sent_id + 1) % 1000 == 0:
            print("--finished %r sentences" % (sent_id + 1))

    print("iter %r: train loss/word=%.4f, kl loss/word=%.4f, reconstruction loss/word=%.4f, ppl=%.4f, time=%.2fs" % (
        ITER, train_loss / train_words, train_kl_loss / train_words, train_reconstruct_loss / train_words,
        math.exp(train_loss / train_words), time.time() - start))
def train_item(args, model, document):
    loss = None
    word_lookups = []
    for preprocessed_sentence in document.preprocessed_sentences:
        seq = [
            model.wlookup[int(model.w2i.get(entry, 0))]
            for entry in preprocessed_sentence
        ]
        if len(seq) > 0:
            word_lookups.append(seq)

    sentences_lookups = []
    for seq in word_lookups:
        sentence_encode = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(sentence_encode)
        global_min = average_pooling(sentence_encode)
        if len(sentence_encode) > 0:
            att_mlp_outputs = []
            for e in sentence_encode:
                mlp_out = (model.word_attention_w * e) + model.word_attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(
                    dy.exp(dy.sum_elems(dy.cmult(o, model.word_att_context))))

            sum_all = dy.esum(lst)

            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, sentence_encode)])
            context = dy.concatenate([att_context, global_max, global_min])
            sentences_lookups.append(context)

    document_encode = encode_sequence(model, sentences_lookups,
                                      model.document_rnn)
    global_max = max_pooling(document_encode)
    global_min = average_pooling(document_encode)
    if len(document_encode) > 0:
        att_mlp_outputs = []
        for e in document_encode:
            mlp_out = (model.sentence_attention_w *
                       e) + model.sentence_attention_b
            att_mlp_outputs.append(mlp_out)

        lst = []
        for o in att_mlp_outputs:
            lst.append(
                dy.exp(dy.sum_elems(dy.cmult(o, model.sentence_att_context))))

        sum_all = dy.esum(lst)

        probs = [dy.cdiv(e, sum_all) for e in lst]
        att_context = dy.esum(
            [dy.cmult(p, h) for p, h in zip(probs, document_encode)])
        context = dy.concatenate([att_context, global_max, global_min])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)

        if document.permissions[args.permission_type]:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
        else:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))

        loss.backward()
        model.trainer.update()
        loss_val = loss.scalar_value()
        dy.renew_cg()
        return loss_val
    return 0
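The exp / esum / cdiv sequence in both attention blocks above is a hand-written softmax over scalar attention scores. An equivalent, slightly more compact formulation of the same weighting uses dy.softmax on the concatenated scores; the names below are illustrative and not part of the model:

def attended_context(states, att_w, att_b, att_context_vec):
    # one scalar score per state: u_i = sum(att_context_vec * (W h_i + b)), elementwise product
    scores = [dy.sum_elems(dy.cmult((att_w * h) + att_b, att_context_vec))
              for h in states]
    alphas = dy.softmax(dy.concatenate(scores))  # normalized attention weights
    # attention-weighted sum of the states
    return dy.esum([h * dy.pick(alphas, i) for i, h in enumerate(states)])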
Example #47
0
  def calc_loss(self, policy_reward, results={}):
    """
    Calc policy networks loss.
    """
    assert len(policy_reward) == len(self.states), "There should be a reward for every action taken"
    batch_size = self.states[0].dim()[1]
    loss = {}

    # For each timestep, compute the predicted baseline and the centered reward:
    #   b = W_b * s + b_b   (r_p below)
    #   R = r - b
    # and accumulate the baseline regression loss:
    #   loss_b = squared_distance(r_p, r_r)
    rewards = []
    baseline_loss = []
    units = np.zeros(batch_size)
    for i, state in enumerate(self.states):
      r_p = self.baseline.transform(dy.nobackprop(state))
      rewards.append(policy_reward[i] - r_p)
      if self.valid_pos[i] is not None:
        r_p = dy.pick_batch_elems(r_p, self.valid_pos[i])
        r_r = dy.pick_batch_elems(policy_reward[i], self.valid_pos[i])
        units[self.valid_pos[i]] += 1
      else:
        r_r = policy_reward[i]
        units += 1
      baseline_loss.append(dy.sum_batches(dy.squared_distance(r_p, r_r)))
    loss["rl_baseline"] = losses.LossExpr(dy.esum(baseline_loss), units)

    # Z Normalization
    # R = (R - mean(R)) / std(R)
    rewards = dy.concatenate(rewards, d=0)
    r_dim = rewards.dim()
    if self.z_normalization:
      rewards_shape = dy.reshape(rewards, (r_dim[0][0], r_dim[1]))
      rewards_mean = dy.mean_elems(rewards_shape)
      rewards_std = dy.std_elems(rewards_shape) + 1e-20
      rewards = (rewards - rewards_mean.value()) / rewards_std.value()
    rewards = dy.nobackprop(rewards)
    # Calculate Confidence Penalty
    if self.confidence_penalty:
      loss["rl_confpen"] = self.confidence_penalty.calc_loss(self.policy_lls)

    # Calculate Reinforce Loss
    # L = - sum([R-b] * pi_ll)
    reinf_loss = []
    units = np.zeros(batch_size)
    for i, (policy, action) in enumerate(zip(self.policy_lls, self.actions)):
      reward = dy.pick(rewards, i)
      ll = dy.pick_batch(policy, action)
      if self.valid_pos[i] is not None:
        ll = dy.pick_batch_elems(ll, self.valid_pos[i])
        reward = dy.pick_batch_elems(reward, self.valid_pos[i])
        units[self.valid_pos[i]] += 1
      else:
        units += 1
      reinf_loss.append(dy.sum_batches(dy.cmult(ll, reward)))
    loss["rl_reinf"] = losses.LossExpr(-dy.esum(reinf_loss), units)

    # Pack up + return
    return losses.FactoredLossExpr(loss)
Example #48
0
def pool(input_, _):
    return dy.esum(input_) / len(input_)
    def train(self, examples):

        # Train action classifier:
        classifier_data = []
        for example in examples:
            encoder_input = example[0]
            ground_labels = example[1]
            cdata = (encoder_input[0], ground_labels[0], ground_labels[1])
            classifier_data.append(cdata)

        self.classifier.train(classifier_data)

        # Train cluster model:
        # num_examples = len(examples)
        num_examples = 100
        trainer = dy.AdamTrainer(self.params)

        for epoch in range(self.num_epochs):
            batch_loss = []
            loss_sum = 0
            for idx in range(num_examples):
                # if (idx % 1000 == 0):
                # print("(Clusters) Epoch: {} | Example: {} | Loss sum: {}".format(epoch, idx, loss_sum))
                loss = self.train_example(examples[idx])
                batch_loss.append(loss)

                # Minibatching:
                if (idx % self.minibatch == 0) or (idx + 1 == num_examples):
                    batch_loss = dy.esum(batch_loss)
                    loss_sum += batch_loss.value()
                    batch_loss.backward()
                    batch_loss = []
                    trainer.update()
                    dy.renew_cg()
            print("(Clusters) Epoch: {} | Loss: {}".format(
                epoch + 1, loss_sum))

        # Expectation maximization:
        em_time = time.time()
        self.e_time = 0
        self.m_time = 0
        self.back_time = 0
        self.papx_time = 0
        self.pa_time = 0
        self.px_time = 0
        zs = {}
        # Initialize one-hot z's:
        self.onehotzs = []
        for idx in range(self.num_clusters):
            one_hot_z = np.zeros(self.num_clusters)
            one_hot_z[idx] = 1
            one_hot_z = dy.inputVector(one_hot_z)
            self.onehotzs.append(one_hot_z)

        for epoch in range(10):
            batch_loss = []
            loss_sum = 0
            for idx in range(num_examples):
                # if (idx % 100 == 0):
                # 	print("(EM) Epoch: {} | Example: {}".format(epoch, idx))
                if len(examples[idx][1][1]) == 3:  # Agreement occurs
                    loss, z_list = self.em_example(examples[idx])
                    zs[idx] = z_list
                    batch_loss.append(loss)
                else:
                    zs[idx] = []

                # Minibatching:
                if ((idx % self.minibatch == 0) or
                        (idx + 1 == num_examples)) and batch_loss:
                    batch_loss = dy.esum(batch_loss)
                    loss_sum += batch_loss.value()
                    back_time = time.time()
                    batch_loss.backward()
                    batch_loss = []
                    trainer.update()
                    dy.renew_cg()
                    self.onehotzs = []
                    for idx in range(self.num_clusters):
                        one_hot_z = np.zeros(self.num_clusters)
                        one_hot_z[idx] = 1
                        one_hot_z = dy.inputVector(one_hot_z)
                        self.onehotzs.append(one_hot_z)
                    self.back_time += (time.time() - back_time)
            print("(EM) Epoch: {} | Loss: {}".format(epoch + 1, loss_sum))
        # print("EM time: {}".format(time.time() - em_time))
        # print("E time: {}".format(self.e_time))
        # print("M time: {}".format(self.m_time))
        # print("Backprop time: {}".format(self.back_time))
        # print("PAPX time: {}".format(self.papx_time))
        # print("PA time: {}".format(self.pa_time))
        # print("PX time: {}".format(self.px_time))
        # Print zs to file:
        with open("data/clusters/clusters.txt", 'w') as f:
            for idx in range(num_examples):
                f.write(str(zs[idx]))
                f.write('\n')
Example #50
0
def calc_loss(scores, tags):
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    return dy.esum(losses)
Example #51
0
def calc_loss(scores, tags):
    losses = [
        dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)
    ]
    return dy.esum(losses)
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous target word into the decoder state
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
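The masking step is the part worth isolating: the batched per-position loss is multiplied by a 0/1 vector reshaped to one scalar per batch element, so padded target positions contribute nothing to the summed loss. A self-contained toy sketch of that mechanism:

dy.renew_cg()
# pretend per-sentence losses for a batch of 3 sentences at one time step
step_loss = dy.reshape(dy.inputVector([2.0, 3.0, 5.0]), (1,), batch_size=3)
# keep the first two sentences; the third is already past its end (padding) here
mask_expr = dy.reshape(dy.inputVector([1.0, 1.0, 0.0]), (1,), batch_size=3)
masked_loss = step_loss * mask_expr
print(dy.sum_batches(masked_loss).value())  # 5.0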
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words":[],"file":daf["file"]}
    daf = daf["words"]

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)

            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_pos_lstm_state = prev_pos_lstm.initial_state().add_input(pos_enc('*BOS*'))

    all_losses = []
    pos_prec = 0.0
    rough_pos_prec = 0.0
    pos_items = 0
    class_prec = 0.0
    class_items = 0.0
    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1

        # create the mlp input, a concatenate of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate([bilstm_output, prev_pos_lstm_state.output()])

        # run through the class mlp
        class_mlp_output = class_mlp(mlp_input)

        predicted_word_class = np.argmax(class_mlp_output.npvalue())
        confidence = np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue())


        # prec
        if should_backprop:
            class_prec += 1 if predicted_word_class == gold_word_class else 0
            class_items += 1

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            if should_backprop: all_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
            word_class_ans = gold_word_class
        # otherwise, set the answer to be the argmax
        else:
            word_class_ans = predicted_word_class

        # if the word_class answer is 1, do the pos!
        # alternatively, if validating and it's Aramaic, do the pos!
        if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
            # run the pos mlp output
            pos_mlp_output = pos_mlp(mlp_input)
            try:
                temp_pos_array = pos_mlp_output.npvalue()
                possible_pos_array = np.zeros(temp_pos_array.shape)
                pos_list = pos_hashtable[word]
                # pos_list.add('') #concat 'unknown' as possible pos
                possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
            except KeyError:
                possible_pos_array = pos_mlp_output.npvalue()
                # if fValidation:
                #    possible_pos_array[pos_vocab['']] = 0.0 # don't allow validation to guess UNK b/c it never trained against that TODO this makes sense, right?

            predicted_word_pos = pos_vocab.getItem(np.argmax(possible_pos_array))
            confidence = np.max(possible_pos_array) / np.sum(possible_pos_array)
            # prec
            if should_backprop:
                pos_prec += 1 if predicted_word_pos == gold_word_pos else 0
                rough_pos_prec += 1 if predicted_word_pos[0] == gold_word_pos[0] else 0 # you got at least the rough pos right
                pos_items += 1

            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop: all_losses.append(-dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                word_pos_ans = gold_word_pos
            # otherwise, set the answer to be the argmax
            elif not fRunning and fValidation:
                if should_backprop: pos_conf_matrix(pos_vocab[predicted_word_pos], pos_vocab[gold_word_pos])
                word_pos_ans = predicted_word_pos
            else:
                word_pos_ans = predicted_word_pos

            # run through the prev-pos-mlp
            predicted = predicted_word_pos
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(word_pos_ans))
        # if the answer is 0, put a '' through the prev-pos lstm
        else:
            predicted = 'UNK'
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

        tagged_daf["words"].append({"word":word,"gold_pos":gold_word_pos,"gold_class":gold_word_class,"predicted":predicted,"confidence":confidence, "lang": gold_word_lang})

    if fRunning:
        return tagged_daf

    pos_prec = pos_prec / pos_items if pos_items > 0 else None
    rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
    class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return class_prec, pos_prec,tagged_daf, rough_pos_prec

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, class_prec, pos_prec, rough_pos_prec
Example #54
0
def train_beam_graph(e, beam_size, traj_type, loss_fn):
    dy.renew_cg()

    tags = e[tk_tags_key]
    m = model_init(e)
    beam_costs_prev = np.array([0], dtype="int")
    beam_costs = []
    losses = []
    for i in range(len(e["tk_words"])):
        scores = model_scores(m)

        # transition
        scores_np = scores.npvalue()
        beam_indices, tag_indices = beam_argtopk(scores_np, beam_size)

        beam_costs_cur = beam_costs_prev[beam_indices] + (tag_indices !=
                                                          tags[i]).astype('int')

        # compute the loss if there is score accumulation or always
        next_beam_size = beam_size if i < len(e["tk_words"]) - 1 else 1
        if (not cfg["update_only_on_cost_increase"]) or (
                cfg["update_only_on_cost_increase"] and
                beam_costs_prev.min() < beam_costs_cur[:next_beam_size].min()):
            loss = loss_fn(tags, i, beam_costs_prev, scores, beam_size)
            losses.append(loss)

        if traj_type == "stop":
            if beam_costs_cur.min() > 0:
                break
        elif traj_type == "continue":
            pass
        elif traj_type == "reset":
            if beam_costs_cur.min() > 0:
                b_gold_idx = beam_costs_prev.argmin()
                beam_indices = np.array([b_gold_idx], dtype='int')
                tag_indices = np.array([tags[i]], dtype='int')
                beam_costs_cur = np.array([0], dtype='int')
        elif traj_type == "reset_multiple":
            # NOTE: this is similar to the reset option. replace the last element
            # in the beam with the correct one.
            if beam_costs_cur.min() > 0:
                b_gold_idx = beam_costs_prev.argmin()
                beam_indices[-1] = b_gold_idx
                tag_indices[-1] = tags[i]
                beam_costs_cur[-1] = beam_costs_prev[b_gold_idx]
                # this should be zero
                # assert beam_costs_prev[-1] == 0
        # NOTE: there is probably a less repetitive way of doing this.
        elif traj_type == "oracle":
            t_idx = tags[i]
            beam_size_prev = beam_costs_prev.shape[0]
            costs = beam_costs_prev.reshape((beam_size_prev, 1)) * np.ones(
                (1, num_tags))
            costs += 1.0
            costs[:, t_idx] -= 1.0
            beam_indices, tag_indices = beam_argtopk(-costs, beam_size)
            beam_costs_cur = beam_costs_prev[beam_indices] + (
                tag_indices != tags[i]).astype('int')

        else:
            raise ValueError

        beam_costs.append(beam_costs_cur)
        beam_costs_prev = beam_costs_cur
        model_step(m, beam_indices, tag_indices)

    if len(losses) > 0:
        return dy.esum(losses)
    else:
        return dy.zeros(1)
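beam_argtopk is referenced but not shown in this snippet. One plausible, purely illustrative implementation, assuming scores is a (beam_size, num_tags) array of accumulated scores and the helper returns the beam row and tag column of the k best entries:

import numpy as np

def beam_argtopk(scores, k):
    # flatten, take the k highest-scoring entries, map back to (beam, tag) indices
    flat = scores.reshape(-1)
    top = np.argsort(-flat)[:min(k, flat.size)]
    return top // scores.shape[1], top % scores.shape[1]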
Example #55
0
    def on_calc_additional_loss(self, translator_loss):
        if not self.learn_segmentation or self.segment_decisions is None:
            return None
        reward = -translator_loss["mle"]
        if not self.log_reward:
            reward = dy.exp(reward)
        reward = dy.nobackprop(reward)

        # Make sure that reward is not scalar, but rather based on the each batch item
        assert reward.dim()[1] == len(self.src_sent)
        # Mask
        enc_mask = self.enc_mask.get_active_one_mask().transpose(
        ) if self.enc_mask is not None else None
        # Compose the loss
        ret = LossBuilder()
        ## Length prior
        alpha = self.length_prior_alpha.value(
        ) if self.length_prior_alpha is not None else 0
        if alpha > 0:
            reward += self.segment_length_prior * alpha
        # reward z-score normalization
        if self.z_normalization:
            reward = dy.cdiv(reward - dy.mean_batches(reward),
                             dy.std_batches(reward) + EPS)
        ## Baseline Loss
        if self.use_baseline:
            baseline_loss = []
            for i, baseline in enumerate(self.bs):
                loss = dy.squared_distance(reward, baseline)
                if enc_mask is not None:
                    loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True),
                                    loss)
                baseline_loss.append(loss)

            ret.add_loss("Baseline", dy.esum(baseline_loss))

        if self.print_sample:
            print(
                dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])
        ## Reinforce Loss
        lmbd = self.lmbd.value()
        if lmbd > 0.0:
            reinforce_loss = []
            # Calculating the loss of the baseline and reinforce
            for i in range(len(self.segment_decisions)):
                ll = dy.pick_batch(self.segment_logsoftmaxes[i],
                                   self.segment_decisions[i])
                if self.use_baseline:
                    r_i = reward - dy.nobackprop(self.bs[i])
                else:
                    r_i = reward
                if enc_mask is not None:
                    ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True),
                                  ll)
                reinforce_loss.append(r_i * -ll)
            loss = dy.esum(reinforce_loss) * lmbd
            ret.add_loss("Reinforce", loss)
        if self.confidence_penalty:
            ls_loss = self.confidence_penalty(self.segment_logsoftmaxes,
                                              enc_mask)
            ret.add_loss("Confidence Penalty", ls_loss)
        # Total Loss
        return ret
    def __call__(self,
                 context,
                 F2I,
                 only_train_words,
                 dropout_rate=1.0,
                 activate_sub_word=False,
                 stop_updating_lookup=False):

        num_params = len(self.params)  # get the length of params vector
        lookup = self.lookup

        # if the user choose to continue learning the pre-trained words
        if not stop_updating_lookup:
            # if sub word feature is not activated
            if not activate_sub_word:
                emb_vectors = [lookup[F2I.get(i)]
                               for i in context]  # get embedding of the words
            else:  # sum embedding of word, suffix and prefix for words that allow it
                emb_vectors = []
                for word in context:
                    # get the word alone if len<=3 or word=start/end/unk
                    if len(word) <= 3 or (word in ["<s>", "</s>", "UUUNKKK"]):
                        emb_vectors.append(lookup[F2I.get(word)])
                    else:
                        pref = False
                        suff = False
                        # check if prefix exist in F2I. relevant for test/dev sets
                        if word[:3] in F2I:
                            prefix_embd = lookup[F2I.get(word[:3])]
                            pref = True
                        # check if suffix exist in F2I. relevant for test/dev sets
                        if word[-3:] in F2I:
                            suffix_embd = lookup[F2I.get(word[-3:])]
                            suff = True

                        word_embd = lookup[F2I.get(word)]

                        # sum vectors of word with existing prefix/suffix
                        if pref and suff:
                            sum_embd = dy.esum(
                                [prefix_embd, suffix_embd, word_embd])
                        elif pref and not suff:
                            sum_embd = dy.esum([prefix_embd, word_embd])
                        elif suff and not pref:
                            sum_embd = dy.esum([suffix_embd, word_embd])
                        else:
                            sum_embd = dy.esum([word_embd])

                        emb_vectors.append(sum_embd)

        # if the user choose to stop learning the pre-trained words
        if stop_updating_lookup:
            # if sub word feature is not activated
            if not activate_sub_word:
                emb_vectors = []
                for word in context:
                    # if it's a word from the corpus continue training it
                    if word in only_train_words:
                        emb_vectors.append(lookup[F2I.get(word)])
                    # if it's a word from the pre-train stop training it
                    else:
                        emb_vectors.append(dy.nobackprop(
                            lookup[F2I.get(word)]))
            else:  # sum embedding of word, suffix and prefix for words that allow it
                emb_vectors = []
                for word in context:
                    # get the word alone if len<=3 or word=start/end/unk
                    if len(word) <= 3 or (word in ["<s>", "</s>", "UUUNKKK"]):
                        # if it's a word from the corpus continue training it
                        if word in only_train_words:
                            emb_vectors.append(lookup[F2I.get(word)])
                        # if it's a word from the pre-train stop training it
                        else:
                            emb_vectors.append(
                                dy.nobackprop(lookup[F2I.get(word)]))
                    else:
                        pref = False
                        suff = False
                        # check if prefix exist in F2I. relevant for test/dev sets
                        if word[:3] in F2I:
                            # if it's a word from the corpus continue training it
                            if word[:3] in only_train_words:
                                prefix_embd = lookup[F2I.get(word[:3])]
                            # if it's a word from the pre-train stop training it
                            else:
                                prefix_embd = dy.nobackprop(lookup[F2I.get(
                                    word[:3])])
                            pref = True
                        # check if suffix exist in F2I. relevant for test/dev sets
                        if word[-3:] in F2I:
                            # if it's a word from the corpus continue training it
                            if word[-3:] in only_train_words:
                                suffix_embd = lookup[F2I.get(word[-3:])]
                            # if it's a word from the pre-train stop training it
                            else:
                                suffix_embd = dy.nobackprop(lookup[F2I.get(
                                    word[-3:])])
                            suff = True

                        # if it's a word from the corpus continue training it
                        if word in only_train_words:
                            word_embd = lookup[F2I.get(word)]
                        # if it's a word from the pre-train stop training it
                        else:
                            word_embd = dy.nobackprop(lookup[F2I.get(word)])

                        # sum vectors of word with existing prefix/suffix
                        if pref and suff:
                            sum_embd = dy.esum(
                                [prefix_embd, suffix_embd, word_embd])
                        elif pref and not suff:
                            sum_embd = dy.esum([prefix_embd, word_embd])
                        elif suff and not pref:
                            sum_embd = dy.esum([suffix_embd, word_embd])
                        else:
                            sum_embd = dy.esum([word_embd])

                        emb_vectors.append(sum_embd)

        net_input = dy.concatenate(emb_vectors)

        for i in range(
                0, num_params - 2, 2
        ):  # calculate the activation of each subsequent layer and apply the dropout mask
            W = dy.parameter(self.params[i])  # from parameters to expressions
            b = dy.parameter(self.params[i + 1])
            if i == 0:  # first layer
                activation = dy.tanh((W * net_input) + b)
            else:  # other layers
                activation = dy.tanh((W * activation) + b)

            if dropout_rate != 1.0:
                activation = dy.dropout(activation, dropout_rate)

        W = dy.parameter(self.params[num_params -
                                     2])  # from parameters to expressions
        b = dy.parameter(self.params[num_params - 1])
        net_output = dy.softmax(
            (W * activation) + b)  # apply softmax on the last layer
        return net_output
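All of the branching above reduces to one rule: always take the word vector, and add the 3-character prefix and suffix vectors when the word is long enough, is not a boundary/UNK token, and the piece exists in F2I. A compact sketch of that lookup rule (lookup and F2I as in the class; the nobackprop handling for pre-trained words is omitted):

def subword_embedding(word, lookup, F2I):
    parts = [lookup[F2I.get(word)]]                 # the word itself
    if len(word) > 3 and word not in ("<s>", "</s>", "UUUNKKK"):
        for piece in (word[:3], word[-3:]):         # prefix, suffix
            if piece in F2I:
                parts.append(lookup[F2I.get(piece)])
    return dy.esum(parts)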
def test_item(model, document):
    word_lookups = []
    for preprocessed_sentence in document.preprocessed_sentences:
        seq = [
            model.wlookup[int(model.w2i.get(entry, 0))]
            for entry in preprocessed_sentence
        ]
        if len(seq) > 0:
            word_lookups.append(seq)

    sentences_lookups = []
    for seq in word_lookups:
        sentence_encode = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(sentence_encode)
        global_min = average_pooling(sentence_encode)
        if len(sentence_encode) > 0:
            att_mlp_outputs = []
            for e in sentence_encode:
                mlp_out = (model.word_attention_w * e) + model.word_attention_b
                att_mlp_outputs.append(mlp_out)

            lst = []
            for o in att_mlp_outputs:
                lst.append(
                    dy.exp(dy.sum_elems(dy.cmult(o, model.word_att_context))))

            sum_all = dy.esum(lst)

            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum(
                [dy.cmult(p, h) for p, h in zip(probs, sentence_encode)])
            context = dy.concatenate([att_context, global_max, global_min])
            sentences_lookups.append(context)

    document_encode = encode_sequence(model, sentences_lookups,
                                      model.document_rnn)
    global_max = max_pooling(document_encode)
    global_min = average_pooling(document_encode)
    if len(document_encode) > 0:
        att_mlp_outputs = []
        for e in document_encode:
            mlp_out = (model.sentence_attention_w *
                       e) + model.sentence_attention_b
            att_mlp_outputs.append(mlp_out)

        lst = []
        for o in att_mlp_outputs:
            lst.append(
                dy.exp(dy.sum_elems(dy.cmult(o, model.sentence_att_context))))

        sum_all = dy.esum(lst)

        probs = [dy.cdiv(e, sum_all) for e in lst]
        att_context = dy.esum(
            [dy.cmult(p, h) for p, h in zip(probs, document_encode)])
        context = dy.concatenate([att_context, global_max, global_min])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
        document.prediction_result = y_pred.scalar_value()
        dy.renew_cg()
        return document.prediction_result
    return 0
Example #58
0
    # make a forward pass
    pred = forward_pass(x)
    # calculate loss for each example
    loss = dy.binary_log_loss(pred, y) 
    losses.append(loss)
    pred
    y


# Now let's accumulate the loss and backpropagate it.

# In[24]:


# get total loss for dataset
total_loss = dy.esum(losses)
# apply the calculations of the computational graph
total_loss.forward()
# calculate loss to backpropagate
total_loss.backward()
# update parameters with backpropagated error
trainer.update()
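

# For reference, a hedged end-to-end version of this accumulate-then-update pattern;
# the model shape, data, and all names below are hypothetical placeholders, not the
# notebook's actual definitions.

import dynet as dy

pc = dy.ParameterCollection()
W_1 = pc.add_parameters((8, 2))
b_1 = pc.add_parameters(8)
W_out = pc.add_parameters((1, 8))
trainer = dy.SimpleSGDTrainer(pc)

data = [([0.0, 1.0], 1.0), ([1.0, 0.0], 0.0)]         # hypothetical (x, y) training pairs

for epoch in range(10):
    dy.renew_cg()
    losses = []
    for x, y in data:
        h = dy.tanh(dy.parameter(W_1) * dy.inputVector(x) + dy.parameter(b_1))
        pred = dy.logistic(dy.parameter(W_out) * h)
        losses.append(dy.binary_log_loss(pred, dy.scalarInput(y)))
    # get total loss for the dataset, run the graph, backpropagate, and update
    total_loss = dy.esum(losses)
    total_loss.forward()
    total_loss.backward()
    trainer.update()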


# Let's make sure that our parameter `W_1` has been updated (i.e. it "learned" something).

# In[25]:


# confirm that parameters updated
dy.renew_cg()
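

# One way to make that check concrete, assuming W_1 is a DyNet Parameters object as in the
# hypothetical sketch above: snapshot its values, run one toy update, and compare.

import numpy as np
import dynet as dy

pc = dy.ParameterCollection()
W_1 = pc.add_parameters((8, 2))                       # hypothetical parameter
trainer = dy.SimpleSGDTrainer(pc)

dy.renew_cg()
w_before = dy.parameter(W_1).npvalue()                # snapshot before the update
x = dy.inputVector([1.0, -1.0])
loss = dy.squared_norm(dy.parameter(W_1) * x)         # toy loss just to produce a gradient
loss.forward()
loss.backward()
trainer.update()

dy.renew_cg()
w_after = dy.parameter(W_1).npvalue()                 # snapshot after the update
print("W_1 changed:", not np.allclose(w_before, w_after))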
Beispiel #59
0
    def __call__(self,
                 inputs,
                 masks,
                 truth,
                 iters,
                 is_train=True,
                 is_tree=True):
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        flat_len = sent_len * batch_size

        print('=== entering __call__ ===')
        print('input length: ', len(inputs))  # input length:  46
        print('input dim: ', inputs[1].dim())  # input dim:  ((400,), 2)
        print('sent_len', sent_len)  # sent_len 46
        print('batch_size', batch_size)  # batch_size 2
        print('flat_len', flat_len)  # flat_len 92

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        X = dy.concatenate_cols(inputs)
        print('X dim: ', X.dim())  # X dim:  ((400, 46), 2)
        if is_train:
            X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)

        # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
        # ((A_H, L), B)
        head_arc = self.head_arc_MLP(X, is_train)
        dept_arc = self.dept_arc_MLP(X, is_train)
        print('head_arc dim: ', head_arc.dim())
        print('dept_arc dim: ', dept_arc.dim())
        # head_arc dim:  ((300, 46), 2)
        # dept_arc dim:  ((300, 46), 2)

        # ((R_H, L), B)
        head_rel = self.head_rel_MLP(X, is_train)
        dept_rel = self.dept_rel_MLP(X, is_train)
        print('head_rel dim: ', head_rel.dim())
        print('dept_rel dim: ', dept_rel.dim())
        # head_rel dim:  ((100, 46), 2)
        # dept_rel dim:  ((100, 46), 2)

        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
            head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
            dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
            dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)

        # ((L, L), B)
        masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))
        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        arc_norm = math.sqrt(self.arc_size)
        rel_norm = math.sqrt(self.rel_size)
        for k in range(self.cfg.GRAPH_LAYERS):
            print('----layer-----', k)
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc,
                                           dept_arc) / arc_norm - masks_2D
            arc_prob = dy.softmax(arc_mat)

            # arc_mat dim:  ((46, 46), 2)
            # arc_prob dim:  ((46, 46), 2)

            # Layer-wise Loss
            if is_train:
                arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
                # ((1,), L*B)
                print('arc_mat val', arc_mat.value())
                print('arc_mat dim', arc_mat.dim())
                print("truth['head'] value", truth['head'])
                print("truth['head'] lengt", truth['head'].__len__())

                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                print('arc_loss', arc_loss.value())
                print('arc_loss', arc_loss.dim())

                # (1,)
                arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
                print('arc_loss', arc_loss.value())
                print('arc_loss', arc_loss.dim())

                # keep the Expression (not a float) so dy.esum can combine the layer-wise losses later
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX

            print('HX dim: ', HX.dim())
            print('DX dim: ', DX.dim())
            print('FX dim: ', FX.dim())
            # HX dim:  ((300, 46), 2)
            # DX dim:  ((300, 46), 2)
            # FX dim:  ((300, 46), 2)

            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

            print('head_arc dim: ', head_arc.dim())
            print('FX_new dim: ', FX_new.dim())
            print('dept_arc dim: ', dept_arc.dim())
            # head_arc dim:  ((300, 46), 2)
            # FX_new dim:  ((300, 46), 2)
            # dept_arc dim:  ((300, 46), 2)

            # Relation Aggregation Function
            # Sync update
            # ((R_H, L), B)
            HR = head_rel * arc_prob
            DR = dept_rel * dy.transpose(arc_prob)
            FX = HR + DR
            head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
            dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel

            print('HR dim: ', HR.dim())
            print('DR dim: ', DR.dim())
            print('FX dim: ', FX.dim())
            # HR dim:  ((100, 46), 2)
            # DR dim:  ((100, 46), 2)
            # FX dim:  ((100, 46), 2)

            print('head_rel dim: ', head_rel.dim())
            print('dept_rel dim: ', dept_rel.dim())
            # head_rel dim:  ((100, 46), 2)
            # dept_rel dim:  ((100, 46), 2)

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc,
                                        dept_arc) / arc_norm - masks_2D
        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)

        print('arc_mat dim: ', arc_mat.dim())
        print('head_rel dim: ', head_rel.dim())
        print('dept_rel dim: ', dept_rel.dim())
        # arc_mat dim:  ((46,), 92)
        # head_rel dim:  ((100, 92), 1)
        # dept_rel dim:  ((100,), 92)

        if is_train:
            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask
            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
            # Final Total Loss with Layer-wise
            warm = [int(iters >= x) for x in self.warm_list]
            losses = (rel_loss * self.cfg.LAMBDA2 * warm[-1] +
                      arc_loss * self.cfg.LAMBDA2 * warm[-1])
            if gnn_losses:
                for i in range(self.cfg.GRAPH_LAYERS):
                    gnn_losses[i] *= warm[i]
                losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs,
                                       (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [
                    np.array(masks['flat'][i:i + sent_len])
                    for i in range(0, flat_len, sent_len)
                ]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [
                j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
            ]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask
            rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
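
To isolate the batched arc-loss arithmetic used above (reshape to ((L,), L*B), pickneglogsoftmax_batch against the gold heads, mask, then average), here is a hedged toy-sized sketch; the scores, gold head indices, and mask values are invented purely for illustration:

import numpy as np
import dynet as dy

dy.renew_cg()
sent_len, batch_size = 3, 2
flat_len = sent_len * batch_size

# hypothetical ((L, L), B) arc score matrix; column j holds the head scores for token j
arc_mat = dy.inputTensor(np.random.rand(sent_len, sent_len, batch_size), batched=True)

# ((L,), L*B): every column of every batch element becomes its own mini-batch item
arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)

gold_heads = [0, 2, 1, 0, 1, 1]                       # hypothetical gold head index per token
masks_flat = dy.inputTensor(np.array([0., 1., 1., 0., 1., 1.]), batched=True)  # mask ROOT/PAD tokens
total_token = 4.0                                     # number of unmasked tokens

arc_losses = dy.pickneglogsoftmax_batch(arc_mat, gold_heads)   # ((1,), L*B)
arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
print(arc_loss.value())
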
Beispiel #60
0
def calc_score_of_history(words):
  # Create a list of things to sum up with only the bias vector at first
  score_vecs = [dy.parameter(b_sm)]
  for word_id, lookup_param in zip(words, W_sm): 
    score_vecs.append(lookup_param[word_id])
  return dy.esum(score_vecs)
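
A hedged usage sketch for a score function like this one; the vocabulary size, history length N, and the shapes of W_sm/b_sm below are assumptions chosen so the snippet runs, not definitions taken from the original program:

import dynet as dy

nwords, N = 1000, 2                                   # hypothetical vocabulary size and history length
model = dy.ParameterCollection()
# one vocabulary-sized score vector per (history position, word) pair
W_sm = [model.add_lookup_parameters((nwords, nwords)) for _ in range(N)]
b_sm = model.add_parameters(nwords)
trainer = dy.SimpleSGDTrainer(model)

def calc_sent_loss(sent):
    dy.renew_cg()
    hist = [0] * N                                    # assume word id 0 plays the role of <s>/</s>
    losses = []
    for next_word in sent + [0]:
        score = calc_score_of_history(hist)
        losses.append(dy.pickneglogsoftmax(score, next_word))
        hist = hist[1:] + [next_word]
    return dy.esum(losses)

loss = calc_sent_loss([5, 42, 7])                     # hypothetical word ids
loss.forward()
loss.backward()
trainer.update()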