Example #1
def train_item(args, model, sentence):
    loss = None
    seq = [
        model.wlookup[int(model.w2i.get(entry, 0))]
        for entry in sentence.preprocessed_sentence
    ]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        last_output = encoded_sequence[-1]
        global_max = max_pooling(encoded_sequence)
        global_avg = average_pooling(encoded_sequence)
        context = dy.concatenate([last_output, global_max, global_avg])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)

        if sentence.permissions[args.permission_type]:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
        else:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))

        loss.backward()
        model.trainer.update()
        loss_val = loss.scalar_value()
        dy.renew_cg()
        return loss_val
    return 0
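
train_item runs one forward/backward pass and returns the scalar loss for a single sentence. A minimal epoch driver around it might look like the sketch below; corpus (a list of preprocessed sentence objects) and the shuffling policy are illustrative assumptions, not part of the original example.

import random

# Hypothetical driver for train_item; `corpus` is an assumed list of
# sentence objects matching what train_item expects.
def train_epoch(args, model, corpus):
    random.shuffle(corpus)
    total_loss = 0.0
    for sentence in corpus:
        total_loss += train_item(args, model, sentence)
    return total_loss / max(len(corpus), 1)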
Example #2
    def scorer(self, q_d_hists, q_idf, bm25_score, overlap_features, p):
        """
        Makes all the calculations and returns a relevance score
        """
        idf_vec = dy.inputVector(q_idf)
        bm25_score = dy.scalarInput(bm25_score)
        overlap_features = dy.inputVector(overlap_features)
        # Pass each query term representation through the MLP
        term_scores = []
        for hist in q_d_hists:
            q_d_hist = dy.reshape(dy.inputVector(hist), (1, len(hist)))
            hidd_out = dy.rectify(q_d_hist * self.W_1 + self.b_1)
            for i in range(0, self.mlp_layers):
                hidd_out = dy.rectify(hidd_out * self.W_n[i] + self.b_n[i])
            term_scores.append(hidd_out * self.W_last + self.b_last)

        # Term Gating
        gating_weights = idf_vec * self.w_g
        
        bm25_feature = bm25_score * self.W_bm25 + self.b_bm25 
        drop_out = dy.scalarInput(1)
        # p = probability of keeping a unit active (inverted dropout)
        drop_num = float(np.random.rand() < p) / p
        drop_out.set(drop_num)
        
        bm25_feature *= drop_out
        drmm_score = dy.transpose(dy.concatenate(term_scores)) * dy.reshape(gating_weights, (len(q_idf), 1)) #basic MLPs output
        doc_score = dy.transpose(dy.concatenate([drmm_score, overlap_features])) * self.W_scores + self.b_scores #extra features layer
        
        
        return doc_score
Example #3
def __train(model, data):
    tagged_loss = 0
    untagged_loss = 0
    for index, sentence_report in enumerate(data):
        for phrase in sentence_report.all_phrases:
            loss = None
            encoded_phrase = __encode_sequence(model, phrase)

            if model.options.external_info != "no_info":
                encoded_phrase = dy.concatenate(
                    [encoded_phrase, model.doclookup[sentence_report.app_id]])

            y_pred = dy.logistic((model.mlp_w * encoded_phrase) + model.mlp_b)

            if sentence_report.mark:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))

            if sentence_report.mark:
                tagged_loss += loss.scalar_value() / (index + 1)
            else:
                untagged_loss += loss.scalar_value() / (index + 1)
            loss.backward()
            model.trainer.update()
            dy.renew_cg()
Example #4
def perceptron_loss(scores, reference):
    if use_cost_augmented:
        predictions = hamming_augmented_decode(scores, reference)
    else:
        predictions = [np.argmax(score.npvalue()) for score in scores]

    margin = dy.scalarInput(-2)

    if predictions != reference:
        reference_score = calc_sequence_score(scores, reference)
        prediction_score = calc_sequence_score(scores, predictions)
        if use_cost_augmented:
            # One could actually get the hamming augmented value during decoding, but we didn't do it here for
            # demonstration purpose.
            hamming = dy.scalarInput(hamming_cost(predictions, reference))
            loss = prediction_score + hamming - reference_score
        else:
            loss = prediction_score - reference_score

        if use_hinge:
            loss = dy.emax([dy.scalarInput(0), loss - margin])

        return loss
    else:
        return dy.scalarInput(0)
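
perceptron_loss relies on a hamming_cost helper that the excerpt does not show. A minimal stand-in consistent with the call above could be:

# Hypothetical stand-in for hamming_cost (not shown in the excerpt):
# the number of positions where prediction and reference disagree.
def hamming_cost(predictions, reference):
    return sum(1 for p, r in zip(predictions, reference) if p != r)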
Example #6
    def __train(self, data):
        def encode_sequence(seq):
            rnn_forward = self.phrase_rnn[0].initial_state()
            for entry in seq:
                vec = self.wlookup[int(self.w2i.get(entry, 0))]
                rnn_forward = rnn_forward.add_input(vec)
            return rnn_forward.output()
        tagged_loss = 0
        untagged_loss = 0
        for index, sentence_report in enumerate(data):
            for phrase in sentence_report.all_phrases:
                loss = None
                encoded_phrase = encode_sequence(phrase)
                y_pred = dy.logistic((self.mlp_w*encoded_phrase) + self.mlp_b)

                if sentence_report.mark:
                    loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
                else:
                    loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
                if index % 1000 == 0:
                    print("Description : {}".format(index+1))
                    print("Marked {} Prediction Result {} : ".format(sentence_report.mark, y_pred.scalar_value()))
                    print("Tagged loss {} Untagged Loss {} Total loss {}".format(tagged_loss, untagged_loss, tagged_loss+untagged_loss))

                if sentence_report.mark:
                    tagged_loss += loss.scalar_value()/(index+1)
                else:
                    untagged_loss += loss.scalar_value()/(index+1)
                loss.backward()
                self.trainer.update()
                dy.renew_cg()
Example #7
    def learn(self, characters, target_mgc, guided_att=True):
        num_mgc = target_mgc.shape[0]
        # print num_mgc
        dy.renew_cg()
        output_mgc, output_stop, output_attention = self._predict(
            characters, target_mgc)
        losses = []
        index = 0
        for mgc, real_mgc in zip(output_mgc, target_mgc):
            t_mgc = dy.inputVector(real_mgc)
            # losses.append(self._compute_binary_divergence(mgc, t_mgc) )
            losses.append(dy.l1_distance(mgc, t_mgc))

            if index % 3 == 0:
                # attention loss
                if guided_att:
                    att = output_attention[index // 3]
                    losses.append(
                        self._compute_guided_attention(att, index // 3,
                                                       len(characters) + 2,
                                                       num_mgc // 3))
                # EOS loss
                stop = output_stop[index // 3]
                if index >= num_mgc - 6:
                    losses.append(dy.l1_distance(stop, dy.scalarInput(-0.8)))
                else:
                    losses.append(dy.l1_distance(stop, dy.scalarInput(0.8)))
            index += 1
        loss = dy.esum(losses)
        loss_val = loss.value() / num_mgc
        loss.backward()
        self.trainer.update()
        return loss_val
Example #8
    def beam_search(self, char_seq, truth=None, mu=0.):
        start_agenda = Agenda(self.options['beam_size'])
        init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
        init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
        init_score = dy.scalarInput(0.)
        start_agenda.push(Sentence(score=init_score.scalar_value(), score_expr=init_score, LSTMState=init_state, y=init_y, prevState=None, wlen=None))
        agenda = [start_agenda]

        for idx, _ in enumerate(char_seq,1): # from left to right, character by character
            now = Agenda(self.options['beam_size'])
            for wlen in xrange(1,min(idx,self.options['max_word_len'])+1): # generate candidate word vectors
                word = self.word_repr(char_seq[idx-wlen:idx])
                word_score = dy.dot_product(word,self.param_exprs['U'])
                for sent in agenda[idx-wlen]: # join segmentation
                    if truth is not None:
                        margin = dy.scalarInput(mu*wlen if truth[idx-1]!=wlen else 0.)
                        score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score 
                    else:
                        score = sent.score_expr + dy.dot_product(sent.y, word) + word_score 
                    
                    if now.happy_with(score.scalar_value()):
                        new_state = sent.LSTMState.add_input(word)
                        new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() + self.param_exprs['pb'])
                        now.push(Sentence(score=score.scalar_value(),score_expr=score,LSTMState=new_state,y=new_y, prevState=sent, wlen=wlen))
            agenda.append(now)

        if truth is not None:
            return agenda[-1].max().score_expr
        return agenda
Example #9
    def beam_train_max_margin_with_answer_guidence(self, init_state, gold_ans):
        # perform two beam search; one for prediction and the other for state action suff
        # max reward y = argmax(r(y)) with the help of gold_ans
        # max y' = argmax f(x,y) - R(y')
        # loss = max(f(x,y') - f(x,y) + R(y) - R(y') , 0)

        #end_state_list = self.beam_predict(init_state)
        end_state_list = self.beam_predict_max_violation(
            init_state, gold_ans)  # have to use this to make it work....
        reward_list = [x.reward(gold_ans) for x in end_state_list]
        violation_list = [
            s.path_score_expression.value() - reward
            for s, reward in zip(end_state_list, reward_list)
        ]

        best_score_state_idx = violation_list.index(max(
            violation_list))  # find the best scoring seq with minimal reward
        best_score_state = end_state_list[best_score_state_idx]
        best_score_state_reward = reward_list[best_score_state_idx]

        loss_value = 0

        if self.only_one_best:
            best_states = self.beam_find_actions_with_answer_guidence(
                init_state, gold_ans)
            if best_states == []:
                return 0, []
            best_reward_state = best_states[0]
            #print ("debug: found best_reward_state: qid =", best_reward_state.qinfo.seq_qid, best_reward_state)
            best_reward_state_reward = best_reward_state.reward(gold_ans)
            #print ("debug: best_reward_state_reward =", best_reward_state_reward)
            loss = dt.rectify(best_score_state.path_score_expression -
                              best_reward_state.path_score_expression +
                              dt.scalarInput(best_reward_state_reward -
                                             best_score_state_reward))
        else:
            best_states = self.beam_find_actions_with_answer_guidence(
                init_state, gold_ans)
            best_states_rewards = [s.reward(gold_ans) for s in best_states]
            max_reward = max(best_states_rewards)
            best_states = [
                s for s, r in zip(best_states, best_states_rewards)
                if r == max_reward
            ]
            loss = dt.average([
                dt.rectify(best_score_state.path_score_expression -
                           best_reward_state.path_score_expression +
                           dt.scalarInput(max_reward -
                                          best_score_state_reward))
                for best_reward_state in best_states
            ])

        loss_value = loss.value()
        loss.backward()

        self.neural_model.learner.update()

        #print ("debug: beam_train_max_margin_with_answer_guidence done. loss_value =", loss_value)

        return loss_value, best_states
Example #10
    def training_session(self, sentence, print_logger, pool):
        lstm_output = self.network.get_lstm_output(sentence)
        length = len(sentence)
        raw_exprs = self.network.edge_eval.get_complete_raw_exprs(lstm_output)
        yield raw_exprs

        scores = self.network.edge_eval.raw_exprs_to_scores(raw_exprs, length)
        exprs = self.network.edge_eval.raw_exprs_to_exprs(raw_exprs, length)

        gold = [entry.parent_id for entry in sentence]
        heads_future = pool.apply_async(self.decoder,
                                        (scores, gold if self.options.cost_augment else None))
        yield None
        heads = heads_future.get()

        if self.labelsFlag:
            edges = [(head, sentence[modifier].relation, modifier)
                     for modifier, head in enumerate(gold[1:], 1)]
            label_exprs = list(self.network.get_label_scores(lstm_output, edges))
            yield label_exprs
            label_loss = self.label_decoder(edges, label_exprs, self.statistics.labels, True)
        else:
            label_loss = dn.scalarInput(0.0)
            yield []

        head_exprs = [(exprs[h][i] - exprs[g][i] + 1)
                      for i, (h, g) in enumerate(zip(heads, gold)) if
                      h != g]
        print_logger.correct_edge += len(sentence) - len(head_exprs)
        print_logger.total_edge += len(sentence)
        head_loss = dn.esum(head_exprs) if head_exprs else dn.scalarInput(0.0)
        yield label_loss + head_loss
Example #11
    def train(self, indices, gold_arcs, gold_labels, pos_indices=None):
        total_arc_loss = 0
        total_label_loss = 0
        start = time.time()

        for i in range(len(indices)):
            states = self.states(
                indices[i],
                pos_indices[i] if pos_indices is not None else None)
            arc_scores = self.score_arcs(states, value=False)
            label_scores = self.score_labels(states, gold_arcs[i], value=False)

            arc_loss = self.arc_loss(gold_arcs[i], arc_scores)
            label_loss = self.label_loss(gold_labels[i], label_scores)

            if len(arc_loss) > 0:
                arc_loss = dynet.esum(arc_loss)
            else:
                arc_loss = dynet.scalarInput(0)
            if len(label_loss) > 0:
                label_loss = dynet.esum(label_loss)
            else:
                label_loss = dynet.scalarInput(0)

            loss = dynet.esum([arc_loss, label_loss])
            arc_loss = arc_loss.value()
            label_loss = label_loss.value()
            total_arc_loss += arc_loss
            total_label_loss += label_loss
            loss.backward()
            self.trainer.update()

            dynet.renew_cg()
        print(time.time() - start)
        return total_arc_loss, total_label_loss
Example #12
    def copy_src_probs_pick(token_type, token_literal):
        if token_type not in copy_atts:
            return dy.scalarInput(0.0)
        selected_indexes = copy_history[token_type][token_literal]
        if len(selected_indexes) == 0:
            return dy.scalarInput(0.0)
        probs = copy_src_probs(token_type)
        return dy.sum_elems(dy.select_rows(probs, selected_indexes))
Example #13
    def decomp_attend(self, vecsA, vecsB):
        # Fq^T Fc -> need to expedite using native matrix/tensor multiplication
        Fq = vecsA  # the original word vector, not yet passing a NN as in Eq.1, # need a function F
        Fc = vecsB  # need a function F

        expE = []
        for fq in Fq:
            row = []
            for fc in Fc:
                row.append(dt.exp(dt.dot_product(fq, fc)))
            expE.append(row)
        #print ("debug: expE", expE[0][0].value())

        invSumExpEi = []
        for i in xrange(len(Fq)):
            invSumExpEi.append(dt.pow(dt.esum(expE[i]), dt.scalarInput(-1)))

        invSumExpEj = []
        for j in xrange(len(Fc)):
            invSumExpEj.append(
                dt.pow(dt.esum([expE[i][j] for i in xrange(len(Fq))]),
                       dt.scalarInput(-1)))

        beta = []
        for i in xrange(len(Fq)):
            s = dt.esum([Fc[j] * expE[i][j] for j in xrange(len(Fc))])
            beta.append(s * invSumExpEi[i])
        #print("debug: beta", beta[0].value())

        alpha = []
        for j in xrange(len(Fc)):
            # attend over Fq for each Fc position (cf. the symmetric beta above)
            s = dt.esum([Fq[i] * expE[i][j] for i in xrange(len(Fq))])
            alpha.append(s * invSumExpEj[j])
        #print("debug: alpha", alpha[0].value())

        # Compare
        v1i = [
            dt.logistic(dt.concatenate([Fq[i], beta[i]]))
            for i in xrange(len(Fq))
        ]  # need a function G
        v2j = [
            dt.logistic(dt.concatenate([Fc[j], alpha[j]]))
            for j in xrange(len(Fc))
        ]  # need a function G

        #print ("debug: v1i", v1i[0].value())
        #print ("debug: v2j", v2j[0].value())

        # Aggregate

        v1 = dt.esum(v1i)
        v2 = dt.esum(v2j)

        #print ("debug: v1.value()", v1.value())
        #print ("debug: v2.value()", v2.value())

        #colScore = dt.logistic(dt.dot_product(self.SelHW, dt.concatenate([v1,v2])))
        return dt.dot_product(v1, v2)
Example #14
    def other_loss_function(self, pred, gold, word_id, lexicon):
        if len(lexicon[word_id]) > 0:
            n_labels = len(lexicon[word_id])
            return dynet.scalarInput(1.0 / n_labels) * dynet.esum(
                [-dynet.log(pred[k]) for k in sorted(lexicon[word_id])])
        else:
            n_labels = len(lexicon["LEX_POS"])
            return dynet.scalarInput(1.0 / n_labels) * dynet.esum(
                [-dynet.log(pred[k]) for k in sorted(lexicon["LEX_POS"])])
Example #15
    def _attend(self, query, mask=None):
        # query ((H), B)
        # mask  ((T, 1), B)
        query = unsqueeze(query, 0)
        # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
        attn_scores = dy.cdiv(dy.transpose(query * self.context), dy.scalarInput(self.scale))
        if mask is not None:
            attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
        return dy.softmax(attn_scores)  # ((T, 1), B)
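
The mask argument above is a pair: mask[0] zeroes padded positions multiplicatively and mask[1] pushes them toward -1e9 before the softmax. A sketch of how such a pair could be built for a batch of sequences padded to length T (this helper is an assumption, not part of the original code):

import numpy as np

# Hypothetical helper producing the (multiplicative, additive) mask pair
# consumed by _attend; shapes follow the ((T, 1), B) convention above.
def make_attention_mask(lengths, T):
    keep_np = np.zeros((T, 1, len(lengths)))
    for b, length in enumerate(lengths):
        keep_np[:length, 0, b] = 1.0
    keep = dy.inputTensor(keep_np, batched=True)        # 1 where attending is allowed
    anti = dy.inputTensor(1.0 - keep_np, batched=True)  # 1 at padded positions
    return keep, anti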
Example #16
    def _make_input(self, seq, lang_id, runtime):
        x_list = []
        encoder_states_list = [None]
        lang_emb = self.lang_embeddings[lang_id]
        # add the root

        x_list.append(self.padd_embeddings[0])

        for entry in seq:
            word = entry.word

            # prepare lexical embeddings
            char_emb, encoder_states = self.character_network.compute_embeddings(
                word, runtime=runtime, language_embeddings=lang_emb)
            encoder_states_list.append(encoder_states)

            word = word.lower()

            if word in self.encodings.word2int:
                holistic_emb = self.holistic_embeddings[
                    self.encodings.word2int[word]]
            else:
                holistic_emb = self.holistic_embeddings[
                    self.encodings.word2int['<UNK>']]

            # dropout lexical embeddings
            if runtime:
                w_emb = char_emb + holistic_emb
            else:
                p1 = random.random()
                p2 = random.random()

                m1 = 1
                m2 = 1

                if p1 < self.config.input_dropout_prob:
                    m1 = 0
                if p2 < self.config.input_dropout_prob:
                    m2 = 0
                scale = 1.0
                if m1 + m2 > 0:
                    scale = float(2) / (m1 + m2)
                m1 = dy.scalarInput(m1)
                m2 = dy.scalarInput(m2)

                scale = dy.scalarInput(scale)
                w_emb = (char_emb * m1 + holistic_emb * m2) * scale

            x_list.append(dy.concatenate([w_emb, lang_emb]))

        # close sequence
        x_list.append(self.padd_embeddings[1])

        encoder_states_list.append(None)
        return x_list, encoder_states_list
Example #17
    def greedy_search(self, char_seq, truth=None, mu=0.):
        init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
        init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
        init_score = dy.scalarInput(0.)
        init_sentence = Sentence(score=init_score.scalar_value(), score_expr=init_score, LSTMState=init_state, y=init_y, prevState=None, wlen=None, golden=True)
        
        if truth is not None:
            cembs = [ dy.dropout(dy.lookup(self.params['embed'],char),self.options['dropout_rate']) for char in char_seq ]
        else:
            cembs = [dy.lookup(self.params['embed'],char) for char in char_seq ]
            #cembs = [ dy.dropout(dy.lookup(self.params['embed'],char),self.options['dropout_rate']) for char in char_seq ]

        start_agenda = init_sentence
        agenda = [start_agenda]

        for idx, _ in enumerate(char_seq,1): # from left to right, character by character
            now = None
            for wlen in range(1,min(idx,self.options['max_word_len'])+1): # generate word candidate vectors
                # join segmentation sent + word
                word = self.word_repr(char_seq[idx-wlen:idx], cembs[idx-wlen:idx])
                sent = agenda[idx-wlen]

                if truth is not None:
                    word = dy.dropout(word,self.options['dropout_rate'])
                
                word_score = dy.dot_product(word,self.param_exprs['U'])

                if truth is not None:
                    golden =  sent.golden and truth[idx-1]==wlen
                    margin = dy.scalarInput(mu*wlen if truth[idx-1]!=wlen else 0.)
                    score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score
                else:
                    golden = False
                    score = sent.score_expr + dy.dot_product(sent.y, word) + word_score


                good = (now is None or now.score < score.scalar_value())
                if golden or good:
                    new_state = sent.LSTMState.add_input(word)
                    new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() + self.param_exprs['pb'])
                    new_sent = Sentence(score=score.scalar_value(),score_expr=score,LSTMState=new_state,y=new_y, prevState=sent, wlen=wlen, golden=golden)
                    if good:
                        now = new_sent
                    if golden:
                        golden_sent = new_sent

            agenda.append(now)
            if truth is not None and truth[idx-1]>0 and (not now.golden):
                return (now.score_expr - golden_sent.score_expr)

        if truth is not None:
            return (now.score_expr - golden_sent.score_expr)

        return agenda
Example #18
    def _attend(self, query, mask=None):
        # query ((H), B)
        # mask  ((T, 1), B)
        query = unsqueeze(query, 0)
        # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
        attn_scores = dy.cdiv(dy.transpose(query * self.context),
                              dy.scalarInput(self.scale))
        if mask is not None:
            attn_scores = dy.cmult(attn_scores,
                                   mask[0]) + (mask[1] * dy.scalarInput(-1e9))
        return dy.softmax(attn_scores)  # ((T, 1), B)
Example #19
    def get_last_layer_context_representations(
            self, sentence, context_representations_for_crf_loss,
            context_representations_for_md_loss):
        last_layer_context_representations = context_representations_for_crf_loss

        if self.parameters['active_models'] in [1, 2, 3]:

            if self.parameters['active_models'] == 1 and \
                   self.parameters['integration_mode'] != 0:
                assert False, "integration_mode should be set to zero when active_models == 1"

            if self.parameters['debug'] == 1:
                print(("str_words", sentence["str_words"]))
            morph_analysis_representations, morph_analysis_scores = \
                self.get_morph_analysis_representations_and_scores(sentence,
                                                                   context_representations_for_md_loss)

            selected_morph_analysis_representations = \
                self.disambiguate_morph_analyzes(morph_analysis_scores)

            if 'golden_morph_analysis_indices' in list(sentence.keys()):
                md_loss = dynet.esum([
                    dynet.pickneglogsoftmax(morph_analysis_scores_for_word,
                                            golden_idx)
                    for golden_idx, morph_analysis_scores_for_word in zip(
                        sentence['golden_morph_analysis_indices'],
                        morph_analysis_scores)
                ])
            else:
                md_loss = dynet.scalarInput(0)

            if self.parameters['integration_mode'] == 2:
                # alternatively, implement two layers of context: use the first
                # for morphological disambiguation, then concatenate the selected
                # morphological analysis representation when calculating tag_scores
                last_layer_context_representations = \
                    [dynet.concatenate([context,
                                        morph_analysis_representations[word_pos]
                                        [selected_morph_analysis_representation_pos]])
                     for word_pos, (selected_morph_analysis_representation_pos, context) in
                     enumerate(
                         zip(selected_morph_analysis_representations, context_representations_for_crf_loss))]
            if md_loss.value() > 1000:
                logging.error("BEEP")
        else:
            # only the plain old NER model
            # we must decide whether we should implement the morphological embeddings scheme here.
            md_loss = dynet.scalarInput(0)
            selected_morph_analysis_representations = None
            last_layer_context_representations = context_representations_for_crf_loss

        assert last_layer_context_representations is not None
        return last_layer_context_representations, md_loss, selected_morph_analysis_representations
Example #20
def get_factor_expressions(fws,
                           bws,
                           tfemb,
                           tfdict,
                           valid_fes,
                           sentence,
                           spaths_x=None,
                           cpaths_x=None):
    factexprs = {}
    sentlen = len(fws)

    sortedtfd = sorted(list(tfdict.keys()))
    targetspan = (sortedtfd[0], sortedtfd[-1])

    for j in range(sentlen):
        istart = 0
        if USE_SPAN_CLIP and j > ALLOWED_SPANLEN:
            istart = max(0, j - ALLOWED_SPANLEN)
        for i in range(istart, j + 1):

            spanlen = dy.scalarInput(j - i + 1)
            logspanlen = dy.scalarInput(math.log(j - i + 1))
            spanwidth = sp_x[SpanWidth.howlongisspan(i, j)]
            spanpos = ap_x[ArgPosition.whereisarg((i, j), targetspan)]

            fbemb_ij_basic = dy.concatenate([
                fws[i][j], bws[i][j], tfemb, spanlen, logspanlen, spanwidth,
                spanpos
            ])
            if USE_DEPS:
                outs = oh_s[OutHeads.getnumouts(i, j, sentence.outheads)]
                shp = spaths_x[sentence.shortest_paths[(i, j, targetspan[0])]]
                fbemb_ij = dy.concatenate([fbemb_ij_basic, outs, shp])
            elif USE_CONSTITS:
                isconstit = dy.scalarInput((i, j) in sentence.constitspans)
                lca = ct_x[sentence.lca[(i, j)][1]]
                phrp = cpaths_x[sentence.cpaths[(i, j, targetspan[0])]]
                fbemb_ij = dy.concatenate(
                    [fbemb_ij_basic, isconstit, lca, phrp])
            else:
                fbemb_ij = fbemb_ij_basic

            for y in valid_fes:
                fctr = Factor(i, j, y)
                if USE_HIER and y in feparents:
                    fefixed = dy.esum([fe_x[y]] +
                                      [fe_x[par] for par in feparents[y]])
                else:
                    fefixed = fe_x[y]
                fbemb_ijy = dy.concatenate([fefixed, fbemb_ij])
                factexprs[fctr] = w_f * dy.rectify(w_z * fbemb_ijy + b_z) + b_f
    return factexprs
Example #21
    def beam_search(self, char_seq, truth=None, mu=0.):
        start_agenda = Agenda(self.options['beam_size'])
        init_state = self.params['lstm'].initial_state().add_input(
            self.param_exprs['<bos>'])
        init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() +
                         self.param_exprs['pb'])
        init_score = dy.scalarInput(0.)
        start_agenda.push(
            Sentence(score=init_score.scalar_value(),
                     score_expr=init_score,
                     LSTMState=init_state,
                     y=init_y,
                     prevState=None,
                     wlen=None))
        agenda = [start_agenda]

        for idx, _ in enumerate(
                char_seq, 1):  # from left to right, character by character
            now = Agenda(self.options['beam_size'])
            for wlen in xrange(1,
                               min(idx, self.options['max_word_len']) +
                               1):  # generate candidate word vectors
                word = self.word_repr(char_seq[idx - wlen:idx])
                word_score = dy.dot_product(word, self.param_exprs['U'])
                for sent in agenda[idx - wlen]:  # join segmentation
                    if truth is not None:
                        margin = dy.scalarInput(
                            mu * wlen if truth[idx - 1] != wlen else 0.)
                        score = margin + sent.score_expr + dy.dot_product(
                            sent.y, word) + word_score
                    else:
                        score = sent.score_expr + dy.dot_product(
                            sent.y, word) + word_score

                    if now.happy_with(score.scalar_value()):
                        new_state = sent.LSTMState.add_input(word)
                        new_y = dy.tanh(self.param_exprs['pW'] *
                                        new_state.output() +
                                        self.param_exprs['pb'])
                        now.push(
                            Sentence(score=score.scalar_value(),
                                     score_expr=score,
                                     LSTMState=new_state,
                                     y=new_y,
                                     prevState=sent,
                                     wlen=wlen))
            agenda.append(now)

        if truth is not None:
            return agenda[-1].max().score_expr
        return agenda
Example #22
def calc_loss(batch_distances, batch_scores, lamb):

    batch_losses = [
        lamb * dy.scalarInput(d) - (batch_scores[0] - s)
        for s, d in zip(batch_scores[1:], batch_distances[1:])
    ]
    losses_pos = [
        l if l.value() >= 0 else dy.scalarInput(0) for l in batch_losses
    ]

    if len(losses_pos) == 0:
        return 0

    return dy.esum(losses_pos)
Example #23
        def rule_loss_collector(
                beam_item  # type: BeamItem
        ):
            if beam_item.sync_rule is None:
                yield dn.scalarInput(0.0)
                return
            node_info = beam_item.node_info_ref()
            if node_info.early_updated:
                yield dn.scalarInput(0.0)
                return  # a bare return ends the generator (PEP 479-safe)

            if self.options.use_graph_embedding:
                # calculate rule loss
                correspondents = node_info.correspondents
                pred_expr = correspondents[beam_item.sync_rule]
                gold_expr = correspondents[node_info.gold_rule]
                loss = (pred_expr - gold_expr) if pred_expr is not gold_expr else dn.scalarInput(0.0)
            else:
                loss = dn.scalarInput(0.0)

            # add rule correctness statistics
            print_logger.total_count += 1
            if beam_item.sync_rule == node_info.gold_rule:
                print_logger.correct_count += 1

            # calculate edge loss
            gold_item = node_info.gold_item
            if gold_item is not beam_item:
                predict_edge_scores = edge_feature_to_scores(
                    beam_item.own_features - gold_item.own_features,
                    return_expr=True)  # ignore common features
                gold_edge_scores = edge_feature_to_scores(
                    gold_item.own_features - beam_item.own_features, True)
                print_logger.total_gold_score += gold_item.score
                print_logger.total_predict_score += beam_item.score

                struct_loss = predict_edge_scores - gold_edge_scores
                loss += struct_loss
            yield loss

            # loss of children node
            if beam_item.left is not None:
                for i in rule_loss_collector(beam_item.left):
                    yield i
            if beam_item.right is not None:
                for i in rule_loss_collector(beam_item.right):
                    yield i
            node_info.early_updated = 1
Example #24
    def greedy_train_max_sumlogllh(self, init_state, gold_actions):

        total_obj = dt.scalarInput(0)

        cur_state = init_state
        res = 0
        idx = 0
        while True:
            if cur_state.is_end():
                break

            action_list = list(cur_state.get_action_set())
            new_expression_list, meta_info_list = cur_state.get_next_score_expressions(
                action_list)
            prob_list = dt.softmax(new_expression_list)
            gold_action = gold_actions[idx]
            action_idx = action_list.index(gold_action)
            total_obj += -(dt.log(prob_list[action_idx]))

            cur_state = cur_state.get_new_state_after_action(
                gold_action, meta_info_list[action_idx])
            idx += 1
            #print (cur_state)

        res = total_obj.scalar_value()
        total_obj.backward()
        self.neural_model.learner.update()

        return res
Example #25
    def learn(self, seq):
        output, proj_x3 = self._predict(seq, runtime=False)

        # arcs
        for iSrc in range(len(seq)):
            for iDst in range(len(seq)):
                if iDst > iSrc:
                    o = output[iSrc][iDst]  # the softmax portion
                    t = get_link(seq, iSrc, iDst)
                    # if t==1:
                    # self.losses.append(-dy.log(dy.pick(o, t)))
                    self.losses.append(dy.binary_log_loss(
                        o, dy.scalarInput(t)))

        # labels
        gs_chains, labels = self._get_gs_chains(seq)

        for chain, label in zip(gs_chains, labels):
            label_rnn = self.label_decoder.initial_state()
            for index in chain:
                label_rnn = label_rnn.add_input(proj_x3[index])
            label_softmax = dy.softmax(
                self.label_w.expr(update=True) * label_rnn.output() +
                self.label_b.expr(update=True))
            self.losses.append(-dy.log(
                dy.pick(label_softmax, self.encodings.label2int[label])))
Example #26
    def decode(self, states, y, encoded_input, train=False):
        def sample(probs):
            return np.argmax(probs)

        s = self.decoder_rnn.initial_state()

        start_encoded = self.l2e["sep"].encode("<s>", "sep")
        out = []
        loss = dy.scalarInput(0.)
        #s =  s.add_input(states[-1]) #s.add_input(dy.concatenate([start_encoded, states[-1]]))
        s = s.add_input(dy.concatenate([start_encoded, states[-1]]))

        generated_string = []

        for char in y:
            true_char_encoded = self.l2e["l"].encode(char, "l")

            scores = self.predict_letter(s.output(), y)

            generated_string.append(scores)

            weighted_states = self.attend(s.output(), states, encoded_input)
            #s = s.add_input(weighted_states) #s.add_input(dy.concatenate([true_char_encoded, weighted_states]))
            s = s.add_input(
                dy.concatenate([true_char_encoded, weighted_states]))
            if char in self.C2I:
                loss += dy.pickneglogsoftmax(scores, self.C2I[char])

        return loss, generated_string
Example #27
    def get_loss_classification(self, inputs, seq):
        """
        Computes classification loss for this sequence based on input vectors.
        """
        W_ent1 = dy.parameter(self.l2ent1)
        W_ent1b = dy.parameter(self.l2ent1b)
        W_ent2 = dy.parameter(self.l2ent2)
        W_ent2b = dy.parameter(self.l2ent2b)

        def ff(h):
            return W_ent2 * dy.tanh(W_ent1 * h + W_ent1b) + W_ent2b

        boundaries = get_boundaries(seq.l_seq) if self.__is_training else \
                     get_boundaries(seq.bio_pred)
        losses = []
        if not self.__is_training: seq.ent_pred = []
        for (s, t, entity) in boundaries:
            h = bilstm_single(inputs[s:t + 1], self.elstm1, self.elstm2)
            g = ff(h)
            if self.__is_training:
                gold = self.e_enc[entity]
                losses.append(dy.pickneglogsoftmax(g, gold))
            else:
                seq.ent_pred.append(self.e_dec[np.argmax(g.npvalue())])
                if self.entemb_path:
                    string = stringfy(seq.w_seq, s, t)
                    with open(self.entemb_path, 'a') as inf:
                        inf.write(string + '\t')
                        for val in g.vec_value():
                            inf.write(str(val) + ' ')
                        inf.write('\n')

        classification_loss = dy.esum(losses) if losses else dy.scalarInput(0.)

        return classification_loss
Example #28
    def get_loss_boundary(self, inputs, seq):
        """
        Computes boundary loss for this sequence based on input vectors.
        """
        W_bio1 = dy.parameter(self.l2bio1)
        W_bio1b = dy.parameter(self.l2bio1b)
        W_bio2 = dy.parameter(self.l2bio2)
        W_bio2b = dy.parameter(self.l2bio2b)

        def ff(h):
            return W_bio2 * dy.tanh(W_bio1 * h + W_bio1b) + W_bio2b

        gs = [ff(h) for h in inputs]  # Inputs now 3 dimensional ("BIO scores")

        if self.loss == "global":
            boundary_loss = self.get_loss_boundary_global(gs, seq)
        elif self.loss == "local":
            boundary_loss = self.get_loss_boundary_local(gs, seq)
        else:
            sys.exit("Unknown loss \"{0}\"".format(self.loss))

        return boundary_loss

    def get_loss_boundary_local(self, gs, seq):
        losses = []
        if not self.__is_training: seq.bio_pred = []
        for i, g in enumerate(gs):
            if self.__is_training:
                gold = self.__BIO_ENC[seq.l_seq[i][0]]
                losses.append(dy.pickneglogsoftmax(g, gold))
            else:
                seq.bio_pred.append(self.__BIO_DEC[np.argmax(g.npvalue())])
        return dy.esum(losses) if losses else dy.scalarInput(0.)
Example #29
    def get_loss_boundary_global(self, score_vecs, seq):
        start_b = dy.parameter(self.start_bias)
        T = dy.parameter(self.trans_mat)
        end_b = dy.parameter(self.end_bias)

        if not self.__is_training:
            seq.bio_pred = viterbi(start_b, T, end_b, score_vecs, self.valid)
            return dy.scalarInput(0.)

        pi = [[None for _ in xrange(3)] for _ in xrange(len(score_vecs))]

        for y in xrange(3):
            pi[0][y] = score_vecs[0][y] + start_b[y]

        for i in xrange(1, len(pi)):
            for y in xrange(3):
                pi[i][y] = dy.logsumexp([
                    pi[i - 1][y_prev] + T[y_prev][y] + score_vecs[i][y]
                    for y_prev in xrange(3)
                ])

        normalizer = dy.logsumexp([pi[-1][y] + end_b[y] for y in xrange(3)])
        gold_score = score_crf(start_b, T, end_b, score_vecs,
                               [self.__BIO_ENC[l[0]] for l in seq.l_seq])

        return normalizer - gold_score
Example #30
    def beam_train_max_margin(self, init_state, gold_ans):
        # does not use the gold sequence directly; min-risk style training
        # max reward y = argmax(r(y))
        # max y' = argmax f(x,y) - R(y')
        # loss = max(f(x,y') - f(x,y) + R(y) - R(y'), 0)

        end_state_list = self.beam_predict(init_state)
        reward_list = [x.reward(gold_ans) for x in end_state_list]
        violation_list = [
            s.score - reward for s, reward in zip(end_state_list, reward_list)
        ]

        best_score_state_idx = violation_list.index(max(
            violation_list))  # find the best scoring seq with minimal reward
        best_reward_state_idx = reward_list.index(
            max(reward_list))  # find seq with the max reward in beam

        best_score_state = end_state_list[best_score_state_idx]
        best_reward_state = end_state_list[best_reward_state_idx]

        best_score_state_reward = reward_list[best_score_state_idx]
        best_reward_state_reward = reward_list[best_reward_state_idx]

        loss = dt.rectify(best_score_state.path_score_expression -
                          best_reward_state.path_score_expression +
                          dt.scalarInput(best_reward_state_reward -
                                         best_score_state_reward))
        loss_value = loss.value()

        loss.backward()

        self.neural_model.learner.update()
        return loss_value
Example #31
    def train(self, mini_batch, num_train, k):
        words, pos_tags, chars, langs, signs, masks = mini_batch
        # Getting the last hidden layer from BiLSTM.
        rnn_out = self.rnn_mlp(mini_batch, True)
        h_out = rnn_out[-1]
        t_out_d = dy.reshape(h_out, (h_out.dim()[0][0], h_out.dim()[1]))
        t_out = dy.transpose(t_out_d)

        # Calculating the kq values for NCE.
        kq = dy.scalarInput(float(k) / num_train)
        lkq = dy.log(kq)

        loss_values = []
        for i in range(len(langs)):
            for j in range(i + 1, len(langs)):
                if (langs[i] != langs[j]) and (signs[i] == 1 or signs[j] == 1):
                    lu = -dy.squared_distance(t_out[i], t_out[j])
                    denom = dy.log(dy.exp(lu) + kq)
                    if signs[i] == signs[j]:  # both one
                        nom = lu
                    else:
                        nom = lkq
                    loss_values.append(denom - nom)

        err_value = 0
        if len(loss_values) > 0:
            err = dy.esum(loss_values) / len(loss_values)
            err.forward()
            err_value = err.value()
            err.backward()
            self.trainer.update()
        dy.renew_cg()
        return err_value
Example #32
    def beam_train_max_margin_with_goldactions(self, init_state, gold_actions):
        #max y = gold y
        #max y' = argmax f(x,y)
        # loss = max(f(x,y') - f(x,y) + R(y) - R(y') , 0)

        #loss
        #end_state_list = self.beam_predict(init_state)  # top-k argmax_y f(x,y)
        end_state_list = self.beam_predict_max_violation(
            init_state, gold_actions
        )  # top-k argmax_y f(x,y) + R(y*) - R(y)  // Current implementation is the same as Hamming distance
        best_score_state = end_state_list[0]
        reward_list = [x.reward(gold_actions) for x in end_state_list]

        best_reward_state = self.get_goldstate_with_gold_actions(
            init_state, gold_actions)
        best_reward = best_reward_state.reward(gold_actions)

        loss = dt.rectify(best_score_state.path_score_expression -
                          best_reward_state.path_score_expression +
                          dt.scalarInput(best_reward - reward_list[0]))
        loss_value = loss.value()

        loss.backward()
        self.neural_model.learner.update()
        return loss_value
Example #33
def get_constit_loss(fws, bws, goldspans):
    if not USE_PTB_CONSTITS:
        raise Exception("should not be using the constit loss now!",
                        USE_PTB_CONSTITS)

    if len(goldspans) == 0:
        return None, 0

    losses = []
    sentlen = len(fws)

    for j in range(sentlen):
        istart = 0
        if USE_SPAN_CLIP and j > ALLOWED_SPANLEN:
            istart = max(0, j - ALLOWED_SPANLEN)
        for i in range(istart, j + 1):
            constit_ij = w_c * dy.rectify(
                w_fb * dy.concatenate([fws[i][j], bws[i][j]]) + b_fb) + b_c
            logloss = dy.log_softmax(constit_ij)

            isconstit = int((i, j) in goldspans)
            losses.append(dy.pick(logloss, isconstit))

    ptbconstitloss = dy.scalarInput(DELTA) * -dy.esum(losses)
    numspanstagged = len(losses)
    return ptbconstitloss, numspanstagged
Example #34
def loss_function(recon_x, x, mu, logvar):
    BCE = dy.binary_log_loss(recon_x, x)  # equiv to torch.nn.functional.binary_cross_entropy(?,?, size_average=False)
    # see Appendix B from VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * dy.sum_elems(1 + logvar - dy.pow(mu, dy.scalarInput(2)) - dy.exp(logvar))

    return BCE + KLD
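
loss_function expects recon_x in (0, 1) along with the encoder's mu and logvar. One way to wire it into a training step is sketched below; enc_mu, enc_logvar, and decode are hypothetical model components standing in for parts not shown in the example.

# Hypothetical forward pass around loss_function, using the standard
# reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I).
def vae_step(x_vec, enc_mu, enc_logvar, decode):
    x = dy.inputVector(x_vec)
    mu, logvar = enc_mu(x), enc_logvar(x)
    eps = dy.random_normal(mu.dim()[0])
    z = mu + dy.cmult(dy.exp(logvar * 0.5), eps)
    recon_x = dy.logistic(decode(z))  # squash into (0, 1) for binary_log_loss
    return loss_function(recon_x, x, mu, logvar)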
Example #35
    def _attend(self, query, mask=None):
        # query ((H), B)
        # mask  ((T, 1), B)
        projected_state = self.decoder * query  # ((H,), B)
        non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
        attn_scores = dy.transpose(self.v * non_lin)  # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
        if mask is not None:
            attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
        return dy.softmax(attn_scores)  # ((T, 1), B)
Example #36
    def truth_score(self, word_seq):

        wembs = [self.param_exprs['<bos>']] + [self.word_repr(word) for word in word_seq]
        init_state = self.params['lstm'].initial_state()
        hidden_states = init_state.transduce(wembs)
        score = dy.scalarInput(0.)
        for h, w in zip(hidden_states[:-1], wembs[1:]):
            y = dy.tanh(self.param_exprs['pW'] * h + self.param_exprs['pb'])
            score = score + dy.dot_product(y, w) + dy.dot_product(w, self.param_exprs['U'])
        return score
Example #37
    def score_sentence(self, emissions, tags):
        """Get the score of a given sentence.

        :param emissions: List[dy.Expression ((H,), B)]
        :param tags: List[int]

        Returns:
            dy.Expression ((1,), B)
        """
        tags = np.concatenate((np.array([self.start_idx], dtype=int), tags))
        score = dy.scalarInput(0)
        transitions = self.transitions
        for i, e in enumerate(emissions):
            # Due to Dynet being column based it is best to use the transition
            # matrix so that x -> y is T[y, x].
            score += dy.pick(dy.pick(transitions, tags[i + 1]), tags[i]) + dy.pick(e, tags[i + 1])

        score += dy.pick(dy.pick(transitions, self.end_idx), tags[-1])
        return score
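
score_sentence gives the unnormalized log-score of the gold tag path; the usual CRF loss subtracts it from the log-partition over all paths. A sketch assuming a companion forward-algorithm method (the _forward name is hypothetical, analogous to the logsumexp recursion in the boundary-loss example above):

    def neg_log_loss(self, emissions, tags):
        # NLL = log Z(x) - score(x, y); _forward is assumed to compute the
        # log-partition with a logsumexp recursion over the same transitions.
        return self._forward(emissions) - self.score_sentence(emissions, tags)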
Example #38
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.
    """
    
    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0: return 0.0
    
    if opts.debug:
        dy.renew_cg(immediate_compute = True, check_validity = True)
    else:
        dy.renew_cg()
    
    # compute association score as dynet expression (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'\
                         .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)


    # false targets likelihood - negative sampling (uniform)
    # collect negative samples
    if opts.nll:
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []
    neg_samples = [np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))]
    # remove source and true targets if applicable
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))
    for (i,ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'\
                         .format(node, side, ns, ns_assoc_score.scalar_value()))
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))


    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum([dy.pickneglogsoftmax(dy.concatenate(scrs), 0) for scrs in sample_scores])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)
    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'\
                         .format(node, sc_loss))
                         
    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'\
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss
Example #39
pW1 = m.add_parameters((HIDDEN_SIZE, 2), device="GPU:1")
pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

if len(sys.argv) == 2:
  m.populate_from_textfile(sys.argv[1])

dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)

x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1*x) + b1)
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2*h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    y_pred = dy.logistic((V*h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1 
    F = 0 
else:
    y_pred = (V*h2_cpu) + a 
    loss = dy.squared_distance(y_pred, y)
    T = 1 
    F = -1
Example #40
HIDDEN_SIZE = 8
ITERATIONS = 2000

m = dy.Model()
trainer = dy.SimpleSGDTrainer(m)

W = m.add_parameters((HIDDEN_SIZE, 2))
b = m.add_parameters(HIDDEN_SIZE)
V = m.add_parameters((1, HIDDEN_SIZE))
a = m.add_parameters(1)

if len(sys.argv) == 2:
  m.populate_from_textfile(sys.argv[1])

x = dy.vecInput(2)
y = dy.scalarInput(0)
h = dy.tanh((W*x) + b)
if xsent:
    y_pred = dy.logistic((V*h) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V*h) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1


for iter in range(ITERATIONS):
    mloss = 0.0