Example #1
    def Predict(self, conll_path, BATCH_SIZE=5):
        nwords = 0
        nsents = 0
        with open(conll_path, 'r') as conllFP:
            for iSentence, sentence_batch in enumerate(read_conll_batch(conllFP, False, BATCH_SIZE), 1):
                self.Init()

                # init initial stack and buffer pairs into sents
                sents = [self.init_sentence(s) for s in sentence_batch]

                hoffset = 1 if self.headFlag else 0
                while sents:
                    new_sents = []
                    exprs = []
                    for (stack, buf) in sents:
                        if len(buf) == 1 and len(stack) == 0:
                            continue

                        new_sents.append((stack, buf))

                        routput, output = self.__evaluate(stack, buf, False)
                        exprs.append((routput, output, stack, buf))
                    sents = new_sents

                    # run forward on all the expressions
                    _es = [_x[0] for _x in exprs] + [_x[1] for _x in exprs]
                    if _es: dy.forward(_es)
                    for routput, output, stack, buf in exprs:
                        scores = self.exprs_to_scores(routput, output, stack, buf, False)
                        best = max(chain(*scores), key=itemgetter(2))
                        self.apply_action(best, stack, buf, hoffset)
                dy.renew_cg()
                for sent in sentence_batch: yield sent
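
The pattern above recurs throughout this page: build many small expressions on one computation graph, call dy.forward() once so DyNet can evaluate them together (batched, if autobatching is enabled), then read each value cheaply. A minimal, self-contained sketch of that idiom; the shapes and names are toy assumptions, not from this example's codebase:

import dynet as dy
import numpy as np

pc = dy.ParameterCollection()
W = pc.add_parameters((3, 5))  # toy scorer: 5-dim input -> 3 scores

dy.renew_cg()
exprs = [dy.parameter(W) * dy.inputVector([0.1 * i] * 5) for i in range(8)]
dy.forward(exprs)              # one evaluation for all 8 expressions
preds = [int(np.argmax(e.npvalue())) for e in exprs]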
Example #2
    def train(self, trees, data_train=None):
        print_logger = PrintLogger()
        pool = Pool(self.options.concurrent_count)
        print_per = (100 // self.options.batch_size + 1) * self.options.batch_size
        self.sent_embeddings.rnn.set_dropout(self.options.lstm_dropout)
        for sentence_idx, batch_idx, batch_trees in split_to_batches(
                trees, self.options.batch_size):
            if sentence_idx % print_per == 0 and sentence_idx != 0:
                print_logger.print(sentence_idx)
            sessions = [self.training_session(tree, print_logger, pool)
                        for tree in batch_trees]

            batch_size_2 = int(math.ceil(len(sessions) / 2) + 0.5)
            assert batch_size_2 * 2 >= len(sessions)
            for _, _, sub_sessions in split_to_batches(
                    sessions, batch_size_2):
                # stage 1: generate all expressions and run one forward pass
                exprs = [i for session in sub_sessions for i in next(session)]
                if exprs:
                    dn.forward(exprs)
                # stage 2: spawn the decoders (futures from the worker pool)
                futures = [next(session) for session in sub_sessions]
            # stage 3: collect every session's loss
            loss = dn.esum([next(session) for session in sessions]) / len(sessions)

            # update
            print_logger.total_loss += loss.scalar_value()
            loss.backward()
            self.optimizer.update()
            dn.renew_cg()
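
Example #2 (and #3, #18, #21 below) drives training through a generator-based "session" protocol: each session first yields the expressions it needs evaluated, and only after one forward pass over the whole batch yields its loss. A toy two-stage version of that protocol, with assumed names and dimensions:

import dynet as dy

pc = dy.ParameterCollection()
trainer = dy.SimpleSGDTrainer(pc)
w = pc.add_parameters((4,))

def training_session(vec, target):
    score = dy.dot_product(dy.parameter(w), dy.inputVector(vec))
    yield [score]  # stage 1: hand back the expressions to batch
    yield dy.squared_distance(score, dy.scalarInput(target))  # stage 2: loss

dy.renew_cg()
sessions = [training_session([0.1, 0.2, 0.3, 0.4], 1.0) for _ in range(8)]
exprs = [e for s in sessions for e in next(s)]
dy.forward(exprs)  # one forward pass over every session's expressions
loss = dy.esum([next(s) for s in sessions]) / len(sessions)
loss.backward()
trainer.update()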
Example #3
    def train_gen(self, sentences, update=True):
        print_logger = PrintLogger()
        pool = Pool(self.options.concurrent_count)
        self.network.sent_embedding.rnn.set_dropout(self.options.lstm_dropout)
        print_per = (100 // self.options.batch_size + 1) * self.options.batch_size

        for sentence_idx, batch_idx, batch_sentences in split_to_batches(
                sentences, self.options.batch_size):
            if sentence_idx % print_per == 0 and sentence_idx != 0:
                print_logger.print(sentence_idx)
            sessions = [self.training_session(sentence, print_logger, pool)
                        for sentence in batch_sentences]
            all_exprs = [next(i) for i in sessions]
            if all_exprs:
                dn.forward(all_exprs)
            # spawn decoders
            for i in sessions:
                next(i)
            all_labels_exprs = [j for i in sessions for j in next(i)]
            if all_labels_exprs:
                dn.forward(all_labels_exprs)
            loss = sum(next(i) for i in sessions) / len(sessions)
            print_logger.total_loss_value += loss.value()
            if update:
                loss.backward()
                self.optimizer.update()
                dn.renew_cg()
            yield (loss if not update else None)
Example #4
 def predict(self, trees):
     self.sent_embeddings.rnn.disable_dropout()
     for sentence_idx, batch_idx, batch_trees in split_to_batches(
             trees, self.options.batch_size):
         sessions = [self.predict_session(tree) for tree in batch_trees]
         exprs = [i for session in sessions for i in next(session)]
         dn.forward(exprs)
         for session in sessions:
             yield next(session)
         dn.renew_cg()
Example #5
def rerank(data_file, filename, scoring_model, qrels, qrels_file):
    queries_to_rerank = rerank_query_generator_ram(data_file)

    res_dict = {'questions': []}
    pred_batch_size = 100
    for q in tqdm(queries_to_rerank, desc='queries', dynamic_ncols=True):

        query = q['token_inds']
        query_idf = q['idf']
        scores = []
        dev_batches = chunks(range(len(q['retrieved_documents']['doc_list'])),
                             pred_batch_size)
        for batch in dev_batches:
            batch_preds = []
            dy.renew_cg()  # new computation graph
            for i in batch:
                doc = q['retrieved_documents']['doc_list'][i]
                doc_bm25 = q['retrieved_documents']['doc_normBM25'][i]
                doc_overlap = q['retrieved_documents']['doc_overlap'][i]
                batch_preds.append(
                    scoring_model.predict_doc_score(doc, query_idf, doc_bm25,
                                                    doc_overlap))
            dy.forward(batch_preds)
            scores += [pred.npvalue()[0] for pred in batch_preds]
        retr_scores = list(zip(q['retrieved_documents']['doc_ids'], scores))
        sorted_retr_scores = sorted(retr_scores,
                                    key=lambda x: x[1],
                                    reverse=True)
        res_dict['questions'].append({
            'id':
            q['id'],
            'documents': [
                'http://www.ncbi.nlm.nih.gov/pubmed/' + d[0]
                for d in sorted_retr_scores
            ]
        })

    path_bioasq = write_bioasq_results_dict(res_dict, filename)
    trec_eval_metrics = trec_eval_custom(qrels_file, path_bioasq)

    for question in res_dict['questions']:
        question['documents'] = question['documents'][:10]

    path_bioasq = write_bioasq_results_dict(res_dict, filename + '_top10')
    bioasq_metrics = bioasq_eval_custom(path_bioasq, qrels_file)
    precision_at_5 = get_precision_at_k(res_dict, qrels, 5)

    reported_results = {
        'P_5': precision_at_5,
        'MAP(bioasq)': bioasq_metrics['map'],
        'GMAP(bioasq)': bioasq_metrics['gmap']
    }
    return reported_results
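
The chunks helper used above is not shown in the snippet; a common definition (an assumption here, not necessarily the author's) splits an index range into fixed-size batches:

def chunks(seq, n):
    """Yield successive lists of at most n items from seq."""
    seq = list(seq)
    for i in range(0, len(seq), n):
        yield seq[i:i + n]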
Example #6
 def compose(self, composed_words, batch_size):
   outputs = [[] for _ in range(batch_size)]
   exprs = []
   # Batching expression
   for expr_list, batch_num, position, start, end in composed_words:
     self.set_word(self.src_sent[batch_num][start:end])
     expr = self.transduce(expr_list)
     if expr is not None:
       outputs[batch_num].append(expr)
       exprs.append(expr)
   dy.forward(exprs)
   return outputs
Example #7
    def predict_logprobs(self, X, Y):
        """
        Returns the log probabilities of the predictions for this model (batched version).
        Returns a matrix of log probabilities.
        @param X: the input indexes from which to predict
        @param Y: a list of reference indexes for which to extract the prob.
        @return the matrix of predicted log probabilities for each of the provided ref y in Y,
        as a numpy array
        """
        assert (len(X) == len(Y))
        assert (all([len(x) == len(y) for x, y in zip(X, Y)]))

        nlines = len(X)
        X = zip(*X)  #transposes the batch
        Y = zip(*Y)  #transposes the batch
        if self.tied:
            dy.renew_cg()
            state = self.rnn.initial_state()
            E = dy.parameter(self.embedding_matrix)
            preds = []
            lookups = [dy.pick_batch(E, xcolumn) for xcolumn in X]
            outputs = state.transduce(lookups)
            ypred_batch = [
                dy.pickneglogsoftmax_batch(E * lstm_out, y)
                for lstm_out, y in zip(outputs, Y)
            ]
            dy.forward(ypred_batch)
            if nlines > 1:
                preds = [(-col.npvalue()).tolist()[0] for col in ypred_batch]
            else:
                preds = [(-col.npvalue()).tolist() for col in ypred_batch]
            return list(zip(*preds))  #final back transposition
        else:
            dy.renew_cg()
            state = self.rnn.initial_state()
            O = dy.parameter(self.output_weights)
            E = dy.parameter(self.embedding_matrix)
            preds = []
            lookups = [dy.pick_batch(E, xcolumn) for xcolumn in X]
            outputs = state.transduce(lookups)
            ypred_batch = [
                dy.pickneglogsoftmax_batch(O * lstm_out, y)
                for lstm_out, y in zip(outputs, Y)
            ]
            dy.forward(ypred_batch)
            if nlines > 1:
                preds = [(-col.npvalue()).tolist()[0] for col in ypred_batch]
            else:
                preds = [(-col.npvalue()).tolist() for col in ypred_batch]
            return list(zip(*preds))  #final back transposition
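
A minimal sketch of the batched language-model scoring idiom from Example #7, with toy dimensions: transpose the batch to time-major columns, embed each time step with a batched lookup, score the references with pickneglogsoftmax_batch, and evaluate everything with one forward call. All names and sizes here are illustrative assumptions:

import dynet as dy

pc = dy.ParameterCollection()
E = pc.add_lookup_parameters((100, 16))  # vocab of 100, 16-dim embeddings
O = pc.add_parameters((100, 16))         # output projection
rnn = dy.LSTMBuilder(1, 16, 16, pc)

X = [[1, 2, 3], [4, 5, 6]]               # a batch of two input id sequences
Y = [[2, 3, 4], [5, 6, 7]]               # their next-word references

dy.renew_cg()
cols_x = list(zip(*X))                   # transpose to time-major columns
cols_y = list(zip(*Y))
lookups = [dy.lookup_batch(E, list(col)) for col in cols_x]
outputs = rnn.initial_state().transduce(lookups)
nll = [dy.pickneglogsoftmax_batch(dy.parameter(O) * h, list(y))
       for h, y in zip(outputs, cols_y)]
dy.forward(nll)                          # single batched evaluation
per_step = [t.npvalue() for t in nll]    # per-time-step arrays of batch losses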
Example #8
 def calc_nll(self, src_batch, trg_batch) -> dy.Expression:
     self.actions.clear()
     self.outputs.clear()
     event_trigger.start_sent(src_batch)
     batch_loss = []
     # For every item in the batch
     for src, trg in zip(src_batch, trg_batch):
         # Initial state with no read/write actions being taken
         current_state = self._initial_state(src)
         src_len = src.sent_len()
         # Reading + Writing
         src_encoding = []
         loss_exprs = []
         now_action = []
         outputs = []
         # Simultaneous greedy search
         while not self._stoping_criterions_met(current_state, trg):
             # Define action based on state
             action = self.next_action(current_state, src_len,
                                       len(src_encoding))
             if action == self.Action.READ:
                 # Reading + Encoding
                 current_state = current_state.read(src)
                 src_encoding.append(current_state.encoder_state.output())
             else:
                 # Predicting next word
                 current_state = current_state.calc_context(src_encoding)
                 current_output = self.add_input(
                     current_state.prev_written_word, current_state)
                 # Calculating losses
                 ground_truth = self._select_ground_truth(
                     current_state, trg)
                 loss_exprs.append(
                     self.decoder.calc_loss(current_output.state,
                                            ground_truth))
                  # Use word from ref/model depending on settings
                 next_word = self._select_next_word(ground_truth,
                                                    current_output.state)
                 # The produced words
                 outputs.append(next_word)
                 current_state = current_state.write(next_word)
             now_action.append(action.value)
         self.actions.append(now_action)
         self.outputs.append(outputs)
         # Accumulate loss
         batch_loss.append(dy.esum(loss_exprs))
     dy.forward(batch_loss)
     loss = dy.esum(batch_loss)
     return loss if not self.freeze_decoder_param else dy.nobackprop(loss)
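
dy.nobackprop, used in the return statement above, keeps an expression's value but blocks gradients from flowing through it; that is how this snippet freezes the decoder parameters. A tiny illustration with toy values:

import dynet as dy

pc = dy.ParameterCollection()
w = pc.add_parameters((3,))

dy.renew_cg()
loss = dy.dot_product(dy.parameter(w), dy.inputVector([1.0, 2.0, 3.0]))
frozen = dy.nobackprop(loss)          # same value, but no gradient reaches w
print(loss.value(), frozen.value())   # identical values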
Example #9
 def predict(self, graphs):
     self.network.sent_embedding.rnn.disable_dropout()
     for sentence_idx, batch_idx, batch_sentences in split_to_batches(
             graphs, self.options.batch_size):
         sessions = [self.predict_session(sentence)
                     for sentence in batch_sentences]
         all_exprs = [next(i) for i in sessions]
         if all_exprs:
             dn.forward(all_exprs)
         all_labels_exprs = [j for i in sessions for j in next(i)]
         if all_labels_exprs:
             dn.forward(all_labels_exprs)
         for i in sessions:
             yield next(i)
         dn.renew_cg()
Example #10
 def predict(self, trees, return_derivation=False):
     print_logger = PrintLogger()
     for sent_idx, batch_idx, batch_trees in split_to_batches(
             trees, self.options.batch_size):
         sessions = [self.training_session(tree, print_logger)
                     for tree in batch_trees]
         exprs = [expr for session in sessions for expr in next(session)]
         dn.forward(exprs)
         for tree, session in zip(batch_trees, sessions):
             final_beam_item = next(session)
             graph = final_beam_item.sub_graph.graph
             if return_derivation:
                 yield tree.extra["ID"], graph, list(self.construct_derivation(final_beam_item))
             else:
                 yield tree.extra["ID"], graph
         dn.renew_cg()
Example #11
 def train(self, trees):
     print_logger = PrintLogger()
     print_per = (100 // self.options.batch_size + 1) * self.options.batch_size
     for sent_idx, batch_idx, batch_trees in split_to_batches(
             trees, self.options.batch_size):
         if sent_idx % print_per == 0 and sent_idx != 0:
             print_logger.print(sent_idx)
         sessions = [self.training_session(tree, print_logger,
                                           self.derivations[tree.extra["ID"]])
                     for tree in batch_trees]
         exprs = [expr for session in sessions for expr in next(session)]
         dn.forward(exprs)
         loss = sum(next(session) for session in sessions) / len(sessions)
         print_logger.total_loss += loss.value()
         loss.backward()
         self.optimizer.update()
         dn.renew_cg()
Example #12
 def predict(self, sentences):
     self.network.sent_embedding.rnn.disable_dropout()
     pool = Pool(self.options.concurrent_count)
     for sentence_idx, batch_idx, batch_sentences in split_to_batches(
             sentences, self.options.test_batch_size):
         sessions = [self.predict_session(sentence, pool)
                     for sentence in batch_sentences]
         all_exprs = [next(i) for i in sessions]
         if all_exprs:
             dn.forward(all_exprs)
         # spawn decoders
         for i in sessions:
             next(i)
         all_labels_exprs = [j for i in sessions for j in next(i)]
         if all_labels_exprs:
             dn.forward(all_labels_exprs)
         for i in sessions:
             yield next(i)
         dn.renew_cg()
Example #13
    def embed_sent(self, x: Any) -> expression_seqs.ExpressionSequence:
        """Embed a full sentence worth of words. By default, just do a for loop.

    Args:
      x: This will generally be a list of word IDs, but could also be a list of strings or some other format.
         It could also be batched, in which case it will be a (possibly masked) :class:`xnmt.batcher.Batch` object

    Returns:
      An expression sequence representing vectors of each word in the input.
    """
        # single mode
        if not batchers.is_batched(x):
            expr = expression_seqs.ExpressionSequence(
                expr_list=[self.embed(word) for word in x])
        # minibatch mode
        elif type(self) == LookupEmbedder:
            embeddings = []
            for word_i in range(x.sent_len()):
                batch = batchers.mark_as_batch(
                    [single_sent[word_i] for single_sent in x])
                embeddings.append(self.embed(batch))
            expr = expression_seqs.ExpressionSequence(expr_list=embeddings,
                                                      mask=x.mask)
        else:
            assert type(x[0]) == sent.SegmentedSentence, \
                "Need to use CharFromWordTextReader for non-standard embeddings."
            embeddings = []
            all_embeddings = []
            for sentence in x:
                embedding = []
                for i in range(sentence.len_unpadded()):
                    embed_word = self.embed(sentence.words[i])
                    embedding.append(embed_word)
                    all_embeddings.append(embed_word)
                embeddings.append(embedding)
            # Useful when using dy.autobatch
            dy.forward(all_embeddings)
            all_embeddings.clear()
            # Pad the results
            expr = batchers.pad_embedding(embeddings)

        return expr
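
The "Useful when using dy.autobatch" comment above refers to DyNet's lazy autobatching: dy.forward() only merges those per-word embedding operations into batched ones when autobatching is enabled. Besides the --dynet-autobatch 1 command-line flag, it can be enabled programmatically before the first `import dynet`, assuming the dynet_config module shipped with your DyNet build; a sketch:

import dynet_config
dynet_config.set(autobatch=True)  # must run before `import dynet`
import dynet as dy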
Example #14
def evaluate(data, graph, vocab, outputFile):
    start = time.time()
    output = open(outputFile, 'w', encoding='utf-8')
    arc_total_test, arc_correct_test, rel_total_test, rel_correct_test = 0, 0, 0, 0

    for onebatch in data_iter(data, config.test_batch_size, False, False):
        dy.renew_cg()
        batch_arc_probs, batch_rel_probs = [], []
        for words, extwords, tags, heads, rels in sentences_numberize(onebatch, vocab):
            arc_probs, rel_probs = graph.parse(words, extwords, tags)
            batch_arc_probs.append(arc_probs)
            batch_rel_probs.append(rel_probs)

        dy.forward(batch_arc_probs + batch_rel_probs)

        batch_size = len(onebatch)
        for index in range(batch_size):
            seq_len = len(onebatch[index])
            arc_probs = batch_arc_probs[index].npvalue()
            arc_probs = np.transpose(np.reshape(arc_probs, (seq_len, seq_len), 'F'))
            rel_probs = batch_rel_probs[index].npvalue()
            rel_probs = np.transpose(np.reshape(rel_probs, (vocab.rel_size, seq_len, seq_len), 'F'))
            arc_pred = arc_argmax(arc_probs, seq_len)
            rel_probs = rel_probs[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_probs, seq_len, vocab.ROOT)

            tree = append2Tree(arc_pred, rel_pred, vocab, onebatch[index])
            printDepTree(output, tree)
            arc_total, arc_correct, rel_total, rel_correct = evalDepTree(tree, onebatch[index])
            arc_total_test += arc_total
            arc_correct_test += arc_correct
            rel_total_test += rel_total
            rel_correct_test += rel_correct

    output.close()

    uas = arc_correct_test * 100.0 / arc_total_test
    las = rel_correct_test * 100.0 / rel_total_test

    end = time.time()
    print('sentence num: {}, parse time: {:.2f}s'.format(len(data), end - start))

    return arc_correct_test, rel_correct_test, arc_total_test, uas, las
Example #15
 def predict(self, trees):
     self.span_ebd_network.rnn.disable_dropout()
     pool = ThreadPool(self.options.concurrent_count)
     for sentence_idx, batch_idx, batch_trees in split_to_batches(
             trees, len(self.decoders)):
         self.span_ebd_network.init_special()
         sessions = [
             self.predict_session(tree, pool, decoder)
             for tree, decoder in zip(batch_trees, self.decoders)
         ]
         # stage1: generate all expressions and forward
         expressions = [j for i in sessions for j in next(i)]
         dn.forward(expressions)
         # stage2: spawn all decoders
         for session in sessions:
             next(session)
         # stage3: get all results
         for session in sessions:
             yield next(session)
         dn.renew_cg()
Example #16
    def calc_nll(self, src_batch, trg_batch) -> losses.LossExpr:
        event_trigger.start_sent(src_batch)
        self.create_trajectories(src_batch,
                                 trg_batch,
                                 force_oracle=not self._is_action_forced())

        batch_loss = []
        for src, trg, decoder_state in zip(src_batch, trg_batch,
                                           self.decoder_states):
            seq_loss = [
                self.decoder.calc_loss(decoder_state[i], trg[i])
                for i in range(len(decoder_state))
            ]
            batch_loss.append(dy.esum(seq_loss))

        dy.forward(batch_loss)
        total_loss = dy.concatenate_to_batch(batch_loss)
        total_units = [
            trg_batch[i].len_unpadded() for i in range(trg_batch.batch_size())
        ]
        return losses.LossExpr(total_loss, total_units)
Example #17
    def calc_policy_nll(self, src_batch, trg_batch) -> losses.LossExpr:
        assert self.policy_network is not None

        event_trigger.start_sent(src_batch)
        self.create_trajectories(src_batch,
                                 trg_batch,
                                 force_oracle=not self._is_action_forced())

        batch_loss = []
        for src, action, model_states in zip(src_batch, self.actions,
                                             self.model_states):
            policy_actions = model_states[-1].find_backward("policy_action")
            seq_ll = [
                dy.pick(act.log_likelihood, act.content)
                for act in policy_actions
            ]
            batch_loss.append(-dy.esum(seq_ll))

        dy.forward(batch_loss)
        total_loss = dy.concatenate_to_batch(batch_loss)
        total_units = [len(x) for x in self.actions]
        return losses.LossExpr(total_loss, total_units)
Example #18
    def train_gen(self, graphs, update=True, extra=None):
        """
        :type graphs: list[graph_utils.Graph]
        """
        self.logger = PrintLogger()
        self.network.sent_embedding.rnn.set_dropout(self.options.lstm_dropout)
        print_per = (100 // self.options.batch_size + 1) * self.options.batch_size

        if extra is not None:
            for sentence_idx, batch_idx, batch_sentences in split_to_batches(
                    extra, self.options.batch_size):
                if sentence_idx % print_per == 0 and sentence_idx != 0:
                    self.logger.print(sentence_idx)
                sessions = [self.training_session(sentence, self.logger, loose_var=self.options.loose)
                            for sentence in batch_sentences]
                all_exprs = [next(i) for i in sessions]
                if all_exprs:
                    dn.forward(all_exprs)
                all_labels_exprs = [j for i in sessions for j in next(i)]
                if all_labels_exprs:
                    dn.forward(all_labels_exprs)
                loss = sum(next(i) for i in sessions) / len(sessions)
                self.logger.total_loss_value += loss.value()
                if update:
                    loss.backward()
                    self.trainer.update()
                    dn.renew_cg()
                    sessions.clear()

        for sentence_idx, batch_idx, batch_sentences in split_to_batches(
                graphs, self.options.batch_size):
            if sentence_idx % print_per == 0 and sentence_idx != 0:
                self.logger.print(sentence_idx)
            sessions = [self.training_session(sentence, self.logger)
                        for sentence in batch_sentences]
            all_exprs = [next(i) for i in sessions]
            if all_exprs:
                dn.forward(all_exprs)
            all_labels_exprs = [j for i in sessions for j in next(i)]
            if all_labels_exprs:
                dn.forward(all_labels_exprs)
            loss = sum(next(i) for i in sessions) / len(sessions)
            self.logger.total_loss_value += loss.value()
            if update:
                loss.backward()
                self.trainer.update()
                dn.renew_cg()
                sessions.clear()
            yield (loss if not update else None)
Example #19
 def compose(self, composed_words, sample_size, batch_size):
     batches = []
     batch_maps = []
     batch_words = []
     seq_len = np.zeros((sample_size, batch_size), dtype=int)
      # sort by span length so equal-length spans can be batched together
      composed_words = sorted(composed_words, key=lambda x: x[5] - x[4])
     # Batching expression
     now_length = -1
     for expr_list, sample_num, batch_num, position, start, end in composed_words:
         length = end - start
         if length != now_length:
             now_length = length
             now_map = {}
             now_batch = []
             now_words = []
             now_idx = 0
             batches.append(now_batch)
             batch_maps.append(now_map)
             batch_words.append(now_words)
         now_batch.append(expr_list)
         now_words.append(self.src_sent[batch_num][start:end])
         now_map[now_idx] = (sample_num, batch_num, position)
         seq_len[sample_num, batch_num] += 1
         now_idx += 1
     # Composing
     outputs = [[[None for _ in range(seq_len[i, j])]
                 for j in range(batch_size)] for i in range(sample_size)]
     expr_list = []
     for batch, batch_map, batch_word in zip(batches, batch_maps,
                                             batch_words):
         self.set_words(batch_word)
         results = self.transduce(dy.concatenate_to_batch(batch))
          results.value()  # force evaluation of the batched transduction
         for idx, (sample_num, batch_num, position) in batch_map.items():
             expr_list.append(dy.pick_batch_elem(results, idx))
             outputs[sample_num][batch_num][position] = expr_list[-1]
     dy.forward(expr_list)
     return outputs
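
A toy illustration of the batch/unbatch pair at the heart of Example #19: dy.concatenate_to_batch merges same-shaped expressions into a single batched expression, and dy.pick_batch_elem recovers each element afterwards. Shapes here are illustrative assumptions:

import dynet as dy

dy.renew_cg()
vecs = [dy.inputVector([float(i)] * 4) for i in range(3)]
batched = dy.concatenate_to_batch(vecs)  # one (4,) expression with batch size 3
doubled = batched * 2
parts = [dy.pick_batch_elem(doubled, i) for i in range(3)]
dy.forward(parts)
values = [p.npvalue() for p in parts]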
Example #20
    def predict_logprobs(self, X, Y, hidden_out=False):
        """
        Returns the log probabilities of the predictions for this model (batched version).

        @param X: the input indexes from which to predict (each xdatum is expected to be an iterable of integers)
        @param Y: a list of reference indexes for which to extract the prob.
        @param hidden_out: outputs an additional list of hidden-dimension vectors
        @return the list of predicted log probabilities for each of the provided ref y in Y
        """
        #TODO: implement the hidden_out output
        assert (len(X) == len(Y))
        assert (all(len(x) == self.input_length for x in X))

        preds = []

        if self.tied:
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.embedding_matrix)
            for x, y in zip(X, Y):
                embeddings = [dy.pick(E, widx) for widx in x]
                xdense = dy.concatenate(embeddings)
                ypred = dy.pickneglogsoftmax(E * dy.tanh(W * xdense), y)
                preds.append(ypred)
            dy.forward(preds)
            return [-ypred.value() for ypred in preds]
        else:
            dy.renew_cg()
            O = dy.parameter(self.output_weights)
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.embedding_matrix)
            for x, y in zip(X, Y):
                embeddings = [dy.pick(E, widx) for widx in x]
                xdense = dy.concatenate(embeddings)
                ypred = dy.pickneglogsoftmax(O * dy.tanh(W * xdense), y)
                preds.append(ypred)
            dy.forward(preds)
            return [-ypred.value() for ypred in preds]
Example #21
    def train(self, trees, data_train=None):
        print_logger = PrintLogger()
        print_per = (100 // self.options.batch_size +
                     1) * self.options.batch_size
        self.sent_embeddings.rnn.set_dropout(self.options.lstm_dropout)
        for sentence_idx, batch_idx, batch_trees in split_to_batches(
                trees, self.options.batch_size):
            if sentence_idx % print_per == 0 and sentence_idx != 0:
                print_logger.print(sentence_idx)
            sessions = [
                self.training_session(tree, print_logger)
                for tree in batch_trees
            ]
            exprs = [i for session in sessions for i in next(session)]
            dn.forward(exprs)
            loss = dn.esum([next(session)
                            for session in sessions]) / len(sessions)

            # update
            print_logger.total_loss += loss.scalar_value()
            loss.backward()
            self.optimizer.update()
            dn.renew_cg()
Example #22
    def train(self, sentences):
        self.span_ebd_network.rnn.set_dropout(self.options.lstm_dropout)
        self.span_ebd_network.init_special()
        pool = ThreadPool(self.options.concurrent_count)
        print_logger = PrintLogger()
        print_per = (100 // self.options.batch_size +
                     1) * self.options.batch_size

        for sentence_idx, batch_idx, batch_trees in split_to_batches(
                sentences, self.options.batch_size):
            if sentence_idx != 0 and sentence_idx % print_per == 0:
                print_logger.print(sentence_idx)

            self.span_ebd_network.init_special()
            sessions = [
                self.train_session(tree, print_logger, pool, decoder)
                for tree, decoder in zip(batch_trees, self.decoders)
            ]

            batch_size_2 = int(math.ceil(len(sessions) / 2) + 0.5)
            assert batch_size_2 * 2 >= len(sessions)
            for _, _, sub_sessions in split_to_batches(sessions, batch_size_2):
                # stage1: generate all expressions and forward
                expressions = [j for i in sub_sessions for j in next(i)]
                dn.forward(expressions)
                # stage2: spawn all decoders
                for session in sub_sessions:
                    next(session)
            # stage3: get all losses
            loss = dn.esum([next(session) for session in sessions])
            loss /= len(sessions)
            print_logger.total_loss += loss.value()
            loss.backward()
            self.optimizer.update()
            dn.renew_cg()
            self.span_ebd_network.init_special()
Example #23
                confusion = [[0 for _ in xrange(10)] for _ in xrange(10)]
                correct = 0
                dev_start = time.time()
                for s in range(0, len(testing), args.minibatch_size):
                    dy.renew_cg()
                    classify.renew_cg()
                    e = min(len(testing), s + args.minibatch_size)
                    minibatch = testing[s:e]
                    scores = []
                    for lbl, img in minibatch:
                        x = dy.inputVector(img)
                        logits = classify(x)
                        scores.append((lbl, logits))

                    # This evaluates all the logits in a batch if autobatching is on.
                    dy.forward([logits for _, logits in scores])

                    # now we can retrieve the batch-computed logits cheaply
                    for lbl, logits in scores:
                        prediction = np.argmax(logits.npvalue())
                        if lbl == prediction:
                            correct += 1
                        confusion[prediction][lbl] += 1
                dev_end = time.time()
                acc = float(correct) / len(testing)
                dev_time += dev_end - dev_start
                print("Held out accuracy {} ({} instances/sec)".format(
                    acc,
                    len(testing) / (dev_end - dev_start)))
                print '   ' + ''.join(
                    ('T' + str(x)).ljust(6) for x in xrange(10))
            print 'epoch ' + str(
                epoch) + ' batch ' + str(i_batch) + ' loss ' + str(
                    batch_loss.npvalue())  # this calls forward on the batch
        batch_loss.backward()
        trainer.update()

Example #24
training_duration = datetime.datetime.now() - pretrain_time
print 'Done! training took: ' + str(training_duration) + '\n'

print "\n\nPrediction time!\n"
# prediction code:
correct = 0
for batch in test_batches:
    dy.renew_cg()  # new computation graph
    batch_preds = []
    for sequence, label in batch:
        vecs = [embeds[char2int[i]] for i in sequence]
        preds = dy.softmax(acceptor(vecs))
        batch_preds.append((preds, label))  # keep each label with its prediction

    # now that we accumulated the prediction expressions,
    # we run forward on all of them:
    dy.forward([preds for preds, _ in batch_preds])
    # and now we can efficiently access the individual values:
    for preds, label in batch_preds:
        vals = preds.npvalue()
        if np.argmax(vals) == label:
            correct += 1

print 'accuracy: ' + str(float(correct) / len(test))
Example #25
def evaluate(file,
             char_acceptor,
             word_acceptor,
             char_embed,
             word_embed,
             crf_acceptor=None):
    '''evaluate performance of model on file
    @param file: string, path to test/dev file
    @param char_acceptor: CharBiLSTMAcceptor
    @param word_acceptor: WordBiLSTMAcceptor
    @param char_embed: lookup parameter
    @param word_embed: lookup parameter
    @param crf_acceptor: CRFAcceptor, default=None
    @return: float, float: accuracy, f1 score
    '''
    # extract evaluating samples
    eval_samples = list(readSample(file))
    # counts are used to calculate acc, f1
    count_tag = 0
    count_correct_tag = 0
    count_gold_chunk = 0
    count_pred_chunk = 0
    count_correct_chunk = 0

    num_batch = len(list(genBatch(eval_samples)))
    showpro = ShowProcess(num_batch, "evaluating done.")

    for batch in genBatch(eval_samples):  # for each batch
        dy.renew_cg()
        logitss = []  # shape=(#sentence,#word), elem_type=expression
        goldss = []  # gold tags, shape=(#sentence,#word), elem_type=int
        for wids, tids, cidss in batch:  # for each sentence
            # feed into biLSTMs and get logits
            wembeds1 = [word_embed[wid] for wid in wids]
            wembeds2 = [
                char_acceptor([char_embed[cid] for cid in cids])
                for cids in cidss
            ]
            wembeds = [
                dy.concatenate([embed1, embed2])
                for embed1, embed2 in zip(wembeds1, wembeds2)
            ]
            logitss.append(word_acceptor(wembeds))
            # record gold tags
            goldss.append(tids)
        # in order to use dy.forward, flatten logitss into a list of expressions
        logits_flt = []
        for logits in logitss:
            logits_flt.extend(logits)
        # do forward
        # note: no backward or update here in the evaluation stage
        dy.forward(logits_flt)
        # use logits to predict
        # predss: shape=(#sentence,#word), elem_type=int
        if crf_acceptor is None:  # do not use crf
            predss = [[np.argmax(vec.npvalue()) for vec in vecs] \
                     for vecs in logitss]
        else:  # use crf
            predss = [crf_acceptor.predict(vecs) for vecs in logitss]
        # update counts
        for golds, preds in zip(goldss, predss):  # for each sentence
            gold_chunks = get_chunks(golds)
            pred_chunks = get_chunks(preds)
            count_gold_chunk += len(gold_chunks)
            count_pred_chunk += len(pred_chunks)
            count_correct_chunk += len(gold_chunks & pred_chunks)
            count_tag += len(golds)
            golds_np = np.asarray(golds, dtype=np.int8)
            preds_np = np.asarray(preds, dtype=np.int8)
            correct = golds_np == preds_np
            count_correct_tag += np.sum(correct)

        showpro()

    # calculate accuracy
    acc = float(count_correct_tag) / count_tag
    # calculate f1 score
    p = float(count_correct_chunk) / count_pred_chunk
    r = float(count_correct_chunk) / count_gold_chunk
    f1 = 2 * p * r / (p + r) if p + r > 0 else 0.0

    return acc, f1
Example #26
    def train(self, trees):
        if isinstance(self.scorer_network, CountBasedHRGScorer):
            logger.info("No need to train a count-based scorer.")
            return

        total_count = sys.float_info.epsilon
        correct_count = 0
        pending = []
        total_loss = 0
        for tree_idx, tree in enumerate(trees):
            # if tree_idx == 5000:
            #     return
            if (tree_idx + 1) % 100 == 0:
                logger.info(
                    "Sent {}, Correctness: {:.2f}, loss: {:.2f}".format(
                        tree_idx + 1, correct_count / total_count * 100,
                        total_loss))
                total_count = sys.float_info.epsilon
                correct_count = 0
                total_loss = 0
            sent_id = tree.extra["ID"]
            derivations = self.derivations[sent_id]  # type: List[CFGRule]

            sentence_interface = tree.to_sentence()
            self.span_ebd_network.init_special()
            span_features = self.span_ebd_network.get_span_features(
                sentence_interface)

            cfg_nodes = list(tree.generate_rules())  # type: List[ConstTree]
            assert len(derivations) == len(cfg_nodes)

            for gold_rule, tree_node in zip(derivations, cfg_nodes):
                if tree_node.tag.endswith("#0"):
                    continue
                try:
                    correspondents = set(
                        self.rule_lookup(tree_node, True).items())
                except ValueError as e:
                    print(e)
                    continue

                # print(span_features[tree_node.span].npvalue().shape)
                pending.append((self.scorer_network.get_best_rule(
                    span_features[tree_node.span], correspondents,
                    gold_rule), gold_rule))

            if tree_idx % self.options.batch_size == 0 or tree_idx == len(
                    trees) - 1:
                # generate expressions
                exprs = []
                for item, gold_rule in pending:
                    exprs.extend(next(item))

                # do batch calculation
                if exprs:
                    dn.forward(exprs)

                # calculate loss
                loss = dn.scalarInput(0.0)
                for item, gold_rule in pending:
                    best_rule, this_loss, real_best_rule = next(item)
                    if this_loss is not None:
                        total_count += 1
                        loss += this_loss
                        if real_best_rule == gold_rule:
                            correct_count += 1
                loss.forward()
                total_loss += loss.scalar_value()
                loss.backward()
                self.optimizer.update()
                dn.renew_cg()
                pending = []
Example #27
      if (i > 0) and (i % dev_report == 0):
        confusion = [[0 for _ in range(10)] for _ in range(10)]
        correct = 0
        dev_start = time.time()
        for s in range(0, len(testing), args.minibatch_size):
          dy.renew_cg()
          e = min(len(testing), s + args.minibatch_size)
          minibatch = testing[s:e]
          scores = []
          for lbl, img in minibatch:
            x = dy.inputVector(img)
            logits = classify(x)
            scores.append((lbl, logits))

          # This evaluates all the logits in a batch if autobatching is on.
          dy.forward([logits for _, logits in scores])

          # now we can retrieve the batch-computed logits cheaply
          for lbl, logits in scores:
            prediction = np.argmax(logits.npvalue())
            if lbl == prediction:
              correct += 1
            confusion[prediction][lbl] += 1
        dev_end = time.time()
        acc = float(correct) / len(testing)
        dev_time += dev_end - dev_start
        print(("Held out accuracy {} ({} instances/sec)".format(
            acc, len(testing) / (dev_end - dev_start))))
        print('   ' + ''.join(('T'+str(x)).ljust(6) for x in range(10)))
        for p, row in enumerate(confusion):
          s = 'P' + str(p) + ' '