Example #1
# Chainer imports used below; the remaining helpers (parse_config,
# load_sentences, update_tag_scheme, load_mappings, prepare_dataset,
# Model, predict, evaluate) come from the surrounding project.
from chainer import cuda, iterators, serializers

def main(config_path):
    args = parse_config(config_path)

    # Load sentences
    test_sentences = load_sentences(args["path_test"], args["replace_digit"])

    # Update tagging scheme (IOB/IOBES)
    update_tag_scheme(test_sentences, args["tag_scheme"])

    # Load mappings from disk
    id_to_word, id_to_char, id_to_tag = load_mappings(args["mappings_path"])
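    # Invert the id -> token mappings so raw tokens and tags can be indexed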
    word_to_id = {v: k for k, v in id_to_word.items()}
    char_to_id = {v: k for k, v in id_to_char.items()}
    tag_to_id = {v: k for k, v in id_to_tag.items()}

    # Index data
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                tag_to_id, None, args["lowercase"])
    test_iter = iterators.SerialIterator(test_data,
                                         args["batch_size"],
                                         repeat=False,
                                         shuffle=False)

    model = Model(len(word_to_id), len(char_to_id), len(tag_to_id), args)

    serializers.load_npz(args['path_model'], model)

    model.id_to_tag = id_to_tag
    model.parameters = args

    device = args['gpus']
    if device['main'] >= 0:
        cuda.get_device_from_id(device['main']).use()
        model.to_gpu()

    pred_tags = []
    gold_tags = []
    words = []

    # Collect predictions
    for ts, ys, xs in predict(test_iter, model, args['mode']):
        gold_tags.extend(ts)
        pred_tags.extend(ys)
        words.extend(xs)

    evaluate(model, pred_tags, gold_tags, words)
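A minimal, hypothetical entry point for this driver (the --config flag name is an assumption, not part of the original project):

# Sketch only: invokes main() with a config path taken from the command line.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run NER test-set evaluation")
    parser.add_argument("--config", required=True, help="path to the config file")
    cli = parser.parse_args()
    main(cli.config)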
Example #2
    # Requires module-level imports: import copy; import chainer.functions as F;
    # from chainer import reporter
    def evaluate(self):
        iterator = self._iterators['main']
        target = self._targets['main']

        it = copy.copy(iterator)
        summary = reporter.DictSummary()
        ys_final, ts_final, raw_xs = [], [], []

        for batch in it:
            # Read batch data and sort sentences in descending order for CRF layer
            observation = {}

            raw_words = [x['str_words'] for x in batch]
            words = [self.xp.array(x['words']).astype('i') for x in batch]
            chars = [
                self.xp.array(y, dtype=self.xp.int32) for x in batch
                for y in x['chars']
            ]
            tags = self.xp.vstack(
                [self.xp.array(x['tags']).astype('i') for x in batch])

            # Init index to keep track of words
            index_start = self.xp.arange(F.hstack(words).shape[0])
            index_end = index_start + 1
            index = self.xp.column_stack((index_start, index_end))

            # Nest level + 1
            max_depth = len(batch[0]['tags'][0])
            sentence_len = self.xp.array([x.shape[0] for x in words])
            section = self.xp.cumsum(sentence_len[:-1])

            # Init
            predicts_depths = self.xp.empty(
                (0, self.xp.sum(sentence_len))).astype('i')

            with reporter.report_scope(observation):
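                # Decode one nesting depth per loop iteration; metrics the
                # target reports during this scope accumulate in `observation`.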
                for depth in range(max_depth):
                    accuracy, loss, has_next, index, extend_predicts, words, chars = target(
                        chars, words, tags[:, depth], index, False)
                    predicts_depths = self.xp.vstack(
                        (predicts_depths, extend_predicts))

                    if not has_next:
                        break

            summary.add(observation)
            predicts_depths = self.xp.split(predicts_depths, section, axis=1)
            ts_depths = self.xp.split(self.xp.transpose(tags), section, axis=1)
            ys_final.extend(predicts_depths)
            ts_final.extend(ts_depths)
            raw_xs.extend(raw_words)

        fmeasure = summary.compute_mean()

        fmeasure['dev/main/fscore'] = evaluate(target, ys_final, ts_final,
                                               raw_xs)

        return fmeasure
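This method reads like the evaluate() hook of a custom chainer.training.extensions.Evaluator subclass: it pulls the registered iterator and target from self._iterators['main'] and self._targets['main'] and copies the iterator before consuming it, just as the base Evaluator does. A hedged sketch of attaching such an evaluator to a training loop, where CustomEvaluator, dev_iter, model, and updater are assumed names:

# Sketch only: CustomEvaluator is assumed to be the subclass defining the
# evaluate() above; dev_iter, model, and updater are placeholders.
from chainer import training

trainer = training.Trainer(updater, (10, 'epoch'), out='result')
trainer.extend(CustomEvaluator(dev_iter, model), trigger=(1, 'epoch'))
trainer.run()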
Example #3
from chainer import cuda, iterators, serializers  # as in Example #1

def main(config_path):
    args = parse_config(config_path)

    # Load sentences
    test_sentences = load_sentences(args["path_test"], args["replace_digit"])

    # Update tagging scheme (IOB/IOBES)
    update_tag_scheme(test_sentences, args["tag_scheme"])

    # Load mappings from disk
    id_to_word, id_to_char, id_to_tag = load_mappings(args["mappings_path"])
    word_to_id = {v: k for k, v in id_to_word.items()}
    char_to_id = {v: k for k, v in id_to_char.items()}
    tag_to_id = {v: k for k, v in id_to_tag.items()}

    # Index data
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                tag_to_id, None, args["lowercase"])
    test_iter = iterators.SerialIterator(test_data,
                                         args["batch_size"],
                                         repeat=False,
                                         shuffle=False)

    model = Model(len(word_to_id), len(char_to_id), len(tag_to_id), args)

    serializers.load_npz(args['path_model'], model)

    model.id_to_tag = id_to_tag
    model.parameters = args

    device = args['gpus']
    if device['main'] >= 0:
        cuda.get_device_from_id(device['main']).use()
        model.to_gpu()

    pred_tags = []
    gold_tags = []
    words = []

    # Collect predictions
    out = open(args['predictions_path'], "w", encoding="utf-8")

    all_true = {}
    all_pred = {}
    idx = 0
    for ts, ys, xs in predict(test_iter, model, args['mode']):
        gold_tags.extend(ts)
        pred_tags.extend(ys)
        words.extend(xs)

        # For each sentence in the batch
        for i in range(len(xs)):
            true_entities = get_entities(xs[i], ts[i], id_to_tag)
            pred_entities = get_entities(xs[i], ys[i], id_to_tag)

            true_str = "|".join(
                "%s %s %s" % (v[1], v[2], v[3]) for v in true_entities)
            pred_str = "|".join(
                "%s %s %s" % (v[1], v[2], v[3]) for v in pred_entities)
            out.write("%s\t%s\n" % (true_str, pred_str))
            for sid, start, end, label in true_entities:
                all_true[(idx, sid, start, end, label)] = 1
            for sid, start, end, label in pred_entities:
                all_pred[(idx, sid, start, end, label)] = 1

            idx += 1

    out.close()

    calc_f(all_pred, all_true)
    evaluate(model, pred_tags, gold_tags, words)
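calc_f is project-specific; as a point of reference, here is a minimal sketch of an entity-level precision/recall/F1 over the all_pred/all_true dicts built above (the real implementation may differ):

# Sketch only: entities are keyed by (idx, sid, start, end, label), so set
# intersection counts exact-match true positives.
def calc_f_sketch(all_pred, all_true):
    tp = len(set(all_pred) & set(all_true))
    precision = tp / len(all_pred) if all_pred else 0.0
    recall = tp / len(all_true) if all_true else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("P=%.4f R=%.4f F1=%.4f" % (precision, recall, f1))
    return f1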