Code example #1
    def sentence_break(self):
        # Flush the buffered word texts and tags into a new Sentence.
        if len(self.texts) == 0:
            return
        if self.config.iobes:
            # Convert IOB tags to IOBES if configured to do so.
            self.tags = iob_to_iobes(self.tags)
        tokens = [Token(t, g) for t, g in zip(self.texts, self.tags)]
        self.document.add_child(Sentence(tokens=tokens))
        self.texts = []
        self.tags = []
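
For context, a sketch of how a loader might drive sentence_break, assuming the surrounding class buffers one word and tag per input line. The read method and the two-column tab-separated format below are assumptions for illustration, not part of the original code:

    def read(self, path):
        # Hypothetical caller: read a two-column (word, tag) file and close a
        # sentence at every blank line using sentence_break() above.
        with open(path) as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    self.sentence_break()
                else:
                    text, tag = line.split('\t')
                    self.texts.append(text)
                    self.tags.append(tag)
        self.sentence_break()  # flush a final sentence with no trailing blank line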
Code example #2
def make_document(token_texts, label):
    """Return Document object initialized with given token texts."""
    tokens = [Token(t) for t in token_texts]
    # We don't have sentence splitting, but the data structure expects
    # Documents to contain Sentences which in turn contain Tokens.
    # Create a dummy sentence containing all document tokens to work
    # around this constraint.
    sentences = [Sentence(tokens=tokens)]
    return Document(target_str=label, sentences=sentences)
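
A hypothetical call, assuming Token, Sentence, and Document come from the same data module as the snippet above; the token texts and the label value are illustrative only:

doc = make_document(['Alice', 'visited', 'Berlin', '.'], label='travel')
# doc.sentences[0] is the single dummy Sentence wrapping all four Tokens.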
Code example #3
def sent2tokens(sent, embeddings=None):
    words = sent2words(sent)
    postags = sent2postags(sent)
    word_vectors = sent2embeddings(sent, embeddings)
    # chunk tags, dependency features or embedding features can also be added
    tokens = [
        Token(surface=w, pos=postag, embedding=emb)
        for (w, postag, emb) in zip(words, postags, word_vectors)
    ]
    for t in tokens:
        # generate features
        t.add(token_features(token=t))
    return tokens
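
token_features is not defined in this snippet; a minimal sketch of what such a surface-feature extractor could return. The function body and the feature set below are assumptions, not the original implementation:

def token_features(token):
    # Common surface-form features for sequence labeling.
    w = token.surface
    return {
        'lower': w.lower(),
        'is_upper': w.isupper(),
        'is_title': w.istitle(),
        'is_digit': w.isdigit(),
        'prefix3': w[:3],
        'suffix3': w[-3:],
        'pos': token.pos,
    }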
Code example #4
def main(argv):
    # test example generation
    from data import Token, ConllLoader, load_labels
    from label import Iob2TokenLabeler, LabelEncoder
    from transformers import AutoConfig, AutoTokenizer

    options = argparser().parse_args(argv[1:])
    seq_len = options.max_seq_length

    word_labels = load_labels(options.labels)
    token_labeler = Iob2TokenLabeler(word_labels)  # TODO add argument
    token_labels = token_labeler.labels()
    label_func = token_labeler.label_tokens
    label_encoder = LabelEncoder(token_labels, padding_label='O')  # TODO
    encode_labels = label_encoder.encode

    config = AutoConfig.from_pretrained(options.model_name,
                                        cache_dir=options.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(options.model_name,
                                              config=config,
                                              cache_dir=options.cache_dir)
    tokenize_func = tokenizer.tokenize
    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenize_func, label_func)
    example_generator = WrapSentenceExampleGenerator(
        seq_len, Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens, encode_labels)

    for fn in options.conll_data:
        documents = list(document_loader.load(fn))
        examples = list(example_generator.examples(documents))
        for i, example in enumerate(examples):
            print(f'example {i}')
            print(example)
Code example #5
    def loadSrc(self):
        corpus = self.corpus
        src = self.src

        tokens = []
        id = 0
        with open(src, 'r') as fin:
            for line in fin:
                if line == '\n':
                    # Blank line: close the current sentence. The input is
                    # assumed to end with a blank line; otherwise the final
                    # sentence would be dropped.
                    tmp_tokens = list(tokens)
                    le = len(tmp_tokens)
                    for i, token in enumerate(tmp_tokens):
                        h_id = token.h_id
                        h_rel = token.rel

                        # Link each token to its linear neighbours.
                        if i < le - 1:
                            tmp_tokens[i].add_d_id_rel(i + 1, '@+1@')
                        if i > 0:
                            tmp_tokens[i].add_u_id_rel(i - 1, '@-1@')

                        # Link the head down to this dependent (unless it is the root).
                        if h_id != -1:
                            tmp_tokens[h_id].add_d_id_rel(i, h_rel)

                    sent = Sentence(tmp_tokens)
                    corpus.append(sent)
                    tokens = []
                    id = 0
                else:
                    # Columns: surface form, head index, dependency relation, label.
                    items = line.strip().split()
                    t_str = items[0]
                    h_id = int(items[1])
                    rel = items[2]
                    label = items[3]
                    token = Token(id, t_str, h_id, rel, label)
                    tokens.append(token)
                    id += 1
        return corpus
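
For reference, loadSrc expects whitespace-separated columns per token (surface form, 0-based head index with -1 for the root, dependency relation, label), with sentences separated by a blank line. An illustrative fragment; the tokens, heads, and labels here are made up:

Pierre   1   compound   B-PER
Vinken   2   nsubj      I-PER
joined   -1  root       O
Boeing   2   obj        B-ORG
.        2   punct      O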
Code example #6
def main(argv):
    options = argparser().parse_args(argv[1:])
    logger.info(f'train.py arguments: {options}')

    # word_labels are the labels assigned to words in the original
    # data, token_labeler.labels() the labels assigned to tokens in
    # the tokenized data. The two are differentiated to allow distinct
    # labels to be added e.g. to continuation wordpieces.
    word_labels = load_labels(options.labels)
    token_labeler = IobesTokenLabeler(word_labels)
    num_labels = len(token_labeler.labels())
    label_encoder = LabelEncoder(token_labeler.labels())
    logger.info(f'token labels: {token_labeler.labels()}')

    logger.info('loading pretrained model')
    pretrained_model, tokenizer, config = load_pretrained(
        options.model_name, cache_dir=options.cache_dir)
    logger.info('pretrained model config:')
    logger.info(config)

    if options.max_seq_length > config.max_position_embeddings:
        raise ValueError(
            f'--max_seq_length {options.max_seq_length} exceeds the model '
            f'maximum of {config.max_position_embeddings}')
    seq_len = options.max_seq_length

    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenizer.tokenize,
                                  token_labeler.label_tokens,
                                  options.separator)

    example_generator = EXAMPLE_GENERATORS[options.examples](
        seq_len, Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True,
              masked=True), encode_tokens, label_encoder.encode)

    train_documents = document_loader.load(options.train_data)
    dev_documents = document_loader.load(options.dev_data)
    # containers instead of generators for statistics
    train_documents = list(train_documents)
    dev_documents = list(dev_documents)
    log_dataset_statistics('train', train_documents)
    log_dataset_statistics('dev', dev_documents)

    decoder = ViterbiDecoder(label_encoder.label_map)
    decoder.estimate_probabilities(train_documents)
    logger.info(f'init_prob:\n{decoder.init_prob}')
    logger.info(f'trans_prob:\n{decoder.trans_prob}')

    train_examples = example_generator.examples(train_documents)
    dev_examples = example_generator.examples(dev_documents)
    # containers instead of generators for len() and logging
    train_examples = list(train_examples)
    dev_examples = list(dev_examples)
    num_train_examples = len(train_examples)
    log_examples(train_examples, count=2)

    train_x, train_y = examples_to_inputs(train_examples)
    dev_x, dev_y = examples_to_inputs(dev_examples)

    ner_model = build_ner_model(pretrained_model, num_labels, seq_len)

    optimizer, lr_schedule = get_optimizer(
        options.lr,
        options.num_train_epochs,
        options.batch_size,
        options.warmup_proportion,
        num_train_examples,
    )

    ner_model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',  # TODO is this necessary?
        metrics=['sparse_categorical_accuracy'])
    logger.info('ner model:')
    ner_model.summary(print_fn=logger.info)

    lr_history = LRHistory(lr_schedule)
    history = ner_model.fit(train_x,
                            train_y,
                            epochs=options.num_train_epochs,
                            batch_size=options.batch_size,
                            validation_data=(dev_x, dev_y),
                            callbacks=[lr_history])
    for k, v in history.history.items():
        logger.info(f'{k} history: {v}')
    logger.info(f'lr history: {lr_history.by_epoch}')

    dev_predictions = ner_model.predict(dev_x,
                                        verbose=1,
                                        batch_size=options.batch_size)
    assert len(dev_examples) == len(dev_predictions)
    for example, preds in zip(dev_examples, dev_predictions):
        assert len(example.tokens) == len(preds)
        for pos, (token, pred) in enumerate(zip(example.tokens, preds)):
            token.predictions.append((pos, pred))

    documents = unique(t.document for e in dev_examples for t in e.tokens
                       if not t.is_special)
    check_predictions(documents)

    for n, r in evaluate_assign_labels_funcs(documents, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    summarize_predictions = PREDICTION_SUMMARIZERS[options.summarize_preds]
    assign_labels = LABEL_ASSIGNERS[options.assign_labels]
    for document in documents:
        summarize_predictions(document)
        assign_labels(document, label_encoder)

    for n, r in evaluate_viterbi(documents, decoder.init_prob,
                                 decoder.trans_prob, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    for document in documents:
        assign_labels(document, label_encoder)  # greedy

    print(conlleval_report(documents))

    if options.output_file is not None:
        with open(options.output_file, 'w') as out:
            write_conll(documents, out=out)

    if options.ner_model_dir is not None:
        save_ner_model(options.ner_model_dir, ner_model, decoder, tokenizer,
                       word_labels, config)

    return 0
Code example #7
def main(argv):
    options = argparser().parse_args(argv[1:])

    ner_model, decoder, tokenizer, word_labels, config = load_ner_model(
        options.ner_model_dir)

    token_labeler = IobesTokenLabeler(word_labels)
    label_encoder = LabelEncoder(token_labeler.labels())

    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(
        tokenizer.tokenize,
        token_labeler.label_tokens,
        options.separator,
        #test=True
    )

    example_generator = 'wrap'  # TODO read from config
    seq_len = 512  # TODO read from config
    example_generator = EXAMPLE_GENERATORS[example_generator](
        seq_len, Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True,
              masked=True), encode_tokens, label_encoder.encode)

    test_documents = document_loader.load(options.data)
    test_examples = example_generator.examples(test_documents)
    test_examples = list(test_examples)  # TODO stream
    test_x, test_y = examples_to_inputs(test_examples)

    test_predictions = ner_model.predict(test_x)
    for example, preds in zip(test_examples, test_predictions):
        assert len(example.tokens) == len(preds)
        for pos, (token, pred) in enumerate(zip(example.tokens, preds)):
            token.predictions.append((pos, pred))

    documents = unique(t.document for e in test_examples for t in e.tokens
                       if not t.is_special)

    summarize_preds = 'avg'  # TODO read from config
    assign_labels = 'first'  # TODO read from config
    summarize_predictions = PREDICTION_SUMMARIZERS[summarize_preds]
    assign_labels = LABEL_ASSIGNERS[assign_labels]

    for document in documents:
        summarize_predictions(document)
        assign_labels(document, label_encoder)

    with open('greedy.tsv', 'w') as out:
        write_conll(documents, out=out)

    print(conlleval_report(documents))

    for document in documents:
        for sentence in document.sentences:
            tokens = [t for w in sentence.words for t in w.tokens]
            cond_prob = [t.pred_summary for t in tokens]
            path = decoder.viterbi_path(cond_prob, weight=4)
            assert len(path) == len(tokens)
            for idx, token in zip(path, tokens):
                label = label_encoder.inv_label_map[idx]
                label = iobes_to_iob2(label)
                token.viterbi_label = label
            for word in sentence.words:
                word.predicted_label = word.tokens[0].viterbi_label

    with open('viterbi.tsv', 'w') as out:
        write_conll(documents, out=out)

    print(conlleval_report(documents))
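
The ViterbiDecoder internals are not shown in these snippets. As a generic illustration of decoding per-token class probabilities against initial and transition distributions, a standard log-space Viterbi could look like the sketch below; this is not the project's implementation, and it ignores the weight argument used above:

import numpy as np

def viterbi_path(cond_prob, init_prob, trans_prob, eps=1e-12):
    """Return the most likely label index sequence for one sentence.

    cond_prob:  (T, L) per-token class probabilities from the model
    init_prob:  (L,)   probability of each label starting a sentence
    trans_prob: (L, L) probability of label j following label i
    """
    emit = np.log(np.asarray(cond_prob) + eps)
    init = np.log(np.asarray(init_prob) + eps)
    trans = np.log(np.asarray(trans_prob) + eps)
    T, L = emit.shape
    score = np.empty((T, L))
    back = np.zeros((T, L), dtype=int)
    score[0] = init + emit[0]
    for t in range(1, T):
        cand = score[t - 1][:, None] + trans   # cand[i, j]: prev label i -> label j
        back[t] = cand.argmax(axis=0)
        score[t] = cand.max(axis=0) + emit[t]
    # Backtrace from the best final label.
    path = [int(score[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return path[::-1]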