def sentence_break(self):
    if len(self.texts) == 0:
        return
    if self.config.iobes:
        self.tags = iob_to_iobes(self.tags)
    tokens = [Token(t, g) for t, g in zip(self.texts, self.tags)]
    self.document.add_child(Sentence(tokens=tokens))
    self.texts = []
    self.tags = []
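
# A minimal sketch of the IOB2-to-IOBES conversion referenced above via
# iob_to_iobes (a hypothetical stand-in, not necessarily the project's
# implementation): single-token spans become S- tags and span-final I- tags
# become E- tags, assuming the input is valid IOB2.
def iob_to_iobes(tags):
    iobes = []
    for i, tag in enumerate(tags):
        next_tag = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag.startswith('B-'):
            iobes.append(tag if next_tag == 'I-' + tag[2:] else 'S-' + tag[2:])
        elif tag.startswith('I-'):
            iobes.append(tag if next_tag == 'I-' + tag[2:] else 'E-' + tag[2:])
        else:    # 'O' and any other tag pass through unchanged
            iobes.append(tag)
    return iobes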
def make_document(token_texts, label):
    """Return Document object initialized with given token texts."""
    tokens = [Token(t) for t in token_texts]
    # We don't have sentence splitting, but the data structure expects
    # Documents to contain Sentences which in turn contain Tokens.
    # Create a dummy sentence containing all document tokens to work
    # around this constraint.
    sentences = [Sentence(tokens=tokens)]
    return Document(target_str=label, sentences=sentences)
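
# Hypothetical usage of make_document: wrap an already-tokenized text into a
# single-sentence Document carrying a document-level label.
doc = make_document(['The', 'share', 'price', 'fell', '.'], 'negative')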
def sent2tokens(sent, embeddings=None):
    words = sent2words(sent)
    postags = sent2postags(sent)
    word_vectors = sent2embeddings(sent, embeddings)
    # chunk tags, dependency features or embedding features can also be added
    tokens = [
        Token(surface=w, pos=postag, embedding=emb)
        for (w, postag, emb) in zip(words, postags, word_vectors)
    ]
    for t in tokens:
        # generate features
        t.add(token_features(token=t))
    return tokens
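
# A minimal sketch of a token_features helper (hypothetical; the real feature
# set is not shown here), assuming it returns a feature mapping consumed by
# Token.add and that Token exposes the surface and pos attributes set above.
def token_features(token):
    surface = token.surface
    return {
        'lower': surface.lower(),
        'is_upper': surface.isupper(),
        'is_title': surface.istitle(),
        'is_digit': surface.isdigit(),
        'prefix3': surface[:3],
        'suffix3': surface[-3:],
        'pos': token.pos,
    }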
def main(argv):
    # test example generation
    from data import Token, ConllLoader, load_labels
    from label import Iob2TokenLabeler, LabelEncoder
    from transformers import AutoConfig, AutoTokenizer

    options = argparser().parse_args(argv[1:])
    seq_len = options.max_seq_length

    word_labels = load_labels(options.labels)
    token_labeler = Iob2TokenLabeler(word_labels)    # TODO add argument
    token_labels = token_labeler.labels()
    label_func = token_labeler.label_tokens
    label_encoder = LabelEncoder(token_labels, padding_label='O')    # TODO
    encode_labels = label_encoder.encode

    config = AutoConfig.from_pretrained(options.model_name,
                                        cache_dir=options.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(options.model_name,
                                              config=config,
                                              cache_dir=options.cache_dir)
    tokenize_func = tokenizer.tokenize
    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenize_func, label_func)
    example_generator = WrapSentenceExampleGenerator(
        seq_len,
        Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens,
        encode_labels)

    for fn in options.conll_data:
        documents = list(document_loader.load(fn))
        examples = list(example_generator.examples(documents))
        for i, example in enumerate(examples):
            print(f'example {i}')
            print(example)
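
# Hypothetical illustration of the CoNLL-style input that ConllLoader is
# expected to read here (assuming one word and its IOB2 label per line,
# separated by whitespace, with a blank line between sentences):
#
#   John      B-PER
#   Smith     I-PER
#   visited   O
#   Paris     B-LOC
#   .         O
#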
def loadSrc(self):
    corpus = self.corpus
    src = self.src
    tokens = []
    id = 0
    with open(src, 'r') as fin:
        for line in fin.readlines():
            if line == '\n':
                tmp_tokens = list(tokens)
                le = len(tmp_tokens)
                for i, token in enumerate(tmp_tokens):
                    h_id = token.h_id
                    h_rel = token.rel
                    if i < le - 1:
                        tmp_tokens[i].add_d_id_rel(i + 1, '@+1@')
                    if i > 0:
                        tmp_tokens[i].add_u_id_rel(i - 1, '@-1@')
                    if h_id != -1:
                        tmp_tokens[h_id].add_d_id_rel(i, h_rel)
                sent = Sentence(tmp_tokens)
                corpus.append(sent)
                tokens = []
                id = 0
            else:
                items = line.strip().split()
                t_str = items[0]
                h_id = int(items[1])
                rel = items[2]
                label = items[3]
                token = Token(id, t_str, h_id, rel, label)
                tokens.append(token)
                id += 1
    return corpus
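
# Hypothetical illustration of the source format parsed by loadSrc above
# (one token per line with four whitespace-separated fields: surface form,
# 0-based head index, dependency relation, label; a blank line ends a
# sentence and a head index of -1 appears to mark a token without a head,
# e.g. the root):
#
#   The     1    det      O
#   dog     2    nsubj    O
#   barked  -1   root     O
#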
def main(argv):
    options = argparser().parse_args(argv[1:])
    logger.info(f'train.py arguments: {options}')

    # word_labels are the labels assigned to words in the original
    # data, token_labeler.labels() the labels assigned to tokens in
    # the tokenized data. The two are differentiated to allow distinct
    # labels to be added e.g. to continuation wordpieces.
    word_labels = load_labels(options.labels)
    token_labeler = IobesTokenLabeler(word_labels)
    num_labels = len(token_labeler.labels())
    label_encoder = LabelEncoder(token_labeler.labels())
    logger.info(f'token labels: {token_labeler.labels()}')

    logger.info('loading pretrained model')
    pretrained_model, tokenizer, config = load_pretrained(
        options.model_name, cache_dir=options.cache_dir)
    logger.info('pretrained model config:')
    logger.info(config)

    if options.max_seq_length > config.max_position_embeddings:
        raise ValueError(f'--max_seq_length {options.max_seq_length} not '
                         f'supported by model')
    seq_len = options.max_seq_length

    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)

    document_loader = ConllLoader(tokenizer.tokenize,
                                  token_labeler.label_tokens,
                                  options.separator)
    example_generator = EXAMPLE_GENERATORS[options.examples](
        seq_len,
        Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens,
        label_encoder.encode)

    train_documents = document_loader.load(options.train_data)
    dev_documents = document_loader.load(options.dev_data)
    # containers instead of generators for statistics
    train_documents = list(train_documents)
    dev_documents = list(dev_documents)
    log_dataset_statistics('train', train_documents)
    log_dataset_statistics('dev', dev_documents)

    decoder = ViterbiDecoder(label_encoder.label_map)
    decoder.estimate_probabilities(train_documents)
    logger.info(f'init_prob:\n{decoder.init_prob}')
    logger.info(f'trans_prob:\n{decoder.trans_prob}')

    train_examples = example_generator.examples(train_documents)
    dev_examples = example_generator.examples(dev_documents)
    # containers instead of generators for len() and logging
    train_examples = list(train_examples)
    dev_examples = list(dev_examples)
    num_train_examples = len(train_examples)
    log_examples(train_examples, count=2)

    train_x, train_y = examples_to_inputs(train_examples)
    dev_x, dev_y = examples_to_inputs(dev_examples)

    ner_model = build_ner_model(pretrained_model, num_labels, seq_len)

    optimizer, lr_schedule = get_optimizer(
        options.lr,
        options.num_train_epochs,
        options.batch_size,
        options.warmup_proportion,
        num_train_examples,
    )

    ner_model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',    # TODO is this necessary?
        metrics=['sparse_categorical_accuracy'])
    logger.info('ner model:')
    ner_model.summary(print_fn=logger.info)

    lr_history = LRHistory(lr_schedule)
    history = ner_model.fit(
        train_x,
        train_y,
        epochs=options.num_train_epochs,
        batch_size=options.batch_size,
        validation_data=(dev_x, dev_y),
        callbacks=[lr_history])
    for k, v in history.history.items():
        logger.info(f'{k} history: {v}')
    logger.info(f'lr history: {lr_history.by_epoch}')

    dev_predictions = ner_model.predict(dev_x, verbose=1,
                                        batch_size=options.batch_size)

    assert len(dev_examples) == len(dev_predictions)
    for example, preds in zip(dev_examples, dev_predictions):
        assert len(example.tokens) == len(preds)
        for pos, (token, pred) in enumerate(zip(example.tokens, preds)):
            token.predictions.append((pos, pred))

    documents = unique(
        t.document for e in dev_examples for t in e.tokens if not t.is_special)
    check_predictions(documents)

    for n, r in evaluate_assign_labels_funcs(documents, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    summarize_predictions = PREDICTION_SUMMARIZERS[options.summarize_preds]
    assign_labels = LABEL_ASSIGNERS[options.assign_labels]

    for document in documents:
        summarize_predictions(document)
        assign_labels(document, label_encoder)

    for n, r in evaluate_viterbi(documents, decoder.init_prob,
                                 decoder.trans_prob, label_encoder).items():
        print(f'{n}: prec {r.prec:.2%} rec {r.rec:.2%} f {r.fscore:.2%}')

    for document in documents:
        assign_labels(document, label_encoder)    # greedy

    print(conlleval_report(documents))

    if options.output_file is not None:
        with open(options.output_file, 'w') as out:
            write_conll(documents, out=out)

    if options.ner_model_dir is not None:
        save_ner_model(options.ner_model_dir, ner_model, decoder, tokenizer,
                       word_labels, config)

    return 0
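
import numpy as np

# A minimal sketch of an 'avg' prediction summarizer (a hypothetical stand-in
# for PREDICTION_SUMMARIZERS['avg'], not the repository's implementation):
# a token can collect several softmax distributions when it occurs in more
# than one example, and the summary used downstream (token.pred_summary)
# is their element-wise mean.
def summarize_by_averaging(document):
    for sentence in document.sentences:
        for word in sentence.words:
            for token in word.tokens:
                preds = [pred for _, pred in token.predictions]
                token.pred_summary = np.mean(preds, axis=0)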
def main(argv):
    options = argparser().parse_args(argv[1:])

    ner_model, decoder, tokenizer, word_labels, config = load_ner_model(
        options.ner_model_dir)
    token_labeler = IobesTokenLabeler(word_labels)
    label_encoder = LabelEncoder(token_labeler.labels())

    encode_tokens = lambda t: tokenizer.encode(t, add_special_tokens=False)
    document_loader = ConllLoader(
        tokenizer.tokenize,
        token_labeler.label_tokens,
        options.separator,
        #test=True
    )

    example_generator = 'wrap'    # TODO read from config
    seq_len = 512    # TODO read from config
    example_generator = EXAMPLE_GENERATORS[example_generator](
        seq_len,
        Token(tokenizer.cls_token, is_special=True, masked=False),
        Token(tokenizer.sep_token, is_special=True, masked=False),
        Token(tokenizer.pad_token, is_special=True, masked=True),
        encode_tokens,
        label_encoder.encode)

    test_documents = document_loader.load(options.data)
    test_examples = example_generator.examples(test_documents)
    test_examples = list(test_examples)    # TODO stream

    test_x, test_y = examples_to_inputs(test_examples)
    test_predictions = ner_model.predict(test_x)

    for example, preds in zip(test_examples, test_predictions):
        assert len(example.tokens) == len(preds)
        for pos, (token, pred) in enumerate(zip(example.tokens, preds)):
            token.predictions.append((pos, pred))

    documents = unique(
        t.document for e in test_examples for t in e.tokens if not t.is_special)

    summarize_preds = 'avg'    # TODO read from config
    assign_labels = 'first'    # TODO read from config
    summarize_predictions = PREDICTION_SUMMARIZERS[summarize_preds]
    assign_labels = LABEL_ASSIGNERS[assign_labels]

    for document in documents:
        summarize_predictions(document)
        assign_labels(document, label_encoder)

    with open('greedy.tsv', 'w') as out:
        write_conll(documents, out=out)
    print(conlleval_report(documents))

    for document in documents:
        for sentence in document.sentences:
            tokens = [t for w in sentence.words for t in w.tokens]
            cond_prob = [t.pred_summary for t in tokens]
            path = decoder.viterbi_path(cond_prob, weight=4)
            assert len(path) == len(tokens)
            for idx, token in zip(path, tokens):
                label = label_encoder.inv_label_map[idx]
                label = iobes_to_iob2(label)
                token.viterbi_label = label
            for word in sentence.words:
                word.predicted_label = word.tokens[0].viterbi_label

    with open('viterbi.tsv', 'w') as out:
        write_conll(documents, out=out)
    print(conlleval_report(documents))
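
import numpy as np

# A minimal sketch of weighted Viterbi decoding over per-token label
# distributions (a hypothetical stand-in for decoder.viterbi_path, not the
# repository's implementation). cond_prob is a (seq_len, num_labels) array of
# per-token probabilities, init_prob and trans_prob are assumed to be plain
# (non-log) probabilities with trans_prob[prev, curr], and weight scales the
# emission scores against the transition scores, mirroring the weight=4 call
# above.
def viterbi_path(cond_prob, init_prob, trans_prob, weight=1.0):
    eps = 1e-12
    emissions = weight * np.log(np.asarray(cond_prob) + eps)
    log_init = np.log(np.asarray(init_prob) + eps)
    log_trans = np.log(np.asarray(trans_prob) + eps)
    seq_len, num_labels = emissions.shape
    score = log_init + emissions[0]
    backptr = np.zeros((seq_len, num_labels), dtype=int)
    for t in range(1, seq_len):
        candidates = score[:, None] + log_trans    # (prev, curr)
        backptr[t] = candidates.argmax(axis=0)
        score = candidates.max(axis=0) + emissions[t]
    # follow back-pointers from the best final state
    path = [int(score.argmax())]
    for t in range(seq_len - 1, 0, -1):
        path.append(int(backptr[t][path[-1]]))
    return path[::-1]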