def main(argv):
    argparser = argument_parser('predict')
    args = argparser.parse_args(argv[1:])
    ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    max_seq_len = config['max_seq_length']
    label_map = {t: i for i, t in enumerate(labels)}
    inv_label_map = {v: k for k, v in label_map.items()}
    test_words, dummy_labels = read_conll(args.test_data, mode='test')
    test_data = process_sentences(test_words, dummy_labels, tokenizer,
                                  max_seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, max_seq_len)
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    pred_labels = []
    for i, pred in enumerate(preds):
        # Skip the special start token position and truncate at the
        # sentence length.
        pred_labels.append(
            [inv_label_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]])
    write_result(args.output_file, test_data.words, test_data.lengths,
                 test_data.tokens, test_data.labels, pred_labels,
                 mode='predict')
    return 0
def __init__(self, bot_self, ctx, arg1, sc):
    self.bot_self = bot_self
    self.ctx = ctx
    self.arg1 = argument_parser(sc, arg1)
    self.sub_command = sc
    self.embed_title = self.title()
    self.s_obj = self.sql_ship_obj()
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation
    pretrained_model, tokenizer = load_pretrained(args)
    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer,
                                   seq_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)
    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}
    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)
    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels,
                                          tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels,
                                        tag_map, seq_len)
    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), args)
    ner_model.compile(optimizer,
                      loss='sparse_categorical_crossentropy',
                      sample_weight_mode='temporal',
                      metrics=['sparse_categorical_accuracy'])
    ner_model.fit(train_x, train_y, sample_weight=train_weights,
                  epochs=args.num_train_epochs, batch_size=args.batch_size)
    if args.ner_model_dir is not None:
        label_list = [v for k, v in sorted(inv_tag_map.items())]
        save_ner_model(ner_model, tokenizer, label_list, args)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    pred_tags = []
    for i, pred in enumerate(preds):
        pred_tags.append(
            [inv_tag_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]])
    lines = write_result(args.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, pred_tags)
    c = conlleval.evaluate(lines)
    conlleval.report(c)
    return 0
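# A minimal sketch of what viterbi_probabilities() above might compute, under
# the assumption that it estimates categorical start and transition
# distributions by counting tags in the training label sequences; the name
# *_sketch and the smoothing constant are illustrative, not the repo's code.
import numpy as np

def viterbi_probabilities_sketch(sentence_labels, tag_map, smoothing=1e-6):
    num_tags = len(tag_map)
    init_counts = np.full(num_tags, smoothing)
    trans_counts = np.full((num_tags, num_tags), smoothing)
    for labels in sentence_labels:
        indices = [tag_map[label] for label in labels]
        if indices:
            init_counts[indices[0]] += 1
        for prev, curr in zip(indices, indices[1:]):
            trans_counts[prev, curr] += 1
    # Normalize counts into a start distribution and a row-stochastic
    # transition matrix.
    init_prob = init_counts / init_counts.sum()
    trans_prob = trans_counts / trans_counts.sum(axis=1, keepdims=True)
    return init_prob, trans_prob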
def main(argv):
    argparser = argument_parser('serve')
    args = argparser.parse_args(argv[1:])
    if args.ner_model_dir is None:
        args.ner_model_dir = DEFAULT_MODEL_DIR
    app.tagger = Tagger.load(args.ner_model_dir)
    app.run(port=8080)
    return 0
def main(argv):
    args = argument_parser('serve').parse_args(argv[1:])
    session = tf.Session()
    graph = tf.get_default_graph()
    with graph.as_default():
        with session.as_default():
            app.model, app.tokenizer, app.labels, app.model_config = \
                load_model(args.model_dir)
            app.session = session
            app.graph = graph
            app.run(port=args.port, debug=True)
    return 0
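# A hypothetical handler sketch (the '/predict' route, the 'text' form field,
# and the reuse of encode() are assumptions, not part of the original code):
# Flask dispatches requests on worker threads that do not see the TF1 default
# graph/session, which is why main() above stashes both on the app object and
# why a handler must re-enter them before calling the model.
import flask

@app.route('/predict', methods=['POST'])
def predict_route():
    text = flask.request.form['text']
    with app.graph.as_default():
        with app.session.as_default():
            # Tokenize and encode inside the stored graph/session so the
            # Keras model's variables resolve on this worker thread.
            x = encode([app.tokenizer.tokenize(text)], app.tokenizer,
                       app.model_config['max_seq_length'])
            probs = app.model.predict(x)
    return flask.jsonify(probs.argmax(axis=-1).tolist())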
def main(argv):
    argparser = argument_parser('predict')
    args = argparser.parse_args(argv[1:])
    ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    max_seq_len = config['max_seq_length']
    label_map = {t: i for i, t in enumerate(labels)}
    inv_label_map = {v: k for k, v in label_map.items()}
    if args.viterbi:
        try:
            init_prob, trans_prob = load_viterbi_probabilities(
                args.ner_model_dir, label_map)
        except Exception as e:
            error('failed to load viterbi probabilities: {}'.format(e))
            init_prob, trans_prob, args.viterbi = None, None, False
    test_words, dummy_labels = read_conll(args.test_data, mode='test')
    test_data = process_sentences(test_words, dummy_labels, tokenizer,
                                  max_seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, max_seq_len)
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    pred_labels = []
    if not args.viterbi:
        # Greedy decoding: take the most probable tag at each position.
        preds = np.argmax(probs, axis=-1)
        for i, pred in enumerate(preds):
            pred_labels.append([
                inv_label_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]
            ])
    else:
        # Viterbi decoding: find the most probable tag sequence given the
        # stored start and transition probabilities.
        for i, prob in enumerate(probs):
            cond_prob = prob[1:len(test_data.tokens[i]) + 1]
            path = viterbi_path(init_prob, trans_prob, cond_prob)
            pred_labels.append([inv_label_map[t] for t in path])
    write_result(args.output_file, test_data.words, test_data.lengths,
                 test_data.tokens, test_data.labels, pred_labels,
                 mode='predict')
    return 0
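# A minimal numpy sketch of the decoding that viterbi_path() above might
# perform, assuming init_prob is a (num_tags,) start distribution, trans_prob
# a (num_tags, num_tags) transition matrix, and cond_prob the per-token tag
# probabilities from the network; the real implementation may work in log
# space to avoid underflow on long sentences.
import numpy as np

def viterbi_path_sketch(init_prob, trans_prob, cond_prob):
    num_tokens, num_tags = cond_prob.shape
    delta = np.zeros((num_tokens, num_tags))    # best score ending in each tag
    backpointer = np.zeros((num_tokens, num_tags), dtype=int)
    delta[0] = init_prob * cond_prob[0]
    for t in range(1, num_tokens):
        # scores[prev, curr] = delta[t-1, prev] * P(curr|prev) * P(obs|curr)
        scores = delta[t - 1][:, None] * trans_prob * cond_prob[t]
        backpointer[t] = scores.argmax(axis=0)
        delta[t] = scores.max(axis=0)
    # Backtrack from the best final tag to recover the full path.
    path = [int(delta[-1].argmax())]
    for t in range(num_tokens - 1, 0, -1):
        path.append(int(backpointer[t, path[-1]]))
    return path[::-1]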
def main(argv):
    args = argument_parser('predict').parse_args(argv[1:])
    model, tokenizer, labels, config = load_model_etc(args.model_dir)
    _, test_texts = load_tsv_data(args.test_data, args)
    max_seq_len = config['max_seq_length']
    replace_span = config['replace_span']
    label_map = {t: i for i, t in enumerate(labels)}
    inv_label_map = {v: k for k, v in label_map.items()}
    test_tok = tokenize_texts(test_texts, tokenizer)
    test_x = encode_tokenized(test_tok, tokenizer, max_seq_len, replace_span)
    probs = model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    for p in preds:
        print(inv_label_map[p])
    return 0
def main(argv):
    args = argument_parser('test').parse_args(argv[1:])
    model, tokenizer, labels, config = load_model(args.model_dir)
    test_labels, test_texts = load_tsv_data(args.test_data, args)
    max_seq_len = config['max_seq_length']
    replace_span = config['replace_span']
    label_map = {t: i for i, t in enumerate(labels)}
    inv_label_map = {v: k for k, v in label_map.items()}
    test_tok = tokenize_texts(test_texts, tokenizer)
    test_x = encode_tokenized(test_tok, tokenizer, max_seq_len, replace_span)
    test_y = [label_map[l] for l in test_labels]
    probs = model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    correct, total = sum(g == p for g, p in zip(test_y, preds)), len(test_y)
    print('Test accuracy: {:.1%} ({}/{})'.format(correct / total, correct,
                                                 total))
    return 0
def main(argv):
    print_versions()
    args = argument_parser('train').parse_args(argv[1:])
    args.train_data = args.train_data.split(',')
    if args.checkpoint_steps is not None:
        os.makedirs(args.checkpoint_dir, exist_ok=True)
    strategy = MirroredStrategy()
    num_devices = strategy.num_replicas_in_sync
    # Batch datasets with the global batch size (local batch size * GPUs)
    global_batch_size = args.batch_size * num_devices
    tokenizer = get_tokenizer(args)
    label_list = load_labels(args.labels)
    label_map = {l: i for i, l in enumerate(label_list)}
    inv_label_map = {v: k for k, v in label_map.items()}
    if args.task_name not in ('NER', 'RE'):
        raise ValueError('Task not found: {}'.format(args.task_name))
    if args.train_data[0].endswith('.tsv'):
        if len(args.train_data) > 1:
            raise NotImplementedError('Multiple TSV inputs')
        train_data = TsvSequence(args.train_data[0], tokenizer, label_map,
                                 global_batch_size, args)
        input_format = 'tsv'
    elif args.train_data[0].endswith('.tfrecord'):
        train_data = train_tfrecord_input(args.train_data,
                                          args.max_seq_length,
                                          global_batch_size)
        input_format = 'tfrecord'
    else:
        raise ValueError('--train_data must be .tsv or .tfrecord')
    if args.dev_data is None:
        dev_x, dev_y = None, None
        validation_data = None
    else:
        dev_x, dev_y = load_dataset(args.dev_data, tokenizer,
                                    args.max_seq_length, label_map, args)
        validation_data = (dev_x, dev_y)
    print('Number of devices: {}'.format(num_devices), file=sys.stderr,
          flush=True)
    if num_devices > 1 and input_format != 'tfrecord':
        warning('TFRecord input recommended for multi-device training')
    num_train_examples = num_examples(args.train_data)
    num_labels = len(label_list)
    print('num_train_examples: {}'.format(num_train_examples),
          file=sys.stderr, flush=True)
    with strategy.scope():
        model = restore_or_create_model(num_train_examples, num_labels,
                                        global_batch_size, args)
        model.summary(print_fn=print)
    callbacks = []
    if args.checkpoint_steps is not None:
        callbacks.append(ModelCheckpoint(
            filepath=os.path.join(args.checkpoint_dir, CHECKPOINT_NAME),
            save_freq=args.checkpoint_steps
        ))
        callbacks.append(DeleteOldCheckpoints(
            args.checkpoint_dir, CHECKPOINT_NAME, args.max_checkpoints
        ))
    if input_format == 'tsv':
        other_args = {
            'workers': 10,    # TODO
        }
    else:
        assert input_format == 'tfrecord', 'internal error'
        steps_per_epoch = int(np.ceil(num_train_examples / global_batch_size))
        other_args = {'steps_per_epoch': steps_per_epoch}
    model.fit(
        train_data,
        epochs=args.num_train_epochs,
        callbacks=callbacks,
        validation_data=validation_data,
        validation_batch_size=global_batch_size,
        **other_args
    )
    if validation_data is not None:
        probs = model.predict(dev_x, batch_size=global_batch_size)
        preds = np.argmax(probs, axis=-1)
        correct, total = sum(g == p for g, p in zip(dev_y, preds)), len(dev_y)
        print('Final dev accuracy: {:.1%} ({}/{})'.format(
            correct / total, correct, total))
    if args.model_dir is not None:
        print('Saving model in {}'.format(args.model_dir))
        save_model_etc(model, tokenizer, label_list, args)
    return 0
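# A hypothetical sketch of the DeleteOldCheckpoints callback referenced above,
# assuming checkpoints are written as single files into checkpoint_dir and
# that all but the newest max_checkpoints of them should be pruned; the class
# name *Sketch and the glob-by-mtime strategy are assumptions, not the
# repository's actual implementation.
import glob
import os

from tensorflow.keras.callbacks import Callback

class DeleteOldCheckpointsSketch(Callback):
    def __init__(self, checkpoint_dir, name_prefix, max_checkpoints):
        super().__init__()
        self.checkpoint_dir = checkpoint_dir
        self.name_prefix = name_prefix
        self.max_checkpoints = max_checkpoints

    def on_epoch_end(self, epoch, logs=None):
        # Collect matching checkpoint files oldest-first and delete the
        # surplus beyond the newest max_checkpoints.
        pattern = os.path.join(self.checkpoint_dir, self.name_prefix + '*')
        paths = sorted(glob.glob(pattern), key=os.path.getmtime)
        excess = len(paths) - self.max_checkpoints
        for path in paths[:max(excess, 0)]:
            os.remove(path)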
def main(argv):
    argparser = argument_parser('serve')
    args = argparser.parse_args(argv[1:])
    app.tagger = Tagger.load(args.ner_model_dir)
    app.run(port=8080)
    return 0
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation
    pretrained_model, tokenizer = load_pretrained(args)
    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer,
                                        seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer,
                                       seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words,
                                                               train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words,
                                                               test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids,
                                  tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids,
                                 tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer,
                                       seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer,
                                      seq_len, args.predict_position)
    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}
    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels,
                                          tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels,
                                        tag_map, seq_len)
    if args.use_ner_model and args.ner_model_dir is not None:
        ner_model, tokenizer, labels, config = load_ner_model(
            args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model
        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
        )
        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
        )
        if args.ner_model_dir is not None:
            label_list = [v for k, v in sorted(inv_tag_map.items())]
            save_ner_model(ner_model, tokenizer, label_list, args)
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    results = []
    m_names = []
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')
        ensemble = []
        for i, pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble)
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        results.append([conlleval.metrics(c)[0].prec,
                        conlleval.metrics(c)[0].rec,
                        conlleval.metrics(c)[0].fscore])
    else:
        # First tag, then vote
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(
            probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV:  documentwise CMV
            # D-CMVP: documentwise CMV, probs summed, argmax after that
            # D-F:    documentwise First
            # D-FP:   documentwise First, probs summed
            method_names = ['D-CMV', 'D-CMVP', 'D-F', 'D-FP']
        else:
            method_names = ['CMV', 'CMVP', 'F', 'FP']
        for i, ensem in enumerate(ens):
            ensemble = []
            for j, pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file,
                                                    method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec,
                            conlleval.metrics(c)[0].rec,
                            conlleval.metrics(c)[0].fscore])
        m_names.extend(method_names)
    if args.sentence_in_context:
        starting_pos = np.arange(0, seq_len + 1, 32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(
                test_data.tokens, test_data.labels, seq_len - 1, start_p - 1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)
            pred_tags = []
            for i, pred in enumerate(preds):
                idx = line_nos[i].index(i)
                pred_tags.append(
                    [inv_tag_map[t] for t in
                     pred[line_starts[i][idx] + 1:
                          line_starts[i][idx] + len(test_data.tokens[i]) + 1]])
            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags)
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec,
                            conlleval.metrics(c)[0].rec,
                            conlleval.metrics(c)[0].fscore])
    result_file = "./results/results-{}.csv".format(args.output_file)
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(
                args.output_file, args.max_seq_length, args.bert_config_file,
                args.num_train_epochs, args.learning_rate, args.batch_size,
                args.predict_position, args.train_data, args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n')
    for i in results:
        print(i)
    return 0