Example #1
0
def main(argv):
    """Tag a CoNLL-format test set with a saved NER model.

    Loads the model named by --ner_model_dir, predicts a label per token,
    and writes the predictions with write_result(mode='predict').
    Returns 0 on success.
    """
    args = argument_parser('predict').parse_args(argv[1:])

    ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    seq_len = config['max_seq_length']

    # Label string <-> index mappings derived from the saved label set.
    tag_to_idx = {tag: idx for idx, tag in enumerate(labels)}
    idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

    words, placeholder_tags = read_conll(args.test_data, mode='test')
    data = process_sentences(words, placeholder_tags, tokenizer, seq_len)

    encoded = encode(data.combined_tokens, tokenizer, seq_len)
    probabilities = ner_model.predict(encoded, batch_size=args.batch_size)
    predictions = np.argmax(probabilities, axis=-1)

    # Positions 1..len(tokens) of each prediction row map back to the
    # sentence's tokens (position 0 is a special token and is skipped).
    pred_labels = [
        [idx_to_tag[t] for t in row[1:len(tokens) + 1]]
        for row, tokens in zip(predictions, data.tokens)
    ]

    write_result(args.output_file,
                 data.words,
                 data.lengths,
                 data.tokens,
                 data.labels,
                 pred_labels,
                 mode='predict')

    return 0
Example #2
0
 def __init__(self, bot_self, ctx, arg1, sc):
     """Set up state for sub-command `sc` invoked with raw argument `arg1`."""
     # Bot instance and invocation context, stored for later use.
     self.bot_self = bot_self
     self.ctx = ctx
     # Parse the raw argument in the context of the sub-command.
     self.arg1 = argument_parser(sc, arg1)
     self.sub_command = sc
     # Derived state computed by helper methods — presumably these use the
     # attributes assigned above, so keep this ordering (confirm).
     self.embed_title = self.title()
     self.s_obj = self.sql_ship_obj()
Example #3
0
def main(argv):
    """Train a NER model on CoNLL data and report conlleval metrics.

    Optionally saves the trained model and Viterbi probabilities when
    --ner_model_dir is given. Returns 0 on success.
    """
    opts = argument_parser().parse_args(argv[1:])
    max_len = opts.max_seq_length  # abbreviation

    pretrained_model, tokenizer = load_pretrained(opts)

    # Read and tokenize the training and test sentences.
    tr_words, tr_tags = read_conll(opts.train_data)
    te_words, te_tags = read_conll(opts.test_data)
    train_data = process_sentences(tr_words, tr_tags, tokenizer, max_len)
    test_data = process_sentences(te_words, te_tags, tokenizer, max_len)

    labels = get_labels(train_data.labels)
    tag_map = {label: index for index, label in enumerate(labels)}
    inv_tag_map = {index: label for label, index in tag_map.items()}

    # Initial/transition probabilities for later Viterbi decoding.
    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

    train_x = encode(train_data.combined_tokens, tokenizer, max_len)
    test_x = encode(test_data.combined_tokens, tokenizer, max_len)

    train_y, train_weights = label_encode(
        train_data.combined_labels, tag_map, max_len)
    test_y, test_weights = label_encode(
        test_data.combined_labels, tag_map, max_len)

    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), opts)

    ner_model.compile(optimizer,
                      loss='sparse_categorical_crossentropy',
                      sample_weight_mode='temporal',
                      metrics=['sparse_categorical_accuracy'])

    ner_model.fit(train_x, train_y,
                  sample_weight=train_weights,
                  epochs=opts.num_train_epochs,
                  batch_size=opts.batch_size)

    if opts.ner_model_dir is not None:
        # Persist labels in index order so a reload reproduces the mapping.
        label_list = [tag for _, tag in sorted(inv_tag_map.items())]
        save_ner_model(ner_model, tokenizer, label_list, opts)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, opts)

    probabilities = ner_model.predict(test_x, batch_size=opts.batch_size)
    best = np.argmax(probabilities, axis=-1)

    # Map predicted indices back to tag strings, skipping position 0 and
    # truncating to each sentence's token count.
    pred_tags = [
        [inv_tag_map[t] for t in row[1:len(tokens) + 1]]
        for row, tokens in zip(best, test_data.tokens)
    ]

    lines = write_result(opts.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, pred_tags)

    counts = conlleval.evaluate(lines)
    conlleval.report(counts)
    return 0
Example #4
0
def main(argv):
    """Start the tagger web service, defaulting the model directory."""
    args = argument_parser('serve').parse_args(argv[1:])
    model_dir = args.ner_model_dir
    if model_dir is None:
        model_dir = DEFAULT_MODEL_DIR
    app.tagger = Tagger.load(model_dir)
    app.run(port=8080)
    return 0
Example #5
0
def main(argv):
    """Serve the model: load it into a dedicated TF graph/session pair
    that the request handlers can reuse, then run the web app."""
    argparser = argument_parser('serve')
    args = argparser.parse_args(argv[1:])
    sess = tf.Session()
    default_graph = tf.get_default_graph()
    with default_graph.as_default(), sess.as_default():
        (app.model, app.tokenizer,
         app.labels, app.model_config) = load_model(args.model_dir)
        app.session = sess
        app.graph = default_graph
    app.run(port=args.port, debug=True)
    return 0
Example #6
0
def main(argv):
    """Predict NER labels for a test set, optionally Viterbi-decoded.

    With --viterbi, per-position probabilities are decoded with the saved
    initial/transition probabilities; otherwise plain argmax is used.
    Returns 0 on success.
    """
    argparser = argument_parser('predict')
    args = argparser.parse_args(argv[1:])

    ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    seq_len = config['max_seq_length']

    label_map = {label: idx for idx, label in enumerate(labels)}
    inv_label_map = {idx: label for label, idx in label_map.items()}

    if args.viterbi:
        try:
            init_prob, trans_prob = load_viterbi_probabilities(
                args.ner_model_dir, label_map)
        except Exception as e:
            # Fall back to plain argmax decoding when loading fails.
            error('failed to load viterbi probabilities: {}'.format(e))
            init_prob, trans_prob, args.viterbi = None, None, False

    words, dummy_tags = read_conll(args.test_data, mode='test')
    data = process_sentences(words, dummy_tags, tokenizer, seq_len)

    encoded = encode(data.combined_tokens, tokenizer, seq_len)
    probs = ner_model.predict(encoded, batch_size=args.batch_size)

    pred_labels = []
    if args.viterbi:
        # Decode each sentence's probability matrix with Viterbi, keeping
        # positions 1..len(tokens) (position 0 is a special token).
        for sent_probs, tokens in zip(probs, data.tokens):
            cond_prob = sent_probs[1:len(tokens) + 1]
            path = viterbi_path(init_prob, trans_prob, cond_prob)
            pred_labels.append([inv_label_map[s] for s in path])
    else:
        for row, tokens in zip(np.argmax(probs, axis=-1), data.tokens):
            pred_labels.append(
                [inv_label_map[t] for t in row[1:len(tokens) + 1]])

    write_result(args.output_file,
                 data.words,
                 data.lengths,
                 data.tokens,
                 data.labels,
                 pred_labels,
                 mode='predict')

    return 0
Example #7
0
def main(argv):
    """Classify TSV test texts and print one predicted label per line."""
    args = argument_parser('predict').parse_args(argv[1:])

    model, tokenizer, labels, config = load_model_etc(args.model_dir)
    _, texts = load_tsv_data(args.test_data, args)

    seq_len = config['max_seq_length']
    span = config['replace_span']

    # Label <-> index maps built from the saved label list.
    label_map = {label: i for i, label in enumerate(labels)}
    inv_label_map = {i: label for label, i in label_map.items()}

    tokenized = tokenize_texts(texts, tokenizer)
    encoded = encode_tokenized(tokenized, tokenizer, seq_len, span)

    predictions = np.argmax(
        model.predict(encoded, batch_size=args.batch_size), axis=-1)
    for p in predictions:
        print(inv_label_map[p])

    return 0
Example #8
0
def main(argv):
    """Evaluate a saved classifier on TSV test data and print accuracy."""
    args = argument_parser('test').parse_args(argv[1:])

    model, tokenizer, labels, config = load_model(args.model_dir)
    gold_labels, texts = load_tsv_data(args.test_data, args)

    seq_len = config['max_seq_length']
    span = config['replace_span']

    label_map = {label: i for i, label in enumerate(labels)}
    inv_label_map = {i: label for label, i in label_map.items()}

    tokenized = tokenize_texts(texts, tokenizer)
    encoded = encode_tokenized(tokenized, tokenizer, seq_len, span)
    gold = [label_map[l] for l in gold_labels]

    predicted = np.argmax(
        model.predict(encoded, batch_size=args.batch_size), axis=-1)
    total = len(gold)
    correct = sum(g == p for g, p in zip(gold, predicted))
    print('Test accuracy: {:.1%} ({}/{})'.format(correct / total, correct,
                                                 total))

    return 0
Example #9
0
def main(argv):
    """Train a classifier ('NER' or 'RE' task) from TSV or TFRecord input
    under MirroredStrategy, with optional step-based checkpointing,
    dev-set validation, and final model saving.

    Returns 0 on success. Raises ValueError for an unknown task name or
    an unsupported --train_data extension, and NotImplementedError for
    multiple TSV inputs.
    """
    print_versions()
    args = argument_parser('train').parse_args(argv[1:])

    # --train_data may name several files, comma-separated.
    args.train_data = args.train_data.split(',')
    if args.checkpoint_steps is not None:
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Data-parallel training across all visible devices.
    strategy = MirroredStrategy()
    num_devices = strategy.num_replicas_in_sync
    # Batch datasets with global batch size (local * GPUs)
    global_batch_size = args.batch_size * num_devices

    tokenizer = get_tokenizer(args)

    # Label name <-> integer index mappings.
    label_list = load_labels(args.labels)
    label_map = { l: i for i, l in enumerate(label_list) }
    inv_label_map = { v: k for k, v in label_map.items() }

    if args.task_name not in (["NER","RE"]):
        raise ValueError("Task not found: {}".format(args.task_name))

    # Select the input pipeline from the first training file's extension.
    if args.train_data[0].endswith('.tsv'):
        if len(args.train_data) > 1:
            raise NotImplementedError('Multiple TSV inputs')

        train_data = TsvSequence(args.train_data[0], tokenizer, label_map,
                                global_batch_size, args)
        input_format = 'tsv'
    elif args.train_data[0].endswith('.tfrecord'):
        train_data = train_tfrecord_input(args.train_data, args.max_seq_length,
                                          global_batch_size)
        input_format = 'tfrecord'
    else:
        raise ValueError('--train_data must be .tsv or .tfrecord')

    # Optional in-memory dev set used both as fit() validation data and for
    # the final accuracy report below.
    if args.dev_data is None:
        dev_x, dev_y = None, None
        validation_data = None
    else:
        dev_x, dev_y = load_dataset(args.dev_data, tokenizer,
                                    args.max_seq_length,
                                    label_map, args)
        validation_data = (dev_x, dev_y)

    print('Number of devices: {}'.format(num_devices), file=sys.stderr, 
          flush=True)
    if num_devices > 1 and input_format != 'tfrecord':
        warning('TFRecord input recommended for multi-device training')

    num_train_examples = num_examples(args.train_data)
    num_labels = len(label_list)
    print('num_train_examples: {}'.format(num_train_examples),
          file=sys.stderr, flush=True)

    # Create/restore the model inside the strategy scope so its variables
    # are mirrored across replicas.
    with strategy.scope():
        model = restore_or_create_model(num_train_examples, num_labels, 
                                        global_batch_size, args)
    model.summary(print_fn=print)

    # Periodic checkpoints, pruned down to the most recent max_checkpoints.
    callbacks = []
    if args.checkpoint_steps is not None:
        callbacks.append(ModelCheckpoint(
            filepath=os.path.join(args.checkpoint_dir, CHECKPOINT_NAME),
            save_freq=args.checkpoint_steps
        ))
        callbacks.append(DeleteOldCheckpoints(
            args.checkpoint_dir, CHECKPOINT_NAME, args.max_checkpoints
        ))

    # TSV input is a Keras Sequence and uses worker threads; tfrecord input
    # needs an explicit steps_per_epoch (presumably the dataset repeats —
    # confirm in train_tfrecord_input).
    if input_format == 'tsv':
        other_args = {
            'workers': 10,    # TODO
        }
    else:
        assert input_format == 'tfrecord', 'internal error'
        steps_per_epoch = int(np.ceil(num_train_examples/global_batch_size))
        other_args = {
            'steps_per_epoch': steps_per_epoch
        }

    model.fit(
        train_data,
        epochs=args.num_train_epochs,
        callbacks=callbacks,
        validation_data=validation_data,
        validation_batch_size=global_batch_size,
        **other_args
    )

    # Final dev-set accuracy from argmax over the predicted probabilities.
    if validation_data is not None:
        probs = model.predict(dev_x, batch_size=global_batch_size)
        preds = np.argmax(probs, axis=-1)
        correct, total = sum(g==p for g, p in zip(dev_y, preds)), len(dev_y)
        print('Final dev accuracy: {:.1%} ({}/{})'.format(
            correct/total, correct, total))

    if args.model_dir is not None:
        print('Saving model in {}'.format(args.model_dir))
        save_model_etc(model, tokenizer, label_list, args)

    return 0
Example #10
0
def main(argv):
    """Load the tagger named by --ner_model_dir and serve it on port 8080."""
    options = argument_parser('serve').parse_args(argv[1:])
    app.tagger = Tagger.load(options.ner_model_dir)
    app.run(port=8080)
    return 0
Example #11
0
def main(argv):
    """Train (or load) a NER model and evaluate sentence-combination
    prediction strategies, writing per-method conlleval results under
    output/ and a summary CSV under ./results/.

    Bug fix: `m_names.extend(method_names)` previously ran inside the
    per-method loop, adding four names per iteration while `results`
    gained only one row per method, so the summary CSV paired result rows
    with the wrong method names. The extend now happens exactly once.
    """
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)

    print(args.no_context)

    # Three mutually exclusive ways of packing sentences into model inputs.
    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer,
                                        seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer,
                                       seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(
            train_words, train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(
            test_words, test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids,
                                  tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids,
                                 tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer,
                                       seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer,
                                      seq_len, args.predict_position)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels,
                                          tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels,
                                        tag_map, seq_len)

    if args.use_ner_model and (args.ner_model_dir is not None):
        # Reuse a previously trained model; it carries its own tokenizer
        # and label set.
        ner_model, tokenizer, labels, config = load_ner_model(
            args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model

        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
            )

        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
            )
        if args.ner_model_dir is not None:
            # Persist labels in index order for consistent reloading.
            label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
            save_ner_model(ner_model, tokenizer, label_list, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    results = []   # one [precision, recall, fscore] row per evaluation
    m_names = []   # method name for the corresponding row in `results`
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')
        ensemble = []
        for i, pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
            )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        overall = conlleval.metrics(c)[0]
        results.append([overall.prec, overall.rec, overall.fscore])
    else:
        # First tag then vote
        pr_ensemble, pr_test_first = get_predictions(
            preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(
            probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: Documentwise CMV
            # D-CMVP: Documentwise CMV, probs summed, argmax after that
            # D-F: Documentwise First
            # D-FP: Same as D-F, probability-based
            method_names = ['D-CMV','D-CMVP','D-F','D-FP']
        else:
            method_names = ['CMV','CMVP','F','FP']
        # FIX: extend once, outside the loop, so m_names stays aligned with
        # the single row appended to `results` per method below.
        m_names.extend(method_names)
        for i, ensem in enumerate(ens):
            ensemble = []
            for j, pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file,
                                                    method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                    output_file, test_data.words, test_data.lengths,
                    test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            overall = conlleval.metrics(c)[0]
            results.append([overall.prec, overall.rec, overall.fscore])

    if args.sentence_in_context:
        # Evaluate predictions made from varying starting positions inside
        # a combined-sentence context window.
        starting_pos = np.arange(0, seq_len + 1, 32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(
                test_data.tokens, test_data.labels, seq_len - 1, start_p - 1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)

            pred_tags = []
            for i, pred in enumerate(preds):
                # Locate sentence i inside its combined line and slice out
                # its predictions (offset by 1 for the leading special
                # token).
                idx = line_nos[i].index(i)
                start = line_starts[i][idx] + 1
                end = start + len(test_data.tokens[i])
                pred_tags.append([inv_tag_map[t] for t in pred[start:end]])

            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            overall = conlleval.metrics(c)[0]
            results.append([overall.prec, overall.rec, overall.fscore])

    # Summary CSV: one line per evaluation, run parameters + method name +
    # precision/recall/f-score.
    result_file = "./results/results-{}.csv".format(args.output_file)
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(args.output_file,
                                            args.max_seq_length,
                                            args.bert_config_file,
                                            args.num_train_epochs,
                                            args.learning_rate,
                                            args.batch_size,
                                            args.predict_position,
                                            args.train_data,
                                            args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n')

    for i in results:
        print(i)
    return 0