Example #1
def annotate(output_format,
             dataset_type='conll2003',
             lang='en',
             architecture='BidLSTM_CRF',
             use_ELMo=False,
             use_BERT=False,
             file_in=None,
             file_out=None):
    if file_in is None or not os.path.isfile(file_in):
        raise ValueError("the provided input file is not valid")
    annotations = []

    if (dataset_type == 'conll2003') and (lang == 'en'):
        # load model
        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture
        model = Sequence(model_name)
        model.load()

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        # load model
        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture
        model = Sequence(model_name)
        model.load()

    elif (lang == 'fr'):
        model_name = 'ner-fr-lemonde'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture
        model = Sequence(model_name)
        model.load()
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    start_time = time.time()

    model.tag_file(file_in=file_in,
                   output_format=output_format,
                   file_out=file_out)
    runtime = round(time.time() - start_time, 3)

    print("runtime: %s seconds " % (runtime))
Example #2
def eval(dataset_type='conll2003',
         lang='en',
         architecture='BidLSTM_CRF',
         use_ELMo=False,
         use_BERT=False,
         data_path=None):

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL-2003 NER data...')
        x_test, y_test = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_eval=x_test, y_eval=y_test)

        # load model
        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture
        model = Sequence(model_name)
        model.load()

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')

        x_test, y_test = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_eval=x_test, y_eval=y_test)

        # load model
        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture
        model = Sequence(model_name)
        model.load()

    else:
        print("dataset/language combination is not supported for fixed eval:",
              dataset_type, lang)
        return

    start_time = time.time()

    print("\nEvaluation on test set:")
    model.eval(x_test, y_test)
    runtime = round(time.time() - start_time, 3)

    print("runtime: %s seconds " % (runtime))
Example #3
def annotate(texts,
             output_format,
             architecture='BidLSTM_CRF',
             transformer=None,
             use_ELMo=False):
    annotations = []

    model_name = 'insult-' + architecture
    if use_ELMo:
        model_name += '-with_ELMo'

    # load model
    model = Sequence(model_name,
                     architecture=architecture,
                     transformer_name=transformer,
                     use_ELMo=use_ELMo)
    model.load()

    start_time = time.time()

    annotations = model.tag(texts, output_format)
    runtime = round(time.time() - start_time, 3)

    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
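A hedged call of the annotate function above on an in-memory list of texts; the sample sentences are made up, and the pretrained 'insult-BidLSTM_CRF' model is assumed to be available:

texts = ["This is a stupid comment.", "Have a nice day."]
result = annotate(texts, "json", architecture='BidLSTM_CRF')
print(result)  # with 'json', the runtime is attached to the result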
Example #4
def eval_(model, use_ELMo=False, input_path=None):
    print('Loading data...')
    if input_path is None:
        # this should never happen: an evaluation data file is required
        print("A Grobid evaluation data file must be specified to evaluate a grobid model for the eval action")
        return
    x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    print(len(x_all), 'evaluation sequences')

    model_name = 'grobid-' + model

    if use_ELMo:
        model_name += '-with_ELMo'

    start_time = time.time()

    # load the model
    model = Sequence(model_name)
    model.load()

    # evaluation
    print("\nEvaluation:")
    model.eval(x_all, y_all, features=f_all)

    runtime = round(time.time() - start_time, 3)
    print("Evaluation runtime: %s seconds " % (runtime))
Example #5
def annotate_text(texts,
                  model,
                  output_format,
                  use_ELMo=False,
                  architecture='BidLSTM_CRF',
                  features=None):
    annotations = []

    # load model
    model_name = 'grobid-' + model
    model_name += '-' + architecture

    if use_ELMo and 'bert' not in model.lower():
        model_name += '-with_ELMo'

    model = Sequence(model_name)
    model.load()

    start_time = time.time()

    annotations = model.tag(texts, output_format, features=features)
    runtime = round(time.time() - start_time, 3)

    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
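A hedged usage sketch for annotate_text above; 'date' is assumed to be one of the available grobid models, and the sample string is made up:

# assumes the pretrained 'grobid-date-BidLSTM_CRF' model is available
texts = ["January 2006"]
result = annotate_text(texts, "date", "json")
print(result)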
Example #6
def eval_(model, use_ELMo=False, input_path=None, architecture='BidLSTM_CRF'):
    print('Loading data...')
    if input_path is None:
        # this should never happen: an evaluation data file is required
        print(
            "A Grobid evaluation data file must be specified to evaluate a grobid model for the eval action; use the parameter --input"
        )
        return
    x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    print(len(x_all), 'evaluation sequences')

    model_name = 'grobid-' + model
    model_name += '-' + architecture

    if use_ELMo and 'bert' not in model.lower():
        model_name += '-with_ELMo'

    start_time = time.time()

    # load the model
    model = Sequence(model_name)
    model.load()

    # evaluation
    print("\nEvaluation:")
    model.eval(x_all, y_all, features=f_all)

    runtime = round(time.time() - start_time, 3)
    print("Evaluation runtime: %s seconds " % (runtime))
Example #7
def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None,
               input_path=None, output_path=None, fold_count=1,
               features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):
    print('Loading data...')
    if input_path is None:
        x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    x_train_all, x_eval, y_train_all, y_eval, f_train_all, f_eval = train_test_split(x_all, y_all, f_all, test_size=0.1, shuffle=True)
    x_train, x_valid, y_train, y_valid, f_train, f_valid = train_test_split(x_train_all, y_train_all, f_train_all, test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')
    print(len(x_eval), 'evaluation sequences')

    batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop = configure(model, 
                                                                            architecture, 
                                                                            output_path, 
                                                                            max_sequence_length, 
                                                                            batch_size, 
                                                                            embeddings_name,
                                                                            max_epoch,
                                                                            use_ELMo)
    model = Sequence(model_name,
                    recurrent_dropout=0.50,
                    embeddings_name=embeddings_name,
                    architecture=architecture,
                    transformer_name=transformer,
                    max_sequence_length=max_sequence_length,
                    batch_size=batch_size,
                    fold_number=fold_count,
                    features_indices=features_indices,
                    max_epoch=max_epoch, 
                    use_ELMo=use_ELMo,
                    multiprocessing=multiprocessing,
                    early_stop=early_stop)

    start_time = time.time()

    if fold_count == 1:
        model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
    else:
        model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)

    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % runtime)

    # evaluation
    print("\nEvaluation:")
    model.eval(x_eval, y_eval, features=f_eval)

    # saving the model (must be called after eval for multiple fold training)
    if output_path:
        model.save(output_path)
    else:
        model.save()
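A hedged call sketch for train_eval above; the grobid model name and embeddings name are assumptions, and when no input_path is given the default training file under data/sequenceLabelling/grobid/ must exist:

# 'date' and 'glove-840B' are assumptions; adjust to the models/embeddings actually installed
train_eval('date',
           embeddings_name='glove-840B',
           architecture='BidLSTM_CRF',
           fold_count=1)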
Example #8
    def __init__(self):
        from delft.sequenceLabelling import Sequence
        from delft.sequenceLabelling.models import BidLSTM_CRF

        self.model = Sequence("material", BidLSTM_CRF.name)
        self.model.load(dir_path="./models")

        self.mp = MaterialParser(pubchem_lookup=False, verbose=True)
        self.regex_separators = re.compile(r',|;|or|and')
Example #9
    def __init__(self, model_path=None):
        from delft.sequenceLabelling import Sequence
        from delft.sequenceLabelling.models import BidLSTM_CRF

        self.model = Sequence("materialNER_fastText_oS+Sm-BidLSTM_CRF",
                              BidLSTM_CRF.name)
        if model_path and os.path.exists(model_path):
            self.model.load(dir_path=model_path)
        else:
            self.model.load(dir_path="./models")
Example #10
def train_eval(model, embeddings_name, architecture='BidLSTM_CRF', use_ELMo=False,
               input_path=None, output_path=None, fold_count=1,
               features_indices=None):
    print('Loading data...')
    if input_path is None:
        x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    x_train_all, x_eval, y_train_all, y_eval, f_train_all, f_eval = train_test_split(x_all, y_all, f_all, test_size=0.1)
    x_train, x_valid, y_train, y_valid, f_train, f_valid = train_test_split(x_train_all, y_train_all, f_train_all, test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')
    print(len(x_eval), 'evaluation sequences')

    batch_size, max_sequence_length, model_name = configure(model, architecture, output_path, use_ELMo)

    model = Sequence(model_name,
                    max_epoch=100,
                    recurrent_dropout=0.50,
                    embeddings_name=embeddings_name,
                    model_type=architecture,
                    use_ELMo=use_ELMo,
                    max_sequence_length=max_sequence_length,
                    batch_size=batch_size,
                    fold_number=fold_count,
                    features_indices=features_indices)

    start_time = time.time()

    if fold_count == 1:
        model.train(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid)
    else:
        model.train_nfold(x_train, y_train, f_train=f_train, x_valid=x_valid, y_valid=y_valid, f_valid=f_valid, fold_number=fold_count)

    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % runtime)

    # evaluation
    print("\nEvaluation:")
    model.eval(x_eval, y_eval, features=f_eval)

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
Example #11
def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None,
          features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False):

    print('Loading data...')
    if input_path is None:
        x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    print(len(x_all), 'total sequences')

    x_train, x_valid, y_train, y_valid, f_train, f_valid = train_test_split(x_all, y_all, f_all, test_size=0.1, shuffle=True)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop = configure(model,
                                                                            architecture,
                                                                            output_path,
                                                                            max_sequence_length,
                                                                            batch_size,
                                                                            embeddings_name,
                                                                            max_epoch,
                                                                            use_ELMo)
    model = Sequence(model_name,
                     recurrent_dropout=0.50,
                     embeddings_name=embeddings_name,
                     architecture=architecture,
                     transformer_name=transformer,
                     batch_size=batch_size,
                     max_sequence_length=max_sequence_length,
                     features_indices=features_indices,
                     max_epoch=max_epoch, 
                     use_ELMo=use_ELMo,
                     multiprocessing=multiprocessing,
                     early_stop=early_stop)

    start_time = time.time()
    model.train(x_train, y_train, f_train, x_valid, y_valid, f_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
Example #12
def annotate(texts, output_format, architecture='BidLSTM_CRF'):
    annotations = []

    # load model
    model = Sequence('insult', architecture=architecture)
    model.load()

    start_time = time.time()

    annotations = model.tag(texts, output_format)
    runtime = round(time.time() - start_time, 3)

    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
Example #13
def train(model,
          embeddings_name,
          architecture='BidLSTM_CRF',
          use_ELMo=False,
          input_path=None,
          output_path=None):
    print('Loading data...')
    if input_path is None:
        x_all, y_all, f_all = load_data_and_labels_crf_file(
            'data/sequenceLabelling/grobid/' + model + '/' + model +
            '-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)
    x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                          y_all,
                                                          test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    if output_path:
        model_name = model
    else:
        model_name = 'grobid-' + model

    if use_ELMo:
        model_name += '-with_ELMo'

    model = Sequence(model_name,
                     max_epoch=100,
                     recurrent_dropout=0.50,
                     embeddings_name=embeddings_name,
                     model_type=architecture,
                     use_ELMo=use_ELMo)

    start_time = time.time()
    model.train(x_train, y_train, x_valid, y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
Example #14
def train(embeddings_name, architecture='BidLSTM_CRF'): 
    root = os.path.join(os.path.dirname(__file__), 'data/sequenceLabelling/toxic/')

    train_path = os.path.join(root, 'corrected.xml')
    valid_path = os.path.join(root, 'valid.xml')

    print('Loading data...')
    x_train, y_train = load_data_and_labels_xml_file(train_path)
    x_valid, y_valid = load_data_and_labels_xml_file(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    model = Sequence('insult', max_epoch=50, embeddings_name=embeddings_name)
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    print('training done')

    # saving the model
    model.save()
Example #15
def annotate_text(texts, model, output_format, use_ELMo=False):
    annotations = []

    # load model
    model_name = 'grobid-'+model
    if use_ELMo:
        model_name += '-with_ELMo'
    model = Sequence(model_name)
    model.load()

    start_time = time.time()

    annotations = model.tag(texts, output_format)
    runtime = round(time.time() - start_time, 3)

    if output_format == 'json':
        annotations["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return annotations
Example #16
def train(embeddings_name=None,
          architecture='BidLSTM_CRF',
          transformer=None,
          use_ELMo=False):
    batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(
        architecture, embeddings_name)

    root = 'data/sequenceLabelling/toxic/'

    train_path = os.path.join(root, 'corrected.xml')
    valid_path = os.path.join(root, 'valid.xml')

    print('Loading data...')
    x_train, y_train = load_data_and_labels_xml_file(train_path)
    x_valid, y_valid = load_data_and_labels_xml_file(valid_path)
    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    model_name = 'insult-' + architecture
    if use_ELMo:
        model_name += '-with_ELMo'

    model = Sequence(model_name,
                     max_epoch=max_epoch,
                     batch_size=batch_size,
                     max_sequence_length=maxlen,
                     embeddings_name=embeddings_name,
                     architecture=architecture,
                     patience=patience,
                     early_stop=early_stop,
                     transformer_name=transformer,
                     use_ELMo=use_ELMo)
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    print('training done')

    # saving the model (must be called after eval for multiple fold training)
    model.save()
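A hedged call of the train function above; the embeddings name is an assumption and must match an entry in the local DeLFT embeddings registry, and the toxic/insult XML training data must be present:

train(embeddings_name='glove-840B', architecture='BidLSTM_CRF')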
Example #17
def train(dataset_type='conll2003',
          lang='en',
          embeddings_name=None,
          architecture='BidLSTM_CRF',
          transformer=None,
          data_path=None,
          use_ELMo=False):

    batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \
        configure(architecture, dataset_type, lang, embeddings_name, use_ELMo)

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        x_train1, y_train1 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_train3, y_train3 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')

        # we concatenate all sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                              y_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2003-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name,
                         architecture=architecture,
                         transformer_name=transformer,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         early_stop=early_stop,
                         patience=patience,
                         max_sequence_length=max_sequence_length,
                         use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')

        x_train1, y_train1 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_train3, y_train3 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')

        # we concatenate train, dev and test sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                              y_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2012-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name,
                         architecture=architecture,
                         transformer_name=transformer,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         early_stop=early_stop,
                         patience=patience,
                         max_sequence_length=max_sequence_length,
                         use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (lang == 'fr'):
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
        x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                              y_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-fr-lemonde-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name,
                         architecture=architecture,
                         transformer_name=transformer,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         early_stop=early_stop,
                         patience=patience,
                         max_sequence_length=max_sequence_length,
                         use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    #elif (dataset_type == 'ontonotes') and (lang == 'en'):
    #    model = sequenceLabelling.Sequence('ner-en-ontonotes', max_epoch=60, embeddings_name=embeddings_name)
    #elif (lang == 'fr'):
    #    model = sequenceLabelling.Sequence('ner-fr-lemonde', max_epoch=60, embeddings_name=embeddings_name)

    start_time = time.time()
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    model.save()
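A hedged call of the NER train function above, assuming the CoNLL-2003 files are under data/sequenceLabelling/CoNLL-2003/ and 'glove-840B' is a registered embedding:

# dataset files and embedding name are assumptions about the local setup
train(dataset_type='conll2003',
      lang='en',
      embeddings_name='glove-840B',
      architecture='BidLSTM_CRF')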
Example #18
def train_eval(embeddings_name=None,
               dataset_type='conll2003',
               lang='en',
               architecture='BidLSTM_CRF',
               transformer=None,
               fold_count=1,
               train_with_validation_set=False,
               data_path=None,
               use_ELMo=False):

    batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \
        configure(architecture, dataset_type, lang, embeddings_name, use_ELMo)

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2003-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             early_stop=True,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must be set beforehand),
            # as in (Chiu & Nichols, 2016) and (Peters et al., 2017)
            # this obviously leads to much higher results (~ +0.5 F1 on CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             early_stop=False,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)

    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        print(
            "Loading all Ontonotes 5.0 XML data, evaluation will be on a 10% random partition"
        )
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(
            x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,
                                                              y_train_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-ontonotes-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name,
                         fold_number=fold_count,
                         architecture=architecture,
                         transformer_name=transformer,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         early_stop=early_stop,
                         patience=patience,
                         max_sequence_length=max_sequence_length,
                         use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')

        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2012-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             early_stop=True,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must be set beforehand),
            # as in (Chiu & Nichols, 2016) and (Peters et al., 2017)
            # this obviously leads to much higher results
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             early_stop=False,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)

    elif (lang == 'fr') and (dataset_type == 'ftb' or dataset_type is None):
        print('Loading data for ftb...')
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(
            x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,
                                                              y_train_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name,
                         fold_number=fold_count,
                         architecture=architecture,
                         transformer_name=transformer,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         early_stop=early_stop,
                         patience=patience,
                         max_sequence_length=max_sequence_length,
                         use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)
    elif (lang == 'fr') and (dataset_type == 'ftb_force_split'):
        print('Loading data for ftb_force_split...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/leMonde/ftb6_train.conll')
        shuffle_arrays([x_train, y_train])
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/leMonde/ftb6_dev.conll')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/leMonde/ftb6_test.conll')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-force-split-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             early_stop=True,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must be set beforehand),
            # as in (Chiu & Nichols, 2016) and (Peters et al., 2017)
            # this obviously leads to much higher results (~ +0.5 F1 on CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             early_stop=False,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
    elif (lang == 'fr') and (dataset_type == 'ftb_force_split_xml'):
        print('Loading data for ftb_force_split_xml...')
        x_train, y_train = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.train.xml'
        )
        shuffle_arrays([x_train, y_train])
        x_valid, y_valid = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.dev.xml')
        x_eval, y_eval = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.test.xml'
        )
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-force-split-xml-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             early_stop=True,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must be set beforehand),
            # as in (Chiu & Nichols, 2016) and (Peters et al., 2017)
            # this obviously leads to much higher results (~ +0.5 F1 on CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name,
                             early_stop=False,
                             fold_number=fold_count,
                             architecture=architecture,
                             transformer_name=transformer,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             patience=patience,
                             max_sequence_length=max_sequence_length,
                             use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model (must be called after eval for multiple fold training)
    model.save()
Example #19
def train(embeddings_name=None,
          architecture='BidLSTM_CRF',
          transformer=None,
          input_path=None,
          output_path=None,
          fold_count=1,
          features_indices=None,
          max_sequence_length=-1,
          batch_size=-1,
          max_epoch=-1,
          use_ELMo=False):
    print('Loading data...')
    if input_path is None:
        x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = []
        dataseer_sentences_path = "data/sequenceLabelling/datasets/dataseer_sentences.json"
        if os.path.exists(dataseer_sentences_path):
            x_all1, y_all1 = load_data_and_labels_json_offsets(
                dataseer_sentences_path)
        ner_dataset_recognition_sentences_path = "data/sequenceLabelling/datasets/ner_dataset_recognition_sentences.json"
        if os.path.exists(ner_dataset_recognition_sentences_path):
            x_all2, y_all2 = load_data_and_labels_json_offsets(
                ner_dataset_recognition_sentences_path)
        coleridge_sentences_path = "data/sequenceLabelling/datasets/coleridge_sentences.json.gz"
        if os.path.exists(coleridge_sentences_path):
            x_all3, y_all3 = load_data_and_labels_json_offsets(
                coleridge_sentences_path)
        x_all = np.concatenate((x_all1, x_all2, x_all3[:1000]), axis=0)
        y_all = np.concatenate((y_all1, y_all2, y_all3[:1000]), axis=0)
    else:
        x_all, y_all = load_data_and_labels_json_offsets(input_path)

    x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                          y_all,
                                                          test_size=0.1,
                                                          shuffle=True)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')

    batch_size, max_sequence_length, model_name, embeddings_name, max_epoch, multiprocessing, early_stop = configure(
        architecture, output_path, max_sequence_length, batch_size,
        embeddings_name, max_epoch, use_ELMo)
    model = Sequence(model_name,
                     recurrent_dropout=0.50,
                     embeddings_name=embeddings_name,
                     architecture=architecture,
                     transformer_name=transformer,
                     max_sequence_length=max_sequence_length,
                     batch_size=batch_size,
                     fold_number=fold_count,
                     features_indices=features_indices,
                     max_epoch=max_epoch,
                     use_ELMo=use_ELMo,
                     multiprocessing=multiprocessing,
                     early_stop=early_stop)

    start_time = time.time()
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)

    print("training runtime: %s seconds " % runtime)

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
Example #20
def train_eval(embedding_name,
               dataset_type='conll2003',
               lang='en',
               architecture='BidLSTM_CRF',
               fold_count=1,
               train_with_validation_set=False,
               use_ELMo=False,
               use_BERT=False,
               data_path=None):

    if (architecture == "BidLSTM_CNN_CRF"):
        word_lstm_units = 200
        max_epoch = 30
        recurrent_dropout = 0.5
    else:
        word_lstm_units = 100
        max_epoch = 25
        recurrent_dropout = 0.5

    if use_ELMo or use_BERT:
        batch_size = 120
    else:
        batch_size = 20

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        if not train_with_validation_set:
            # restrict training on train set, use validation set for early stop, as in most papers
            model = Sequence(model_name,
                             max_epoch=60,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name,
                             early_stop=True,
                             fold_number=fold_count,
                             model_type=architecture,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             use_ELMo=use_ELMo,
                             use_BERT=use_BERT)
        else:
            # also use the validation set for training (no early stop, hyperparameters must be set beforehand),
            # as in (Chiu & Nichols, 2016) and (Peters et al., 2017)
            # this obviously leads to much higher results (~ +0.5 F1 on CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name,
                             early_stop=False,
                             fold_number=fold_count,
                             model_type=architecture,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             use_ELMo=use_ELMo,
                             use_BERT=use_BERT)

    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        print('Loading Ontonotes 5.0 XML data...')
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(
            x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,
                                                              y_train_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-ontonotes'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name,
                         early_stop=True,
                         fold_number=fold_count,
                         model_type=architecture,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         use_ELMo=use_ELMo,
                         use_BERT=use_BERT)

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')

        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        if not train_with_validation_set:
            model = Sequence(model_name,
                             max_epoch=80,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name,
                             early_stop=True,
                             fold_number=fold_count,
                             model_type=architecture,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             use_ELMo=use_ELMo,
                             use_BERT=use_BERT)
        else:
            # also use the validation set for training (no early stop, hyperparameters must be set beforehand),
            # as in (Chiu & Nichols, 2016) and (Peters et al., 2017)
            # this obviously leads to much higher results
            model = Sequence(model_name,
                             max_epoch=40,
                             recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name,
                             early_stop=False,
                             fold_number=fold_count,
                             model_type=architecture,
                             word_lstm_units=word_lstm_units,
                             batch_size=batch_size,
                             use_ELMo=use_ELMo,
                             use_BERT=use_BERT)

    elif (lang == 'fr'):
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(
            x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,
                                                              y_train_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde'
        if use_ELMo:
            model_name += '-with_ELMo'
            # custom batch size for French ELMo
            batch_size = 20
        elif use_BERT:
            # need to find a French BERT :/
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name,
                         early_stop=True,
                         fold_number=fold_count,
                         model_type=architecture,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         use_ELMo=use_ELMo,
                         use_BERT=use_BERT)
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid, y_valid)
    else:
        model.train_nfold(x_train,
                          y_train,
                          x_valid,
                          y_valid,
                          fold_number=fold_count)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model
    model.save()
Example #21
def train(embedding_name,
          dataset_type='conll2003',
          lang='en',
          architecture='BidLSTM_CRF',
          use_ELMo=False,
          use_BERT=False,
          data_path=None):

    if (architecture == "BidLSTM_CNN_CRF"):
        word_lstm_units = 200
        recurrent_dropout = 0.5
    else:
        word_lstm_units = 100
        recurrent_dropout = 0.5

    if use_ELMo:
        batch_size = 100
    else:
        batch_size = 20

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        x_train1, y_train1 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_train3, y_train3 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')

        # we concatenate all sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                              y_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name,
                         model_type=architecture,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         use_ELMo=use_ELMo,
                         use_BERT=use_BERT)
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')

        x_train1, y_train1 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_train3, y_train3 = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')

        # we concatenate train, dev and test sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # split train and valid sets in a random way
        x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                              y_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=80,
                         recurrent_dropout=0.20,
                         embeddings_name=embedding_name,
                         early_stop=True,
                         model_type=architecture,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         use_ELMo=use_ELMo,
                         use_BERT=use_BERT)
    elif (lang == 'fr'):
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                              y_all,
                                                              test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-fr-lemonde'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60,
                         recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name,
                         model_type=architecture,
                         word_lstm_units=word_lstm_units,
                         batch_size=batch_size,
                         use_ELMo=use_ELMo,
                         use_BERT=use_BERT)
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    #elif (dataset_type == 'ontonotes') and (lang == 'en'):
    #    model = sequenceLabelling.Sequence('ner-en-ontonotes', max_epoch=60, embeddings_name=embedding_name)
    #elif (lang == 'fr'):
    #    model = sequenceLabelling.Sequence('ner-fr-lemonde', max_epoch=60, embeddings_name=embedding_name)

    start_time = time.time()
    model.train(x_train, y_train, x_valid, y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    model.save()
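
The code above is the tail of a training entry point whose def line falls outside this excerpt. A minimal usage sketch, assuming the enclosing function is named train and accepts the keyword arguments referenced in its body (the function name and the embedding identifier are assumptions, not confirmed by the snippet):

if __name__ == '__main__':
    # hypothetical invocation; 'glove-840B' stands in for whatever embeddings
    # are registered in the local embedding registry
    train(dataset_type='conll2003',
          lang='en',
          embedding_name='glove-840B',
          architecture='BidLSTM_CRF',
          use_ELMo=False,
          use_BERT=False)
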
Beispiel #22
0
def migrate():
    # grobid models
    for grobid_model in GROBID_MODEL:
        model_name = 'grobid-' + grobid_model

        # load the model
        """
        print(os.path.join(DATA_PATH, model_name))
        if os.path.isdir(os.path.join(DATA_PATH, model_name)):
            model = Sequence(model_name)
            model.load()
            model.save()
        """

        # with ELMo
        """
        if os.path.isdir(os.path.join(DATA_PATH, model_name+'-with_ELMo')):
            model = Sequence(model_name+'-with_ELMo')
            model.load()
            model.save()
        """

    # insult model
    """
    model = Sequence('insult')
    model.load()
    model.save()
    """

    for en_model in NER_MODELS_EN:
        for architecture in ARCHITECTURE:
            """
            model_name = 'ner-en-' + en_model
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
            """
            """
            model_name = 'ner-en-' + en_model
            model_name += '-with_ELMo'
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()

            """
            """
            model_name = 'ner-en-' + en_model
            model_name += '-with_BERT'
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
            """

    for fr_model in NER_MODELS_FR:
        for architecture in ARCHITECTURE:

            model_name = 'ner-fr-' + fr_model
            model_name += '-' + architecture
            print(os.path.join(DATA_PATH, model_name))
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()

            model_name = 'ner-fr-' + fr_model
            model_name += '-with_ELMo'
            model_name += '-' + architecture
            if os.path.isdir(os.path.join(DATA_PATH, model_name)):
                model = Sequence(model_name)
                model.load()
                model.save()
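
migrate() iterates over several module-level constants (DATA_PATH, GROBID_MODEL, NER_MODELS_EN, NER_MODELS_FR, ARCHITECTURE) that are defined elsewhere in the source file. A plausible sketch with purely illustrative values, so the loop bodies above can be read in context:

# illustrative values only; the real constants are defined elsewhere in the module
DATA_PATH = 'data/models/sequenceLabelling'
GROBID_MODEL = ['date', 'citation', 'header', 'affiliation-address', 'software']
NER_MODELS_EN = ['conll2003', 'conll2012']
NER_MODELS_FR = ['lemonde']
ARCHITECTURE = ['BidLSTM_CRF', 'BidLSTM_CNN_CRF']
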
Beispiel #23
0
def train_eval(model,
               embeddings_name,
               architecture='BidLSTM_CRF',
               use_ELMo=False,
               input_path=None,
               output_path=None,
               fold_count=1):
    print('Loading data...')
    if input_path is None:
        x_all, y_all, f_all = load_data_and_labels_crf_file(
            'data/sequenceLabelling/grobid/' + model + '/' + model +
            '-060518.train')
    else:
        x_all, y_all, f_all = load_data_and_labels_crf_file(input_path)

    x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all,
                                                                y_all,
                                                                test_size=0.1)
    x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,
                                                          y_train_all,
                                                          test_size=0.1)

    print(len(x_train), 'train sequences')
    print(len(x_valid), 'validation sequences')
    print(len(x_eval), 'evaluation sequences')

    if output_path:
        model_name = model
    else:
        model_name = 'grobid-' + model

    batch_size = 20
    max_sequence_length = 3000

    if use_ELMo:
        model_name += '-with_ELMo'
        # with ELMo, the long 'software' sequences need a much smaller batch (presumably for memory)
        if model_name in ('software-with_ELMo', 'grobid-software-with_ELMo'):
            batch_size = 3

    model = Sequence(model_name,
                     max_epoch=100,
                     recurrent_dropout=0.50,
                     embeddings_name=embeddings_name,
                     model_type=architecture,
                     use_ELMo=use_ELMo,
                     max_sequence_length=max_sequence_length,
                     batch_size=batch_size,
                     fold_number=fold_count)

    start_time = time.time()

    if fold_count == 1:
        model.train(x_train, y_train, x_valid, y_valid)
    else:
        model.train_nfold(x_train,
                          y_train,
                          x_valid,
                          y_valid,
                          fold_number=fold_count)

    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # evaluation
    print("\nEvaluation:")
    model.eval(x_eval, y_eval)

    # saving the model
    if output_path:
        model.save(output_path)
    else:
        model.save()
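
A minimal usage sketch for train_eval; the model identifier and embedding name below are illustrative, and running it requires the corresponding training data under data/sequenceLabelling/grobid/:

if __name__ == '__main__':
    # hypothetical call: train and evaluate the GROBID 'date' model with a 10% held-out split
    train_eval('date',
               embeddings_name='glove-840B',
               architecture='BidLSTM_CRF',
               use_ELMo=False,
               fold_count=1)
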
def run_eval_txt(xml_repo_path, model, nb_threads=1, use_ELMo=False):

    # load the trained model
    model_name = 'grobid-' + model
    if use_ELMo:
        model_name += '-with_ELMo'

    model = Sequence(model_name)
    model.load()

    if not use_ELMo:
        model.model_config.batch_size = 200

    start_time = time.time()

    # collect paragraph texts from the TEI/XML files
    texts = []
    nb_texts = 0
    nb_tokens = 0
    nb_files = 0
    for (dirpath, dirnames, filenames) in os.walk(xml_repo_path):
        for filename in filenames:
            if filename.endswith('.xml') or filename.endswith('.tei'):
                #try:
                tree = ET.parse(os.path.join(dirpath, filename))
                #except:
                #    print("XML parsing error with", filename)
                for paragraph in tree.findall(
                        ".//{http://www.tei-c.org/ns/1.0}p"):
                    #texts.append(paragraph.text)
                    text = ET.tostring(paragraph,
                                       encoding='utf-8',
                                       method='text').decode('utf-8')
                    text = text.replace("\n", " ")
                    text = text.replace("\t", " ")
                    # collapse runs of spaces left over from the newline/tab replacement
                    text = re.sub(r'( )+', ' ', text.strip())
                    texts.append(text)
                    nb_texts += 1
                    nb_tokens += len(pattern.split(text))
                    if len(texts) == model.model_config.batch_size:
                        process_batch_txt(texts, model, nb_threads)
                        texts = []
                nb_files += 1
                # stop after ~50 files in the current directory (only exits the inner loop)
                if nb_files > 50:
                    break
    # last batch
    if len(texts) > 0:
        process_batch_txt(texts, model, nb_threads)

    print("-----------------------------")
    print("nb xml files:", nb_files)
    print("nb texts:", nb_texts)
    print("nb tokens:", nb_tokens)

    runtime = round(time.time() - start_time, 4)
    print("-----------------------------")
    print("total runtime: %s seconds " % (runtime))
    print("-----------------------------")
    print("xml files/s:\t {:.4f}".format(nb_files / runtime))
    print("    texts/s:\t {:.4f}".format(nb_texts / runtime))
    print("   tokens/s:\t {:.4f}".format(nb_tokens / runtime))