Python BERT_NERの例

プログラミング言語: Python

名前空間/パッケージ名: deep_ner.bert_ner

クラス/型: BERT_NER

hotexamples.comのコード掲載数: 5

Python BERT_NER - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのdeep_ner.bert_ner.BERT_NERの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

BERT_NER(3)

fit(3)

predict(3)

よく使われるメソッド

BERT_NER (3)

fit (3)

predict (3)

コード例 #1

ファイルを表示

def train(factrueval2016_devset_dir: str, split_by_paragraphs: bool,
          bert_will_be_tuned: bool, lstm_layer_size: Union[int, None],
          max_epochs: int, batch_size: int, gpu_memory_frac: float,
          model_name: str) -> BERT_NER:
    """Load a pickled recognizer from `model_name`, or fit and save a new one.

    If `model_name` is an existing file, it is unpickled and returned without
    any training. Otherwise the FactRuEval-2016 development set is converted
    to a temporary JSON file, loaded, and used to fit a new `BERT_NER`, which
    is then pickled into `model_name`.

    Args:
        factrueval2016_devset_dir: Directory passed to
            `factrueval2016_to_json` as the data source.
        split_by_paragraphs: Forwarded to `factrueval2016_to_json`.
        bert_will_be_tuned: Passed as `finetune_bert`; also selects the
            learning rate (1e-5 when True, 1e-3 otherwise).
        lstm_layer_size: Passed as `lstm_units` (may be None).
        max_epochs: Passed as `max_epochs`.
        batch_size: Passed as `batch_size`.
        gpu_memory_frac: Passed as `gpu_memory_frac`.
        model_name: Path of the pickle file to load from or save to.

    Returns:
        The loaded or newly fitted `BERT_NER` instance.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling executes code embedded in the file, so
        # `model_name` must come from a trusted source.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    # A private temporary directory gives a path nobody else can pre-create,
    # and it is removed (with the JSON file) on exit, even on errors.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_json_name = os.path.join(temp_dir, 'factrueval2016.json')
        factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                               split_by_paragraphs)
        X, y = load_dataset(temp_json_name)
    print('Data for training have been loaded...')
    print('Number of samples is {0}.'.format(len(y)))
    print('')

    # The TF-Hub handle is supplied only when no local BERT path is set.
    if BERT_NER.PATH_TO_BERT is None:
        bert_hub_module_handle = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
    else:
        bert_hub_module_handle = None
    recognizer = BERT_NER(finetune_bert=bert_will_be_tuned,
                          batch_size=batch_size,
                          l2_reg=1e-3,
                          bert_hub_module_handle=bert_hub_module_handle,
                          lstm_units=lstm_layer_size,
                          validation_fraction=0.25,
                          max_epochs=max_epochs,
                          patience=3,
                          gpu_memory_frac=gpu_memory_frac,
                          verbose=True,
                          random_seed=42,
                          lr=1e-5 if bert_will_be_tuned else 1e-3)
    recognizer.fit(X, y)
    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print(
        'The NER has been successfully fitted and saved into the file `{0}`...'
        .format(model_name))
    print('')
    return recognizer

コード例 #2

ファイルを表示

ファイル: demo_bert_conll2003.py プロジェクト: kuilef/deep_ner

def recognize(test_file_name: str, split_by_paragraphs: bool,
              recognizer: BERT_NER, results_file_name: str):
    """Run the recognizer on a BIO-formatted test file and report quality.

    The test set is read with `load_dataset_from_bio`, predictions are
    scored with `calculate_prediction_quality` (overall and per entity type,
    printed to stdout), and the predictions are written to
    `results_file_name` via `save_dataset_as_bio`.

    Args:
        test_file_name: Path of the BIO file with the test data.
        split_by_paragraphs: When True, `-DOCSTART-` is passed as the
            paragraph separator; otherwise no separators are passed.
        recognizer: Fitted model used for prediction.
        results_file_name: Destination passed to `save_dataset_as_bio`.
    """
    separators = {'-DOCSTART-'} if split_by_paragraphs else None
    X_test, y_test = load_dataset_from_bio(test_file_name,
                                           paragraph_separators=separators,
                                           stopwords={'-DOCSTART-'})
    print('The CoNLL-2003 data for final testing have been loaded...')
    print('Number of samples is {0}.'.format(len(y_test)))
    print('')

    y_pred = recognizer.predict(X_test)
    quality = calculate_prediction_quality(
        y_test, y_pred, classes_list=recognizer.classes_list_)
    f1, precision, recall, quality_by_entities = quality

    # Overall scores first, then one indented section per entity type.
    print('All entities:')
    print('    F1-score is {0:.2%}.'.format(f1))
    print('    Precision is {0:.2%}.'.format(precision))
    print('    Recall is {0:.2%}.'.format(recall))
    for ne_type in sorted(quality_by_entities.keys()):
        scores = quality_by_entities[ne_type]
        print('  {0}'.format(ne_type))
        print('    F1-score is {0:.2%}.'.format(scores[0]))
        print('    Precision is {0:.2%}.'.format(scores[1]))
        print('    Recall is {0:.2%}.'.format(scores[2]))
    print('')

    save_dataset_as_bio(test_file_name, X_test, y_pred, results_file_name,
                        stopwords={'-DOCSTART-'})

コード例 #3

ファイルを表示

ファイル: demo_bert_conll2003.py プロジェクト: kuilef/deep_ner

def train(train_file_name: str, valid_file_name: str,
          split_by_paragraphs: bool, bert_will_be_tuned: bool,
          lstm_layer_size: Union[int, None], l2: float, max_epochs: int,
          batch_size: int, gpu_memory_frac: float,
          model_name: str) -> BERT_NER:
    """Load a pickled recognizer from `model_name`, or fit and save a new one.

    If `model_name` is an existing file, it is unpickled and returned without
    any training. Otherwise the training and validation BIO files are loaded,
    a new `BERT_NER` is fitted with the validation set passed as
    `validation_data`, the model is pickled into `model_name`, and its
    quality on the validation set is printed.

    Args:
        train_file_name: Path of the BIO file with training data.
        valid_file_name: Path of the BIO file with validation data.
        split_by_paragraphs: When True, `-DOCSTART-` is passed as the
            paragraph separator; otherwise no separators are passed.
        bert_will_be_tuned: Passed as `finetune_bert`; also selects the
            learning rate (1e-6 when True, 1e-4 otherwise).
        lstm_layer_size: Passed as `lstm_units` (may be None).
        l2: Passed as `l2_reg`.
        max_epochs: Passed as `max_epochs`.
        batch_size: Passed as `batch_size`.
        gpu_memory_frac: Passed as `gpu_memory_frac`.
        model_name: Path of the pickle file to load from or save to.

    Returns:
        The loaded or newly fitted `BERT_NER` instance.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling executes code embedded in the file, so
        # `model_name` must come from a trusted source.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    paragraph_separators = {'-DOCSTART-'} if split_by_paragraphs else None
    X_train, y_train = load_dataset_from_bio(
        train_file_name,
        paragraph_separators=paragraph_separators,
        stopwords={'-DOCSTART-'})
    X_val, y_val = load_dataset_from_bio(
        valid_file_name,
        paragraph_separators=paragraph_separators,
        stopwords={'-DOCSTART-'})
    print(
        'The CoNLL-2003 data for training and validation have been loaded...'
    )
    print('Number of samples for training is {0}.'.format(len(y_train)))
    print('Number of samples for validation is {0}.'.format(len(y_val)))
    print('')

    # The TF-Hub handle is supplied only when no local BERT path is set.
    if BERT_NER.PATH_TO_BERT is None:
        bert_hub_module_handle = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
    else:
        bert_hub_module_handle = None
    recognizer = BERT_NER(finetune_bert=bert_will_be_tuned,
                          batch_size=batch_size,
                          l2_reg=l2,
                          bert_hub_module_handle=bert_hub_module_handle,
                          lstm_units=lstm_layer_size,
                          max_epochs=max_epochs,
                          patience=5,
                          gpu_memory_frac=gpu_memory_frac,
                          verbose=True,
                          random_seed=42,
                          lr=1e-6 if bert_will_be_tuned else 1e-4)
    recognizer.fit(X_train, y_train, validation_data=(X_val, y_val))

    # Persist the model before reporting it as saved and before evaluating,
    # so a failure during scoring cannot discard the trained model.
    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print(
        'The NER has been successfully fitted and saved into the file `{0}`...'
        .format(model_name))

    y_pred = recognizer.predict(X_val)
    f1, precision, recall, quality_by_entities = calculate_prediction_quality(
        y_val, y_pred, classes_list=recognizer.classes_list_)
    print('All entities:')
    print('    F1-score is {0:.2%}.'.format(f1))
    print('    Precision is {0:.2%}.'.format(precision))
    print('    Recall is {0:.2%}.'.format(recall))
    for ne_type in sorted(quality_by_entities):
        scores = quality_by_entities[ne_type]
        print('  {0}'.format(ne_type))
        print('    F1-score is {0:.2%}.'.format(scores[0]))
        print('    Precision is {0:.2%}.'.format(scores[1]))
        print('    Recall is {0:.2%}.'.format(scores[2]))
    print('')
    return recognizer

コード例 #4

ファイルを表示

def recognize(factrueval2016_testset_dir: str, split_by_paragraphs: bool,
              recognizer: BERT_NER, results_dir: str):
    """Evaluate the recognizer on FactRuEval-2016 and write `.task1` files.

    The test set is converted to a temporary JSON file, from which both the
    raw documents and the reference entities are read. Each paragraph is
    passed to `recognizer.predict`, overall and per-type quality is printed,
    and the predicted entities are written per document into
    `<results_dir>/<base_name>.task1` as lines of `type start length`,
    with `start` offset by the paragraph's start position in the document.

    Args:
        factrueval2016_testset_dir: Directory passed to
            `factrueval2016_to_json` as the data source.
        split_by_paragraphs: Forwarded to `factrueval2016_to_json`.
        recognizer: Fitted model used for prediction.
        results_dir: Directory in which the `.task1` files are created.

    Raises:
        ValueError: If a predicted entity type is not `ORG`, `PERSON`
            or `LOCATION`.
    """
    # A private temporary directory gives a path nobody else can pre-create,
    # and it is removed (with the JSON file) on exit, even on errors.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_json_name = os.path.join(temp_dir, 'factrueval2016.json')
        factrueval2016_to_json(factrueval2016_testset_dir, temp_json_name,
                               split_by_paragraphs)
        with codecs.open(temp_json_name,
                         mode='r',
                         encoding='utf-8',
                         errors='ignore') as fp:
            data_for_testing = json.load(fp)
        _, true_entities = load_dataset(temp_json_name)

    # One text per paragraph, plus the target file and paragraph bounds
    # needed later to map entity offsets back into the whole document.
    texts = []
    additional_info = []
    for cur_document in data_for_testing:
        base_name = os.path.join(results_dir,
                                 cur_document['base_name'] + '.task1')
        for cur_paragraph in cur_document['paragraph_bounds']:
            texts.append(
                cur_document['text'][cur_paragraph[0]:cur_paragraph[1]])
            additional_info.append((base_name, cur_paragraph))
    print('Data for final testing have been loaded...')
    print('Number of samples is {0}.'.format(len(true_entities)))
    print('')

    predicted_entities = recognizer.predict(texts)
    assert len(predicted_entities) == len(true_entities)
    f1, precision, recall, quality_by_entities = calculate_prediction_quality(
        true_entities, predicted_entities, recognizer.classes_list_)
    print('All entities:')
    print('    F1-score is {0:.2%}.'.format(f1))
    print('    Precision is {0:.2%}.'.format(precision))
    print('    Recall is {0:.2%}.'.format(recall))
    for ne_type in sorted(quality_by_entities):
        scores = quality_by_entities[ne_type]
        print('  {0}'.format(ne_type))
        print('    F1-score is {0:.2%}.'.format(scores[0]))
        print('    Precision is {0:.2%}.'.format(scores[1]))
        print('    Recall is {0:.2%}.'.format(scores[2]))

    # Model labels -> labels written to the `.task1` files.
    task1_types = {'ORG': 'org', 'PERSON': 'per', 'LOCATION': 'loc'}
    results_for_factrueval_2016 = dict()
    for sample_idx, cur_result in enumerate(predicted_entities):
        base_name, paragraph_bounds = additional_info[sample_idx]
        for entity_type in cur_result:
            prepared_entity_type = task1_types.get(entity_type)
            if prepared_entity_type is None:
                raise ValueError(
                    '`{0}` is unknown entity type!'.format(entity_type))
            for entity_bounds in cur_result[entity_type]:
                # (type, start offset in the document, entity length)
                postprocessed_entity = (prepared_entity_type,
                                        entity_bounds[0] + paragraph_bounds[0],
                                        entity_bounds[1] - entity_bounds[0])
                results_for_factrueval_2016.setdefault(
                    base_name, []).append(postprocessed_entity)

    for base_name, entities in results_for_factrueval_2016.items():
        with codecs.open(base_name,
                         mode='w',
                         encoding='utf-8',
                         errors='ignore') as fp:
            # Sorted by start offset, then length, then type.
            for cur_entity in sorted(entities,
                                     key=lambda it: (it[1], it[2], it[0])):
                fp.write('{0} {1} {2}\n'.format(cur_entity[0], cur_entity[1],
                                                cur_entity[2]))

コード例 #5

ファイルを表示

ファイル: demo_bert_factrueval2016.py プロジェクト: bond005/deep_ner

def train(factrueval2016_devset_dir: str,
          split_by_paragraphs: bool,
          bert_will_be_tuned: bool,
          use_lang_features: bool,
          use_shapes: bool,
          lstm_layer_size: Union[int, None],
          l2: float,
          max_epochs: int,
          patience: int,
          batch_size: int,
          gpu_memory_frac: float,
          model_name: str,
          collection3_dir: Union[str, None] = None,
          n_max_samples: int = 0) -> BERT_NER:
    """Load a pickled recognizer from `model_name`, or fit and save a new one.

    If `model_name` is an existing file, it is unpickled and returned without
    any training. Otherwise the FactRuEval-2016 development set is loaded
    through a temporary JSON file and a new `BERT_NER` is fitted:

    * without `collection3_dir`, on the FactRuEval data itself (optionally
      on a sample of at most `n_max_samples` items drawn from an explicit
      train split, with the held-out part used as validation data);
    * with `collection3_dir`, on the data loaded from that directory by
      `load_dataset_from_brat`, using the FactRuEval data as validation data.

    The fitted model is pickled into `model_name`.

    Args:
        factrueval2016_devset_dir: Directory passed to
            `factrueval2016_to_json` as the data source.
        split_by_paragraphs: Forwarded to `factrueval2016_to_json`; when
            False, the Collection3 data are additionally divided by
            sentences.
        bert_will_be_tuned: Passed as `finetune_bert`; also selects the
            learning rate (3e-6 when True, 1e-4 otherwise).
        use_lang_features: Passed as `use_nlp_features`.
        use_shapes: Passed as `use_shapes`.
        lstm_layer_size: Passed as `lstm_units` (may be None).
        l2: Passed as `l2_reg`.
        max_epochs: Passed as `max_epochs`.
        patience: Passed as `patience`.
        batch_size: Passed as `batch_size`.
        gpu_memory_frac: Passed as `gpu_memory_frac`.
        model_name: Path of the pickle file to load from or save to.
        collection3_dir: Optional directory with the Collection3 data.
        n_max_samples: When positive, the `n` passed to
            `sample_from_dataset` to subsample the training data.

    Returns:
        The loaded or newly fitted `BERT_NER` instance.
    """
    if os.path.isfile(model_name):
        # NOTE: unpickling executes code embedded in the file, so
        # `model_name` must come from a trusted source.
        with open(model_name, 'rb') as fp:
            recognizer = pickle.load(fp)
        assert isinstance(recognizer, BERT_NER)
        print('The NER has been successfully loaded from the file `{0}`...'.
              format(model_name))
        print('')
        return recognizer

    # A private temporary directory gives a path nobody else can pre-create,
    # and it is removed (with the JSON file) on exit, even on errors.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_json_name = os.path.join(temp_dir, 'factrueval2016.json')
        factrueval2016_to_json(factrueval2016_devset_dir, temp_json_name,
                               split_by_paragraphs)
        X, y = load_dataset_from_json(temp_json_name)
    print('The FactRuEval-2016 data for training have been loaded...')
    print('Number of samples is {0}.'.format(len(y)))
    print('')

    # The TF-Hub handle is supplied only when no local BERT path is set.
    if BERT_NER.PATH_TO_BERT is None:
        bert_hub_module_handle = 'https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1'
    else:
        bert_hub_module_handle = None
    recognizer = BERT_NER(finetune_bert=bert_will_be_tuned,
                          batch_size=batch_size,
                          l2_reg=l2,
                          bert_hub_module_handle=bert_hub_module_handle,
                          lstm_units=lstm_layer_size,
                          validation_fraction=0.25,
                          max_epochs=max_epochs,
                          patience=patience,
                          gpu_memory_frac=gpu_memory_frac,
                          verbose=True,
                          random_seed=42,
                          lr=3e-6 if bert_will_be_tuned else 1e-4,
                          udpipe_lang='ru',
                          use_nlp_features=use_lang_features,
                          use_shapes=use_shapes)

    if collection3_dir is None:
        if n_max_samples > 0:
            # Split explicitly so that only the training part is subsampled
            # while the held-out part stays intact for validation.
            train_index, test_index = split_dataset(
                y=y, test_part=recognizer.validation_fraction)
            X_train = np.array(X, dtype=object)[train_index]
            y_train = np.array(y, dtype=object)[train_index]
            X_val = np.array(X, dtype=object)[test_index]
            y_val = np.array(y, dtype=object)[test_index]
            del train_index, test_index
            index = sample_from_dataset(y=y_train, n=n_max_samples)
            recognizer.fit(X_train[index],
                           y_train[index],
                           validation_data=(X_val, y_val))
        else:
            recognizer.fit(X, y)
    else:
        X_train, y_train = load_dataset_from_brat(collection3_dir,
                                                  split_by_paragraphs=True)
        if not split_by_paragraphs:
            X_train, y_train = divide_dataset_by_sentences(
                X_train, y_train, sent_tokenize_func=ru_sent_tokenize)
        # Rename the Collection3 labels PER/LOC to PERSON/LOCATION; any
        # other label is kept unchanged.
        renamed_types = {'PER': 'PERSON', 'LOC': 'LOCATION'}
        for sample_idx, y_sample in enumerate(y_train):
            y_train[sample_idx] = {
                renamed_types.get(ne_type, ne_type): y_sample[ne_type]
                for ne_type in sorted(y_sample.keys())
            }
        print('The Collection3 data for training have been loaded...')
        print('Number of samples is {0}.'.format(len(y_train)))
        print('')
        if n_max_samples > 0:
            index = sample_from_dataset(y=y_train, n=n_max_samples)
            X_train = np.array(X_train, dtype=object)[index]
            y_train = np.array(y_train, dtype=object)[index]
            del index
        recognizer.fit(X_train, y_train, validation_data=(X, y))

    with open(model_name, 'wb') as fp:
        pickle.dump(recognizer, fp)
    print('')
    print(
        'The NER has been successfully fitted and saved into the file `{0}`...'
        .format(model_name))
    print('')
    return recognizer