else: xindex, xlabel = x.split("-") if xindex == "B": if start is not None: extract_ner.append((start, i, label, input_x[start:i])) start = i label = xlabel else: if label != xlabel: start = None label = None return extract_ner if __name__ == "__main__": msra_data = LoadMsraDataV2("D:\data\\nlp\\命名实体识别\\msra_ner_token_level\\") print(msra_data.train_tag_list[0]) X_train = [sent2features(s) for s in msra_data.train_sentence_list] y_train = [sent2labels(s) for s in msra_data.train_tag_list] X_test = [sent2features(s) for s in msra_data.test_sentence_list] y_test = [sent2labels(s) for s in msra_data.test_tag_list] # print(X_train) print(len(y_train)) crf_mode = CRFNerModel() # crf_mode.load_model() crf_mode.fit(X_train, y_train)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) ***
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Optimizer
from transformers import BertModel
from pytorch.layers.crf import CRF
import torch.autograd as autograd
import torch.optim as optim
from pytorch.layers.bert_optimization import BertAdam
from transformers import BertTokenizer
from nlp_applications.data_loader import LoadMsraDataV2

# Module-level setup: load the token-level MSRA NER corpus from a local path
# and derive the tag-set size from its label vocabulary.
msra_data = LoadMsraDataV2("D:\data\\ner\\msra_ner_token_level\\")
bert_model_name = "bert-base-chinese"
class_num = len(msra_data.label2id)


def sequence_padding(inputs, length=None, padding=0, is_float=False):
    """NumPy helper: pad (or truncate) a batch of sequences to one length.

    Args:
        inputs: iterable of sequences (anything `len()`/slicing works on).
        length: target length; defaults to the longest sequence in `inputs`.
        padding: value appended to sequences shorter than `length`.
        is_float: unused in the visible portion of this function —
            presumably consumed further down; TODO confirm.
    """
    if length is None:
        length = max([len(x) for x in inputs])

    # Pad short sequences with `padding`, truncate long ones to `length`.
    outputs = np.array([
        np.concatenate([x, [padding] * (length - len(x))]) if len(x) < length
        else x[:length]
        for x in inputs
    ])
    # NOTE(review): the visible chunk ends here — the `return outputs` (and any
    # `is_float` handling) presumably follows in the unseen remainder of the file.