Exemple #1
0
def run(schema_path, name, sample_size, batch_size, epochs):
    """Train, evaluate and persist a BiLSTM-CRF tagger for one dataset.

    Args:
        schema_path: Passed to ``Dataset`` together with ``name``.
        name: Dataset name; also used to name the saved weights and the
            two vocabulary files.
        sample_size: Number of leading input sequences handed to
            ``dataset.save``.
        batch_size: Forwarded to ``trainer.train``.
        epochs: Forwarded to ``trainer.train``.
    """
    dataset = Dataset(schema_path, name)
    labels, data = dataset.get_data()

    # Split every example into its token sequence and its label sequence.
    X = [example['words'] for example in data]
    y = [example['labels'] for example in data]

    # Word-level vocabulary built from every token of every command.
    word_vocab = Vocabulary()
    all_words = [word for command in X for word in command]
    word_vocab.build_vocab(all_words)

    # Character-level vocabulary built from the entries of the word vocabulary.
    char_vocab = Vocabulary()
    all_chars = [char for word in word_vocab for char in word]
    char_vocab.build_vocab(all_chars)

    # Two-way mapping between label names and their positional indices.
    labels2idx = {label: idx for idx, label in enumerate(labels)}
    idx2label = dict(enumerate(labels))

    preprocessor = Preprocessor(word_vocab, labels2idx, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    trainer = Trainer(model, X, y, preprocessor.transform,
                      split=[0.75, 0.95])

    trainer.train(batch_size, epochs)
    trainer.evaluate(idx2label)

    # Persist everything that prediction needs later on.
    model.save_weights(name)
    dataset.save(X[:sample_size], labels)
    word_vocab.save("%s_word_vocab.json" % name)
    char_vocab.save("%s_char_vocab.json" % name)
Exemple #2
0
def predict(name, command):
    """Tag the tokens of ``command`` with the model saved under ``name``.

    The command is lower-cased and tokenized, the label list, vocabularies
    and weights saved for ``name`` are loaded, and one ``word: label`` line
    is printed per token.

    Args:
        name: Name used when the labels, vocabularies and weights were saved.
        command: Raw text to tag.
    """
    command = command.lower()

    # The label file is resolved relative to this module's directory.
    module_dir = path.dirname(path.realpath(__file__))
    label_path = path.join(module_dir, "intents", "config", "labels",
                           "%s_labels.json" % name)
    with open(label_path, encoding="utf8") as label_file:
        labels = json.load(label_file)

    word_vocab = Vocabulary()
    word_vocab.load("%s_word_vocab.json" % name)

    # Character-level vocabulary used for the char embedding.
    char_vocab = Vocabulary()
    char_vocab.load("%s_char_vocab.json" % name)

    idx2label = dict(enumerate(labels))

    # No label mapping is passed to the preprocessor for prediction.
    preprocessor = Preprocessor(word_vocab, None, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    model.load_weights('intents/config/weights/%s.hdf5' % name)

    sentence = tokenize(command)
    features = preprocessor.transform([sentence])

    predictions = model.predict(features)
    predicted_labels = [idx2label[pred] for pred in predictions]

    for word, label in zip(sentence, predicted_labels):
        print('%s: %s' % (word, label))
Exemple #3
0
    def __init_model(self, entry):
        """Prepare the data managers and the BiLSTM-CRF model for ``entry``.

        ``"train"`` builds the training and dev data managers, saves the
        derived parameters and constructs the model. ``"predict"`` rebuilds
        the model from the saved parameters. Any other value does nothing.
        """
        if entry == "train":
            manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.train_manager = manager
            self.total_size = len(manager.batch_data)

            # Persist the values the predict branch reads back later.
            self.save_params({
                "batch_size": manager.batch_size,
                "input_size": manager.input_size,
                "vocab": manager.vocab,
                "tag_map": manager.tag_map,
            })

            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            model_kwargs = dict(
                tag_map=manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = BiLSTMCRF(**model_kwargs)
            self.restore_model()
        elif entry == "predict":
            saved = self.load_params()
            input_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")

            model_kwargs = dict(
                tag_map=self.tag_map,
                vocab_size=input_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = BiLSTMCRF(**model_kwargs)
            self.restore_model()
Exemple #4
0
def train(config, params):
    """Train the BiLSTM-CRF model.

    Args:
        config: Mapping read for ``vocab_file``, ``tag_file``,
            ``train_path``, ``dev_path`` and ``ckpt_path``.
        params: Mapping read for ``maxlen``, ``batch_size``, ``hidden_num``,
            ``embedding_size``, ``lr`` and ``epochs``.
    """
    vocab_file = config["vocab_file"]
    tag_file = config["tag_file"]

    # Build the vocabularies unless both files already exist.
    if not os.path.exists(vocab_file) or not os.path.exists(tag_file):
        build_vocab(config["train_path"], config["vocab_file"],
                    config["tag_file"])

    # Read the vocabularies.
    vocab2id, _ = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])

    # Preprocess the data.
    maxlen = params["maxlen"]
    train_text, train_label = tokenize(config["train_path"], vocab2id,
                                       tag2id, maxlen)
    dev_text, dev_label = tokenize(config["dev_path"], vocab2id, tag2id,
                                   params["maxlen"])

    # Convert the data to tf.data.Dataset; only the training set is shuffled.
    train_dataset = data.Dataset.from_tensor_slices((train_text, train_label))
    train_dataset = train_dataset.shuffle(len(train_text))
    train_dataset = train_dataset.batch(params["batch_size"],
                                        drop_remainder=True)

    dev_dataset = data.Dataset.from_tensor_slices((dev_text, dev_label))
    dev_dataset = dev_dataset.batch(params["batch_size"], drop_remainder=True)

    print(f"hidden_num:{params['hidden_num']}, vocab_size:{len(vocab2id)}, "
          f"label_size:{len(tag2id)}")

    # Build the model.
    model = BiLSTMCRF(
        hidden_num=params["hidden_num"],
        vocab_size=len(vocab2id),
        label_size=len(tag2id),
        embedding_size=params["embedding_size"],
    )

    # Compile the model.
    model.compile(
        loss=CRFLoss(model.crf, model.dtype),
        optimizer=tf.keras.optimizers.Adam(params["lr"]),
        metrics=[model.crf.viterbi_accuracy, IOBESF1(id2tag)],
        run_eagerly=True,
    )
    model.build((None, train_text.shape[-1]))
    model.summary()

    # Set up the callbacks: keep only the weights with the best "val_f1".
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=config["ckpt_path"],
        save_weights_only=True,
        save_best_only=True,
        monitor="val_f1",
        mode="max",
    )

    # Train (fit) the model.
    model.fit(
        train_dataset,
        epochs=params["epochs"],
        callbacks=[checkpoint],
        validation_data=dev_dataset,
    )
Exemple #5
0
    def main_model(self, entry):
        """
        Set up the data managers and the BiLSTM-CRF model for ``entry``.

        ``"train"`` loads the training data, saves the derived parameters,
        builds the model and prepares the dev batches. ``"predict"`` rebuilds
        the model from the saved parameters. Any other value does nothing.
        """
        if entry == "train":
            # Training data and the number of batches it yields.
            train_manager = DataManager(batch_size=self.batch_size,
                                        data_type='train',
                                        tags=self.tags)
            self.train_manager = train_manager
            self.total_size = len(train_manager.batch_data)

            # Persist the vocab and the other values read back for prediction.
            save_params(
                data={
                    "batch_size": train_manager.batch_size,
                    "input_size": train_manager.input_size,
                    "vocab": train_manager.vocab,
                    "tag_map": train_manager.tag_map,
                },
                path=self.model_path,
            )

            # Build the BiLSTM-CRF model.
            self.model = BiLSTMCRF(
                tag_map=train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
                max_length=self.max_length,
            )

            # A first manager is used only to obtain the value returned by
            # load_data(); that value becomes the batch size of the real one.
            size_probe = DataManager(batch_size=1,
                                     data_type="dev",
                                     tags=self.tags)
            self.dev_size = size_probe.load_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev",
                                           tags=self.tags)
            self.dev_batch = self.dev_manager.iteration()

            # Restore the model if it exists.
            self.restore_model()

        elif entry == "predict":
            # Testing and inference: rebuild the model from saved parameters.
            saved = load_params(path=self.model_path)
            input_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                dropout=0.0,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
                max_length=self.max_length,
            )

            self.restore_model()
Exemple #6
0
    def main_model(self, entry):
        """Set up the data managers and the BiLSTM-CRF model for ``entry``.

        ``"train"`` loads the training data, saves the derived parameters,
        prepares the dev batches and builds the model. ``"predict"`` rebuilds
        the model from the saved parameters. Any other value does nothing.
        """
        if entry == "train":
            # Training data and the number of batches it yields.
            train_manager = DataManager(batch_size=self.batch_size,
                                        data_type='train',
                                        tags=self.tags)
            self.train_manager = train_manager
            self.total_size = len(train_manager.batch_data)

            # Persist the char vocab and other values read back for prediction.
            save_params(
                data={
                    "batch_size": train_manager.batch_size,
                    "input_size": train_manager.input_size,
                    "char_vocab": train_manager.char_vocab,
                    "tag_map": train_manager.tag_map,
                },
                path=self.model_path,
            )

            # A first manager is used only to obtain the value returned by
            # load_char_data(); that value sizes the real dev manager.
            size_probe = DataManager(batch_size=1,
                                     data_type="dev",
                                     tags=self.tags)
            self.dev_size = size_probe.load_char_data()
            # NOTE(review): unlike the probe above, this manager is built
            # without tags=self.tags - confirm that this is intended.
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev")
            self.dev_batch = self.dev_manager.iteration()

            # Build the BiLSTM-CRF model.
            self.model = BiLSTMCRF(tag_map=train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(train_manager.char_vocab),
                                   dropout=self.dropout,
                                   word_num=self.word_num,
                                   word_dim=self.word_dim,
                                   char_num=self.char_num,
                                   char_dim=self.char_dim,
                                   hidden_dim=self.hidden_size)

            # Restore the model if it exists.
            self.restore_model()

        elif entry == "predict":
            # Inference: rebuild the model from the saved parameters.
            saved = load_params(path=self.model_path)
            input_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("char_vocab")
            # NOTE(review): dropout=1.0 at inference looks suspicious; if
            # BiLSTMCRF treats it as a drop probability, 0.0 would be the
            # usual choice - confirm against the model definition.
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   dropout=1.0,
                                   word_num=self.word_num,
                                   word_dim=self.word_dim,
                                   char_num=self.char_num,
                                   char_dim=self.char_dim,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
Exemple #7
0
    def __init_model(self, entry):
        """Prepare the data managers and the BiLSTM-CRF model for ``entry``.

        ``"train"`` builds the training and dev data managers (printing some
        debug information), saves the derived parameters and constructs the
        model. ``"predict"`` rebuilds the model from the saved parameters.
        Any other value does nothing.
        """
        if entry == "train":
            # Parameter preparation for training: create the manager object
            # for the training dataset.
            print(self.tags)
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            batches = self.train_manager.batch_data
            print(batches)
            print(len(batches))
            self.total_size = len(batches)

            # Save the parameters.
            self.save_params({
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            })

            # Prepare the validation dataset: create its manager object and
            # keep the result of its iteration() call in dev_batch, which the
            # loss computation uses later.
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            # The model body uses a BiLSTM for semantic encoding and a CRF to
            # constrain the individual labels.
            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            # Load / restore the model parameters.
            self.restore_model()
        elif entry == "predict":
            # Parameter preparation for prediction.
            saved = self.load_params()
            input_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")
            # Create the model object.
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
Exemple #8
0
 def main_model(self, entry):
     """
     Rebuild the BiLSTM-CRF model for testing and inference.

     Only ``entry == "predict"`` is handled; any other value does nothing.
     """
     if entry != "predict":
         return

     # Read the parameters saved under the model path.
     saved = load_params(path=self.model_path)
     input_size = saved.get("input_size")
     self.tag_map = saved.get("tag_map")
     self.vocab = saved.get("vocab")

     # Dropout is switched off for prediction.
     self.model = BiLSTMCRF(
         tag_map=self.tag_map,
         vocab_size=input_size,
         dropout=0.0,
         embedding_dim=self.embedding_size,
         hidden_dim=self.hidden_size,
         max_length=self.max_length,
     )
     self.restore_model()
    def __init_model(self, entry):
        """Prepare the data managers and the BiLSTM-CRF model for ``entry``.

        ``"train"`` builds the training and dev data managers, saves the
        derived parameters and constructs the model on the GPU.
        ``"predict"`` and ``"evaluate"`` rebuild the model from the saved
        parameters and create the dev-data manager used for testing.
        Any other value does nothing.
        """
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            # Validation set. Iteration over it is currently disabled:
            # self.dev_batch = self.dev_manager.iteration()
            self.dev_manager = DataManager(batch_size=60, data_type="dev")

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = self.model.cuda()
            self.restore_model()
        # Membership test: the previous `entry == "predict" or "evaluate"`
        # was always true, because the non-empty string "evaluate" is truthy.
        elif entry in ("predict", "evaluate"):
            # e.g. `python main.py predict`
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            print('input_size', input_size)
            print('tag_map', self.tag_map)
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.model = self.model.cuda()
            self.test_manager = DataManager(batch_size=60, data_type="dev")
            self.restore_model()
Exemple #10
0
def workflow():
    """Prepare the data, build or load the model, optionally train, then test.

    Reads its settings from the module-level ``args``. Training runs only
    when ``args.mode == "train"``; the test pass runs in every mode.
    """
    train_data, valid_data, test_data, vocab, speech_vocab = prepare_data()

    # Set the input and target fields of each dataset, which are used by the
    # Trainer and the Tester.
    splits = (train_data, test_data, valid_data)
    for split in splits:
        split.set_input("token_index_list", "origin_len", "speech_index_list")
    for split in splits:
        split.set_target("speech_index_list")

    # Model configuration.
    config = dict(
        vocab_size=len(vocab),
        word_emb_dim=args.word_emb,
        rnn_hidden_units=args.rnn_hidden,
        num_classes=len(speech_vocab),
        bi_direction=args.bilstm,
    )

    # Continue from a saved model when a path is given, else start fresh.
    model = torch.load(args.cont) if args.cont else BiLSTMCRF(config)

    if args.mode == "train":
        # Choose the optimizer.
        if args.op:
            optimizer = Adam(lr=args.lr)
        else:
            optimizer = SGD(lr=args.lr)

        # Train the model.
        train_metric = PosMetric(pred='pred', target='speech_index_list')
        trainer = Trainer(
            model=model,
            train_data=train_data,
            dev_data=valid_data,
            use_cuda=args.cuda,
            metrics=train_metric,
            optimizer=optimizer,
            n_epochs=args.epoch,
            batch_size=args.batch_size,
            save_path="./save",
        )
        trainer.train()

    # Test the model.
    test_metric = PosMetric(pred='pred', target='speech_index_list')
    tester = Tester(data=test_data,
                    model=model,
                    metrics=test_metric,
                    use_cuda=args.cuda)
    tester.test()
Exemple #11
0
def single_predict():
    """Segment one piece of user-supplied text with the latest checkpoint.

    Restores the model from ``./save_checkpoint/``, decodes the best tag path
    for the text returned by ``input_text()``, post-processes the tags with
    ``tag_finetune``, prints the share of tags left unchanged and passes the
    result to ``seg_text``.
    """
    # Fixed model hyper-parameters.
    vocab_size = 4688
    embed_size = 128
    units = 64
    num_tags = 4

    _, _, char_index_dict, index_char_dict = open_file("./data/data.txt")

    optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)

    my_model = BiLSTMCRF(vocab_size, embed_size, units, num_tags)
    ckpt = tf.train.Checkpoint(optimizer=optimizer, my_model=my_model)
    latest = tf.train.latest_checkpoint("./save_checkpoint/")
    ckpt.restore(latest)

    text = input_text()

    # Characters missing from the index map fall back to index 0.
    char_index_list = [char_index_dict.get(char, 0) for char in text]

    text_list = list(text)
    tag_list = ['b', 'm', 'e', 's']

    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [char_index_list], padding='post')

    # predict returns numpy matrices.
    logits, inputs_length = my_model.predict(inputs)

    # viterbi_decode yields the best path.
    best_path, _ = tfa_crf.viterbi_decode(logits[0],
                                          my_model.transition_params)

    path_list = [tag_list[index] for index in best_path]
    new_path_list = tag_finetune(path_list)

    # Measure how much of the tag path was left unchanged by the fine-tuning.
    unchanged = sum(old == new for old, new in zip(path_list, new_path_list))
    print("标签正常率%.2f%%" % (100 * unchanged / len(path_list)))

    seg_text(text, new_path_list)
Exemple #12
0
def predict(text, config, params, is_export=False):
    """Run model prediction on ``text`` and optionally export the model.

    Args:
        text: Input string; it is encoded character by character.
        config: Mapping read for ``vocab_file``, ``tag_file``, ``ckpt_path``
            and, when exporting, ``export_dir``.
        params: Mapping read for ``hidden_num``, ``embedding_size`` and
            ``maxlen``.
        is_export: When true, the model is saved to ``config["export_dir"]``
            after prediction.
    """
    # Read the vocabularies.
    vocab2id, _ = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])

    # Build the model and load the checkpoint weights.
    model = BiLSTMCRF(
        hidden_num=params["hidden_num"],
        vocab_size=len(vocab2id),
        label_size=len(tag2id),
        embedding_size=params["embedding_size"],
    )
    model.load_weights(config["ckpt_path"])

    # Preprocess the data: unknown characters map to id 0.
    char_ids = [vocab2id.get(char, 0) for char in text]
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [char_ids], padding='post', maxlen=params["maxlen"])

    # Model prediction: take the first item and its highest-scoring tag ids.
    scores = model.predict(dataset)[0]
    tag_ids = np.argmax(scores, axis=-1)
    result = [id2tag[tag_id] for tag_id in tag_ids]
    print(result)

    # Process the result.
    entities_result = format_result(list(text), result)
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))

    if not is_export:
        return

    # Export the model.
    tf.keras.models.save_model(
        model,
        config["export_dir"],
        overwrite=True,
        include_optimizer=True,
        save_format=None,
        options=None,
    )
Exemple #13
0
def create_model(bert_config, is_training, input_ids, input_mask, sequence_lens,
                 segment_ids, label_ids, num_labels, use_one_hot_embeddings):
    """
    Create the model.
    :param bert_config:  configuration of the BERT model
    :param is_training:  whether the model runs in training mode
    :param input_ids:  index representation of the input data
    :param input_mask:  mask list
    :param sequence_lens:  forwarded to BiLSTMCRF as ``sequence_lens``
    :param segment_ids:  sentence indices, passed to BERT as token_type_ids
    :param label_ids:  label sequence
    :param num_labels:  number of labels
    :param use_one_hot_embeddings:  forwarded to BertModel
    :return: the result of ``BiLSTMCRF.construct_graph()``
    """
    # Initialize the BERT model.
    bert = modeling.BertModel(config=bert_config,
                              is_training=is_training,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=segment_ids,
                              use_one_hot_embeddings=use_one_hot_embeddings)

    # Take the final BERT output, of shape
    # [batch_size, seq_length, embedding_size], and use it as our input;
    # this is the equivalent of a word embedding.
    embedding = bert.get_sequence_output()
    tf.logging.info("bert embedding size: {}".format(embedding.get_shape()))
    max_seq_length = embedding.shape[1].value

    blstm_crf = BiLSTMCRF(
        embedded_chars=embedding,
        hidden_sizes=FLAGS.hidden_sizes,
        layers=FLAGS.layers,
        dropout_rate=FLAGS.dropout_rate,
        num_labels=num_labels,
        max_len=max_seq_length,
        labels=label_ids,
        sequence_lens=sequence_lens,
        is_training=is_training,
    )

    return blstm_crf.construct_graph()
Exemple #14
0
def predict(text, config, params):
    """Run model prediction on ``text`` and print the tags and entities.

    Args:
        text: Input string; it is encoded character by character.
        config: Mapping read for ``vocab_file``, ``tag_file`` and
            ``ckpt_path``.
        params: Mapping read for ``hidden_num`` and ``embedding_size``.
    """
    # Read the vocabularies.
    vocab2id, _ = read_vocab(config["vocab_file"])
    tag2id, id2tag = read_vocab(config["tag_file"])

    # Build the model and load the checkpoint weights.
    model = BiLSTMCRF(hidden_num=params["hidden_num"],
                      vocab_size=len(vocab2id),
                      label_size=len(tag2id),
                      embedding_size=params["embedding_size"])
    model.load_weights(config["ckpt_path"])

    # Preprocess the data: unknown characters map to id 0.
    char_ids = [vocab2id.get(char, 0) for char in text]
    dataset = tf.keras.preprocessing.sequence.pad_sequences(
        [char_ids], padding='post')

    # Model prediction: take the first item and its highest-scoring tag ids.
    scores = model.predict(dataset)[0]
    tag_ids = np.argmax(scores, axis=-1)
    result = [id2tag[tag_id] for tag_id in tag_ids]
    print(result)

    # Process the result.
    entities_result = format_result(list(text), result)
    print(json.dumps(entities_result, indent=4, ensure_ascii=False))
Exemple #15
0
    print 'Evaluation: #test_samples= ' + str(len(test_Y))

    for i in range(len(stored_model_list)):
        #model = BaseSequenceLabeling(word_embedding_dimension, number_class, hidden_size=parameters['hidden_size'], sentence_embedding_type = parameters['sentence_embedding_type'],
        #                            sentence_zero_inithidden = parameters['sentence_zero_inithidden'], attention = None, num_layers = parameters['num_layers'], dropout = parameters['dropout'])

        #model = BaseSequenceLabeling_LSTMEncoder(word_embedding_dimension, number_class, hidden_size=parameters['hidden_size'], sentence_embedding_type = parameters['sentence_embedding_type'],
        #                             sentence_zero_inithidden = parameters['sentence_zero_inithidden'], attention = None, num_layers = parameters['num_layers'], dropout = parameters['dropout'])

        model = BiLSTMCRF(
            word_embedding_dimension,
            number_class,
            hidden_size=parameters['hidden_size'],
            sentence_embedding_type=parameters['sentence_embedding_type'],
            sentence_zero_inithidden=parameters['sentence_zero_inithidden'],
            attention=None,
            crf_decode_method=parameters['crf_decode_method'],
            loss_function=parameters['loss_function'],
            num_layers=parameters['num_layers'],
            dropout=parameters['dropout'])

        if use_cuda:
            model = model.cuda()
        model.load_state_dict(stored_model_list[i])

        print 'Evaluate on all situation entity'
        print '----------------------------------------------------'
        best_macro_Fscore, best_result = evaluate(
            model, (test_X, test_X_eos_list, test_X_connective_position_list),
            test_Y)
Exemple #16
0
def main():
    """Train the BiLSTM-CRF model with a manual training loop.

    Reads its settings from the module-level ``args``: loads the vocabulary,
    builds the training and dev datasets, then runs ``args.epochs`` epochs of
    ``train_step`` calls and prints loss, accuracy and timing per batch.
    """
    # load vocab
    vocab = Vocab(stopwords_file=args.vocab_dir + 'stopwords.txt',
                  vocab_dir=args.vocab_dir)
    vocab.load_vocab_label()
    vocab_size = vocab.get_vocab_size()
    label_size = vocab.get_label_size()

    # load pre-trained word embedding
    # embeddings_index = {}
    # embedding_matrix = {}
    # if args.w2v_file is not None:
    #     with open(args.w2v_file, 'r', encoding='utf-8') as f:
    #         for line in f:
    #             arrs = line.split()
    #             if len(arrs) == 2:
    #                 continue
    #             w = arrs[0]
    #             vec = np.asarray(arrs[1:], dtype='float32')
    #             embeddings_index[w] = vec
    #     print('{} INFO: Use pre-train word embedding , Found {} word vectors'.format(
    #         get_timestamp(), len(embeddings_index)))
    #
    #     # convert embedding to weights
    #     embedding_matrix = np.zeros((vocab_size, args.embedding_dim))
    #     for word, idx in vocab.vocab.items():
    #         if word in embeddings_index:
    #             embedding_matrix[idx] = embeddings_index[word]
    #
    # # define tf train summary writer
    # summary_writer = tf.summary.create_file_writer(args.summary_dir)

    # load data
    train_dataset, num_train_samples = create_dataset_with_tf(
        args.train_data, vocab, args.epochs, args.batch_size, args.max_seq_len,
        args.mode)
    dev_dataset, num_dev_samples = create_dataset_with_tf(
        args.test_data, vocab, 1, args.batch_size, args.max_seq_len,
        "evaluate")

    model = BiLSTMCRF(args.hidden_num,
                      vocab_size,
                      label_size,
                      args.embedding_dim,
                      args.max_seq_len,
                      weights=None,
                      weights_trainable=False)
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate,
                                         decay=0.0)
    steps_per_epoch = num_train_samples // args.batch_size
    train_loss = tf.metrics.Mean()
    for epoch in range(args.epochs):
        train_loss.reset_states()
        # train_accuracy.reset_states()

        for (batch_idx,
             (inputs,
              labels)) in enumerate(train_dataset.take(steps_per_epoch)):
            time_s = time.time()
            # Keep the value returned by train_step under its own name.
            # Rebinding `train_loss` here replaced the Mean metric, so the
            # next batch passed that value to train_step instead of the
            # metric, and the next epoch called reset_states() on it.
            step_loss, pred, seq_len = train_step(inputs, labels, model,
                                                  train_loss, optimizer, True)
            time_e = time.time()
            train_accuracy = cal_acc_one_step(model, pred, seq_len, labels)
            # write to summary file
            # tf.summary.scalar("train_loss", train_loss.result().numpy(), step=batch_idx)
            # tf.summary.scalar("train_accuracy", train_accuracy.result().numpy(), step=batch_idx)
            # tf.summary.scalar("learning_rate", params.learning_rate, step=batch_idx)
            # summary_writer.flush()

            print(
                "{} INFO: Train batch:{}/{}\tloss:{:.4f}\tacc:{:.4f} time:{:.4f}s"
                .format(get_timestamp(), batch_idx + epoch * steps_per_epoch,
                        args.epochs * steps_per_epoch, step_loss,
                        train_accuracy, (time_e - time_s)))
Exemple #17
0
class ChineseNER:
    def __init__(self, entry="train"):
        """
        Read the hyper-parameters from the project config, then build the model.

        :param entry: run mode handed to ``main_model`` ("train" or "predict")
        """
        cfg = load_config()

        # Network dimensions
        self.embedding_size = cfg.get("embedding_size")
        self.hidden_size = cfg.get("hidden_size")
        self.max_length = cfg.get("max_length")
        self.dropout = cfg.get("dropout")

        # Optimisation settings
        self.batch_size = cfg.get("batch_size")
        self.epochs = cfg.get("epochs")
        self.learning_rate = cfg.get("learning_rate")
        self.weight_decay = cfg.get("weight_decay")
        self.lr_decay_step = cfg.get("lr_decay_step")
        self.lr_decay_rate = cfg.get("lr_decay_rate")
        self.transfer_learning = cfg.get("transfer_learning")

        # Data and persistence
        self.tags = cfg.get("tags")
        self.model_path = cfg.get("model_path")

        # Model Initialization
        self.main_model(entry)

    def main_model(self, entry):
        """
        Model Initialization
        """
        # The Training Process
        if entry == "train":
            # Training Process: read Training Data from DataManager
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             data_type='train',
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)

            # Read the corresponding character index (vocab) and other hyper-parameters
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }

            save_params(data=data, path=self.model_path)

            # Build BiLSTM-CRF Model
            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)

            # Evaluation Process: read Dev Data from DataManager
            self.dev_size = DataManager(batch_size=1,
                                        data_type="dev",
                                        tags=self.tags).load_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev",
                                           tags=self.tags)
            self.dev_batch = self.dev_manager.iteration()

            # Restore model if it exists
            self.restore_model()

        # The Testing & Inference Process
        elif entry == "predict":
            data_map = load_params(path=self.model_path)
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   dropout=0.0,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)

            self.restore_model()

    def restore_model(self):
        """
        Restore the model if there is one
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!")
        except Exception as error:
            print("Model Failed to restore! {}".format(error))

    def train(self):
        """
        Training stage

        Moves the model to ``device``, decides which parameters receive
        gradients (a fixed allow-list when transfer learning is on, all of
        them otherwise) and runs ``self.epochs`` epochs of AdamW updates.
        The weights are saved after every batch and ``evaluate`` runs once
        at the end of each epoch.
        """
        model = self.model.to(device=device)

        # Transfer Learning Module: when enabled, only the parameters named
        # here stay trainable; otherwise every parameter is trainable.
        keep_grad = (
            "transitions", "word_embeddings.weight", "hidden2tag.weight",
            "hidden2tag.bias", "linear1.weight", "linear1.bias",
            "linear2.weight", "linear2.bias",
        )
        transfer = self.transfer_learning == True  # noqa: E712 (loose comparison kept on purpose)
        for name, param in model.named_parameters():
            param.requires_grad = (name in keep_grad) if transfer else True

        # Use the AdamW optimizer over the trainable parameters only
        trainable = (p for p in model.parameters() if p.requires_grad)
        optimizer = optim.AdamW(params=trainable,
                                lr=self.learning_rate,
                                weight_decay=self.weight_decay,
                                amsgrad=True)

        # Learning Rate Decay (disabled)
        # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate)

        # Print model architecture
        print("\033[1;31mThe model architecture is shown below:\033[0m")
        print(model)
        print("\n")

        # Print model parameters
        print("\033[1;31mThe model's parameters are shown below:\033[0m")
        row = ("Name: \033[1;31m{0}\033[0m, "
               "Parameter Size: \033[1;36m{1}\033[0m, "
               "Gradient: \033[1;35m{2}\033[0m")
        for name, param in model.named_parameters():
            print(row.format(name, param.size(), param.requires_grad))
        print("\n")

        for epoch in range(1, self.epochs + 1):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                # Clear gradients before training
                self.model.zero_grad()

                # Read sentences and tags from the batch data
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long,
                                                device=device)
                tags_tensor = torch.tensor(tags,
                                           dtype=torch.float,
                                           device=device)
                length_tensor = torch.tensor(length,
                                             dtype=torch.int64,
                                             device=device)

                # Negative log-likelihood of the batch, averaged to a scalar
                batch_loss = self.model.neg_log_likelihood(
                    sentences_tensor, tags_tensor, length_tensor)
                loss = batch_loss.mean()

                filled = int(index * 40 / self.total_size)
                progress = ("█" * filled).ljust(40)
                print("epoch [{}] |{}| {}/{}\n\t Training Loss {:.6f}".format(
                    epoch, progress, index, self.total_size, loss))

                loss.backward()
                optimizer.step()

                # Save the model during training
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

            self.evaluate()
            # scheduler.step()

    def evaluate(self):
        """
        Evaluation of the performance using the dev batch - dev dataset

        Takes the next batch from ``self.dev_batch``, decodes it with the
        model and prints the evaluation loss followed by the tag-level,
        tag-label-level and label-level F1 summaries.

        The forward passes run in eval mode under ``torch.no_grad()``; the
        model's previous train/eval mode is restored before returning.

        :raises StopIteration: when ``self.dev_batch`` has no batch left
        """
        def macro_average(scores):
            # An empty score list (no tags configured) averages to 0.0
            # instead of dividing by zero.
            return sum(scores) / len(scores) if scores else 0.0

        sentences, labels, length = zip(*next(self.dev_batch))

        # Evaluate deterministically: eval mode switches dropout off, and
        # no_grad avoids building an autograd graph nobody backpropagates.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                _, pre = self.model(sentences=sentences,
                                    real_length=length,
                                    lengths=None)

                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long,
                                                device=device)
                # The loss is measured against the gold labels; scoring the
                # model's own predictions says nothing about its error.
                tags_tensor = torch.tensor(labels,
                                           dtype=torch.float,
                                           device=device)
                length_tensor = torch.tensor(length,
                                             dtype=torch.int64,
                                             device=device)

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
        finally:
            self.model.train(was_training)

        # Report the batch mean, matching how train() reduces its loss
        print("\t Evaluation Loss {:.6f}".format(loss.mean().item()))

        ####################################################################################################################################
        print('Start to evaluate on the dev set: ')
        # Tag-level F1 score summary (w.r.t. each tag)
        tag_f1_total = []
        for tag in self.tags:
            _, _, f1_tag = tag_f1(tar_path=labels,
                                  pre_path=pre,
                                  tag=tag,
                                  tag_map=self.model.tag_map)
            tag_f1_total.append(f1_tag)
        tag_macro_f1 = macro_average(tag_f1_total)
        print(
            'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % tag_macro_f1)

        # Tag-level Micro-averaged F1 Score
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels,
                                          pre_path=pre,
                                          tags=self.tags,
                                          tag_map=self.model.tag_map)
        print(
            'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m'
            % f1_Micro_tag)

        ####################################################################################################################################
        # Tag-level with Label-level F1 score summary
        f1_prefix_total = []
        for tag in self.tags:
            for prefix in ('B', 'I', 'E', 'S'):
                _, _, f1_prefix = entity_label_f1(tar_path=labels,
                                                  pre_path=pre,
                                                  length=length,
                                                  tag=tag,
                                                  tag_map=self.model.tag_map,
                                                  prefix=prefix)
                f1_prefix_total.append(f1_prefix)

        f1_macro_tag_prefix = macro_average(f1_prefix_total)
        print(
            'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_tag_prefix)

        ####################################################################################################################################
        # Label-level F1 score summary
        f1_prefix_total = []
        for prefix in ('B', 'I', 'E', 'S', 'O'):
            _, _, f1_prefix = label_f1(tar_path=labels,
                                       pre_path=pre,
                                       length=length,
                                       tags=self.tags,
                                       tag_map=self.model.tag_map,
                                       prefix=prefix)
            f1_prefix_total.append(f1_prefix)

        f1_macro_prefix = macro_average(f1_prefix_total)
        print(
            'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_prefix)

    def predict(self):
        """
        Prediction & Inference Stage

        Interactive loop: reads a sentence from stdin, normalises it, cuts it
        into chunks of at most ``self.max_length`` and prints, per chunk, the
        predicted entities ordered by their ``start`` index.

        The loop ends (returning ``None``) when the input is empty or when a
        chunk contains no entity.

        :return: None
        """
        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(self.model)
        print('\n')

        # Input one Chinese Sentence
        while True:
            input_str = input("Please input a sentence in Chinese: ")

            if len(input_str) == 0:
                return print('Invalid input! Please re-input!!\n')

            # Full-width to half-width
            input_str = strQ2B(input_str)
            input_str = re.sub(pattern='。', repl='.', string=input_str)

            text = cut_text(text=input_str, length=self.max_length)

            cut_out = []
            for cuttext in text:
                # Look up the vocab index of every character (0 if unknown)
                input_vec = [self.vocab.get(i, 0) for i in cuttext]

                # convert it to tensor and run the model
                sentences = torch.tensor(input_vec).view(1, -1)

                length = np.expand_dims(np.shape(sentences)[1], axis=0)
                length = torch.tensor(length,
                                      dtype=torch.int64,
                                      device=device)

                # Inference only: no autograd graph is needed
                with torch.no_grad():
                    _, paths = self.model(sentences=sentences,
                                          real_length=length,
                                          lengths=None)

                # Get the entities from the model
                entities = []
                for tag in self.tags:
                    tags = get_tags(paths[0], tag, self.tag_map)
                    entities += format_result(tags, cuttext, tag)

                if not entities:
                    return print("There was no entity in this sentence!!")

                # Sort by the "start" index only. Comparing the whole
                # [start, entity] pair would fall through to comparing the
                # entity dicts on a tie and raise TypeError.
                all_start = [[entity.get('start'), entity]
                             for entity in entities]
                all_start.sort(key=lambda pair: pair[0])

                # Keep just the entities, as an (n, 1) column
                sort_d = np.reshape(
                    np.array(all_start)[:, 1], [len(all_start), 1])
                cut_out.append(sort_d)
            # return cut_out
            print(cut_out)
    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()
#         elif entry=='testXXX':
#             self.dev_manager= DataManager(batch_size=30, data_type="test")
# #             self.dev_batch = dev_manager.batch_data
#             print('####batch_data###',len(dev_manager.batch_data))
        elif entry == 'test':
            self.dev_manager = DataManager(batch_size=30, data_type="test")
            #             self.dev_batch = dev_manager.iteration()

            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()

        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                self.model = self.model.cuda()
            self.restore_model()
Exemple #19
0
class ChineseNER(object):
    def __init__(self, entry="train"):
        """Load the configuration, then build the model for ``entry``.

        :param entry: mode passed to ``__init_model`` ("train" or "predict")
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def load_config(self):
        """Load the hyper-parameters from ``models/config.yml`` onto the instance.

        If the file cannot be read, cannot be parsed, or does not contain a
        mapping, a default configuration is used instead and written to
        ``models/config.yml`` (replacing whatever was there).

        Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags`` and ``dropout``.
        """
        import os  # local import: only needed for the fallback write below

        config_file = "models/config.yml"
        try:
            # Read with the same encoding the fallback below writes with
            with open(config_file, encoding='UTF-8') as fopen:
                config = yaml.load(fopen, Loader=yaml.FullLoader)
            if not isinstance(config, dict):
                # e.g. an empty file parses to None, which has no .get()
                raise ValueError("config file is empty or not a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # Must be spelled "tags": that is the key read back below
                "tags": ["ORG", "PER"]
            }
            # The directory may not exist yet on a first run
            os.makedirs(os.path.dirname(config_file), exist_ok=True)
            with open(config_file, "w", encoding='UTF-8') as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train for 100 epochs with Adam, saving the weights after every step.

        For every training batch: compute the negative log-likelihood,
        print a progress line, run ``evaluate`` on the next dev batch, then
        backpropagate, step the optimizer and save the state dict to
        ``<model_path>params.pkl``.
        """
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                loss = self.model.neg_log_likelihood(
                    torch.tensor(sentences, dtype=torch.long),
                    torch.tensor(tags, dtype=torch.long),
                    torch.tensor(length, dtype=torch.long),
                )

                filled = int(index * 25 / self.total_size)
                progress = ("█" * filled).ljust(25)
                shown_loss = loss.cpu().tolist()[0]
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size, shown_loss))

                self.evaluate()
                print("-" * 50)

                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
Exemple #20
0
        exit()

    # 定义训练集
    train_dataset = MyDataset(batch_size=32, tags=["ORG", "PER"])
    # 定义测试集
    word2id, tag2id = train_dataset.word2id, train_dataset.tag2id
    test_dataset = MyDataset(batch_size=32,
                             data_type="test",
                             word2id=word2id,
                             tag2id=tag2id)

    if sys.argv[1] == "train_model":
        # 定义模型
        model = BiLSTMCRF(tag2id=tag2id,
                          word2id_size=len(word2id),
                          batch_size=32,
                          embedding_dim=100,
                          hidden_dim=128)
        # 训练模型
        train_model(train_dataset=train_dataset,
                    test_dataset=test_dataset,
                    model=model,
                    tag2id=tag2id)

    elif sys.argv[1] == "predict_model":
        # 定义模型
        model = BiLSTMCRF(tag2id=tag2id,
                          word2id_size=len(word2id),
                          batch_size=1,
                          embedding_dim=100,
                          hidden_dim=128)
class ChineseNER(object):
    def __init__(self, entry="train"):
        """Load the configuration, then build the model for ``entry``.

        :param entry: mode passed to ``__init_model`` ("train", "test" or
            "predict")
        """
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()

            self.model = BiLSTMCRF(tag_map=self.train_manager.tag_map,
                                   batch_size=self.batch_size,
                                   vocab_size=len(self.train_manager.vocab),
                                   dropout=self.dropout,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()
#         elif entry=='testXXX':
#             self.dev_manager= DataManager(batch_size=30, data_type="test")
# #             self.dev_batch = dev_manager.batch_data
#             print('####batch_data###',len(dev_manager.batch_data))
        elif entry == 'test':
            self.dev_manager = DataManager(batch_size=30, data_type="test")
            #             self.dev_batch = dev_manager.iteration()

            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                print('True')
                self.model = self.model.cuda()
            else:
                print('False')
            self.restore_model()

        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   use_gpu=self.use_gpu)
            if self.use_gpu:
                self.model = self.model.cuda()
            self.restore_model()

    def load_config(self):
        """Load the hyper-parameters from ``models/config.yml`` onto the instance.

        If the file cannot be read, cannot be parsed, or does not contain a
        mapping, a default configuration is used instead and written to
        ``models/config.yml`` (replacing whatever was there).

        Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags``, ``dropout`` and ``use_gpu``.
        """
        import os  # local import: only needed for the fallback write below

        config_file = "models/config.yml"
        try:
            with open(config_file) as fopen:
                # safe_load instead of yaml.load(fopen): without a Loader
                # argument yaml.load raises TypeError on PyYAML >= 6, which
                # the handler below would turn into "overwrite the user's
                # config with the defaults".
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                # e.g. an empty file parses to None, which has no .get()
                raise ValueError("config file is empty or not a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["component", "disease&symptom", "people"],  # change the tags here
                "use_gpu": True
            }
            # The directory may not exist yet on a first run
            os.makedirs(os.path.dirname(config_file), exist_ok=True)
            with open(config_file, "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")
        self.use_gpu = config.get("use_gpu")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train for 100 epochs with Adam, saving the weights after every step.

        For every training batch: compute the negative log-likelihood,
        print a progress line, run ``evaluate`` on every 10th batch, then
        backpropagate, step the optimizer and save the state dict to
        ``<model_path>params.pkl``. Tensors are moved to the GPU when
        ``self.use_gpu`` is set.
        """
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)

        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for index, batch in enumerate(batches, start=1):
                self.model.zero_grad()
                # No per-batch debug dump here: indexing a fixed position
                # such as batch[10] raises IndexError on any batch with
                # fewer than 11 samples (e.g. a short final batch).
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                # original length of every sentence in the batch
                length_tensor = torch.tensor(length, dtype=torch.long)
                if self.use_gpu:
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                if index % 10 == 0:
                    self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def evaluate(self):
        with torch.no_grad():
            sentences, labels, length = zip(*self.dev_batch.__next__())
            _, paths = self.model(sentences)
            print("\teval")
            for tag in self.tags:
                f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, path):  #, input_str=""):
        #         if not input_str:
        #             input_str = input("请输入文本: ")
        sentences = []
        with open('./data/' + path + '.txt', 'r', encoding='utf-8') as f:
            for i in f:
                sentences += i.strip().split('。')
        f = open('./result/tag_' + path + '.json', 'w')
        for input_str in sentences:
            input_vec = [self.vocab.get(i, 0) for i in input_str]
            # convert to tensor
            sentences = torch.tensor(input_vec).view(1, -1)
            _, paths = self.model(sentences)

            entities = []
            for tag in self.tags:
                tags = get_tags(paths[0], tag, self.tag_map)
                entities += format_result(tags, input_str, tag)
            dic = {'sentense': input_str, 'entities': entities}
            json.dump(dic, f, ensure_ascii=False)
        f.close()
#             return entities
#     def testXXX(self):
#         for batch in self.dev_manager.get_batch():
#             print(_)
#             print(_,len(items),len(items[0][0]),len(items[0][1]),items[0][2])
#             break

    def test(self):
        with torch.no_grad():
            id2vocab = {self.vocab[i]: i for i in self.vocab}
            print(len(id2vocab))
            f = open('./result/test_tag.json', 'w')
            total_matrix = np.zeros(
                [len(self.tags), 3]
            )  #横坐标分别表示component,disease&symptom,people;纵坐标分别表示recall, precision, f1
            count = 0
            for batch in self.dev_manager.get_batch():
                count += 1
                print(count)
                #                 print(type(items))
                sentences, labels, length = zip(*batch)
                #             sentences, labels, length = zip(*self.dev_batch.__next__())
                #                 print('I am in')
                strs = [[id2vocab[w] for w in s] for s in sentences]
                #                 print(strs)
                #                 print(len(sentences),len(sentences[0]),len(sentences[5]))
                _, paths = self.model(sentences)
                #                 print("\teval")
                #                 print('path',len(paths),len(paths[0]),len(paths[1]))
                for i in range(len(self.tags)):
                    recall, precision, f1 = f1_score(labels, paths,
                                                     self.tags[i],
                                                     self.model.tag_map)
                    total_matrix[i][0] += recall
                    total_matrix[i][1] += precision
                    total_matrix[i][2] += f1
                entities = []
                for i in range(len(paths)):
                    tmp = []

                    for tag in self.tags:
                        tags = get_tags(paths[i], tag, self.tag_map)
                        tmp += format_result(tags, strs[i], tag)
                    entities.append(tmp)

    #             print(entities)
                for i in range(len(entities)):
                    dic = {
                        'sentense': ''.join(strs[i]),
                        'entities': entities[i]
                    }
                    json.dump(dic, f, ensure_ascii=False)


#                     f.write(''.join(strs[i])+'#####找到的实体为#####'+'&'.join(entities[i])+'\n')
            total_matrix /= count
            #             print(total_matrix)
            for i in range(len(self.tags)):
                print(
                    "{}\tcount\t{}\trecall {:.2f}\tprecision {:.2f}\tf1 {:.2f}"
                    .format(count, self.tags[i], total_matrix[i][0],
                            total_matrix[i][1], total_matrix[i][2]))
            f.close()
Exemple #22
0
class ChineseNER:
    def __init__(self, entry="train"):
        """
        Read the hyper-parameters from the configuration and build the model.

        :param entry: mode forwarded to ``main_model`` ("train" or "predict")
        """
        config = load_config()

        # Each hyper-parameter is stored on the instance under the same name
        # as its key in the configuration.
        hyper_parameters = (
            "model_path", "epochs", "batch_size", "learning_rate",
            "weight_decay", "dropout", "hidden_size", "char_num", "char_dim",
            "word_dim", "word_num", "tags", "transfer_learning",
            "lr_decay_step", "lr_decay_rate",
        )
        for key in hyper_parameters:
            setattr(self, key, config.get(key))

        # Build the model for the requested mode
        self.main_model(entry)

    def main_model(self, entry):
        # The Training Process
        if entry == "train":
            # Training Process: read Training Data from DataManager
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             data_type='train',
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)

            # Load some model parameters
            try:
                load_params(path=self.model_path)
                print("Successfully load the data.pkl!!!")

            except Exception as error:
                print("There was no data.pkl!! Start to save........")
                # Read the corresponding character index (vocab) and other hyper-parameters
                saved_data = {
                    "batch_size": self.train_manager.batch_size,
                    "input_size": self.train_manager.input_size,
                    "char_vocab": self.train_manager.char_vocab,
                    "tag_map": self.train_manager.tag_map,
                }
                save_params(data=saved_data, path=self.model_path)

            # Build BiLSTM-CRF Model
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.char_vocab),
                dropout=self.dropout,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )

            # Restore model if it exists
            self.restore_model()

            # Evaluation Process: read Dev Data from DataManager
            self.dev_size = DataManager(batch_size=1,
                                        data_type="dev",
                                        tags=self.tags).load_char_data()
            self.dev_manager = DataManager(batch_size=int(self.dev_size),
                                           data_type="dev")
            self.dev_batch = self.dev_manager.iteration()

        # The Inference Process
        elif entry == "predict":
            data = load_params(path=self.model_path)
            input_size = data.get("input_size")
            self.tag_map = data.get("tag_map")
            self.vocab = data.get("char_vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=input_size,
                dropout=0.0,
                word_num=self.word_num,
                word_dim=self.word_dim,
                char_num=self.char_num,
                char_dim=self.char_dim,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()

    def train(self):
        # Transfer Learning Module
        if self.transfer_learning == True:
            keep_grad = [
                "transitions", "char_embedding.weight",
                "char_linear_lstm.weight", "char_linear_lstm.bias",
                "word_linear_lstm.weight", "word_linear_lstm.bias",
                "hidden2tag.weight", "hidden2tag.bias"
            ]

            for name, value in self.model.named_parameters():
                if name in keep_grad:
                    value.requires_grad = True
                else:
                    value.requires_grad = False
        else:
            for name, value in self.model.named_parameters():
                value.requires_grad = True

        # Use Adam Optimizer
        optimizer = optim.Adam(params=filter(lambda p: p.requires_grad,
                                             self.model.parameters()),
                               lr=self.learning_rate,
                               weight_decay=self.weight_decay)

        # Learning Rate Decay
        # scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=self.lr_decay_step, gamma=self.lr_decay_rate)

        # Print model architecture
        print('\033[1;31mThe model architecture is shown below:\033[0m')
        print(self.model)
        print('\n')

        # Print model parameters
        print('\033[1;31mThe model\'s parameters are shown below:\033[0m')
        for name, value in self.model.named_parameters():
            print("Name: \033[1;31m{0}\033[0m, "
                  "Parameter Size: \033[1;36m{1}\033[0m, "
                  "Gradient: \033[1;35m{2}\033[0m".format(
                      name, value.size(), value.requires_grad))
        print('\n')

        for epoch in range(1, self.epochs + 1):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                # Clear gradients before training
                self.model.zero_grad()

                ####################################################################################################################################
                # Read sentences and labels from the batch data
                chars, labels, words, len_word, len_char = zip(*batch)
                chars_tensor = torch.tensor(chars,
                                            dtype=torch.long,
                                            device=device)
                labels_tensor = torch.tensor(labels,
                                             dtype=torch.float,
                                             device=device)
                words_tensor = torch.tensor(words,
                                            dtype=torch.float,
                                            device=device)
                len_word_tensor = torch.tensor(len_word,
                                               dtype=torch.int64,
                                               device=device)
                len_char_tensor = torch.tensor(len_char,
                                               dtype=torch.int64,
                                               device=device)

                ####################################################################################################################################
                loss = self.model.neg_log_likelihood(characters=chars_tensor,
                                                     tags=labels_tensor,
                                                     len_char=len_char_tensor,
                                                     words=words_tensor,
                                                     len_word=len_word_tensor)
                progress = ("█" * int(index * 40 / self.total_size)).ljust(40)
                print("epoch [{}] |{}| {}/{}\t Batch Loss {:.6f}".format(
                    epoch, progress, index, self.total_size,
                    loss.tolist()[0]))

                ####################################################################################################################################
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

            self.evaluate()
            # scheduler.step()

    def evaluate(self):
        """
        Evaluate the model on the next batch of the development set.

        Prints the dev loss and four F1 summaries (tag-level macro and micro,
        tag+label-level macro, label-level macro). The forward passes run in
        evaluation mode without gradient tracking, and the previous
        train/eval mode of the model is restored afterwards.
        """
        chars, labels, words, len_words, len_chars = zip(
            *next(self.dev_batch))
        chars_tensor = torch.tensor(chars, dtype=torch.long, device=device)
        words_tensor = torch.tensor(words, dtype=torch.float, device=device)
        len_word_tensor = torch.tensor(len_words,
                                       dtype=torch.int64,
                                       device=device)
        len_char_tensor = torch.tensor(len_chars,
                                       dtype=torch.int64,
                                       device=device)
        # Gold labels, converted exactly as train() converts them. The loss
        # must be measured against these, not against the model's own
        # prediction, otherwise it says nothing about dev-set performance.
        labels_tensor = torch.tensor(labels,
                                     dtype=torch.float,
                                     device=device)

        # Disable dropout and autograd bookkeeping while evaluating, and put
        # the model back into whatever mode it was in afterwards.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                # Run the Forward pass of the model
                _, pre = self.model(characters=chars_tensor,
                                    len_char=len_char_tensor,
                                    words=words_tensor,
                                    len_word=len_word_tensor)

                # Loss on the dev set
                loss = self.model.neg_log_likelihood(
                    characters=chars_tensor,
                    tags=labels_tensor,
                    len_char=len_char_tensor,
                    words=words_tensor,
                    len_word=len_word_tensor)
        finally:
            self.model.train(was_training)

        print("\t Evaluation Loss on the dev set{:.6f}".format(
            loss.tolist()[0]))

        print('Start to evaluate on the dev set: ')

        tag_map = self.model.tag_map

        # Tag-level F1 score summary (w.r.t. each tag)
        tag_f1_total = []
        for tag in self.tags:
            _, _, f1_tag = tag_f1(tar_path=labels,
                                  pre_path=pre,
                                  tag=tag,
                                  tag_map=tag_map)
            tag_f1_total.append(f1_tag)
        tag_macro_f1 = sum(tag_f1_total) / len(tag_f1_total)
        print(
            'Tag-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % tag_macro_f1)

        # Tag-level Micro-averaged F1 Score
        _, _, f1_Micro_tag = tag_micro_f1(tar_path=labels,
                                          pre_path=pre,
                                          tags=self.tags,
                                          tag_map=tag_map)
        print(
            'Tag-level Micro-averaged F1 Score of the dev set is \033[1;35m%s\033[0m'
            % f1_Micro_tag)

        # Tag-level with Label-level F1 score summary: one score per
        # (tag, position prefix) pair, macro-averaged.
        f1_prefix_total = []
        for tag in self.tags:
            for prefix in ('B', 'I', 'E', 'S'):
                _, _, f1_prefix = entity_label_f1(tar_path=labels,
                                                  pre_path=pre,
                                                  length=len_chars,
                                                  tag=tag,
                                                  tag_map=tag_map,
                                                  prefix=prefix)
                f1_prefix_total.append(f1_prefix)

        f1_macro_tag_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print(
            'Tag-Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_tag_prefix)

        # Label-level F1 score summary: one score per position prefix
        # (including 'O'), macro-averaged.
        f1_prefix_total = []
        for prefix in ('B', 'I', 'E', 'S', 'O'):
            _, _, f1_prefix = label_f1(tar_path=labels,
                                       pre_path=pre,
                                       length=len_chars,
                                       tags=self.tags,
                                       tag_map=tag_map,
                                       prefix=prefix)
            f1_prefix_total.append(f1_prefix)

        f1_macro_prefix = sum(f1_prefix_total) / len(f1_prefix_total)
        print(
            'Label-level Macro-averaged F1 Score of the dev set is \033[1;31m%s\033[0m'
            % f1_macro_prefix)

    def predict(self):
        """
        Interactive prediction loop.

        Repeatedly reads a sentence from stdin, runs the model on its
        character ids and word vectors, and prints the entities found for
        every configured tag. The loop only ends when ``input`` raises.
        """
        # Load word vectors
        pre_trained = self.load_word_vector()

        while True:
            input_str = input("Please input a sentence in Chinese: ")
            input_str = stringQ2B(input_str)

            # Character ids; characters missing from the vocabulary map to 0
            char_vec = [self.vocab.get(i, 0) for i in input_str]
            char_ids = np.reshape(char_vec, [-1]).tolist()
            # Real (unpadded) length, capped at char_num because
            # pad_char_data truncates longer inputs to that size.
            len_char = torch.tensor([min(len(char_ids), self.char_num)],
                                    dtype=torch.int64,
                                    device=device)
            char_tensor = torch.tensor(
                np.array(self.pad_char_data(char_ids)).tolist(),
                dtype=torch.long,
                device=device)

            # Word vectors; words without a pre-trained vector get a random
            # one, so results for such sentences can vary between runs.
            embed_words = []
            for word in jieba.lcut(input_str, HMM=True):
                vec = pre_trained.get(word)
                if vec is None:
                    vec = np.random.normal(size=self.word_dim).tolist()
                embed_words.append(vec)

            # Number of real words, taken BEFORE padding. Measuring the
            # padded, batched tensor always yields 1.
            # NOTE(review): assumes the model expects the unpadded word count
            # here, mirroring len_char - confirm against DataManager.
            len_word = torch.tensor([min(len(embed_words), self.word_num)],
                                    dtype=torch.int64,
                                    device=device)
            word_tensor = torch.tensor(
                np.array(self.pad_word_data(embed_words)).tolist(),
                dtype=torch.float,
                device=device)

            # Run the model and get all the predicted entities
            _, paths = self.model(characters=char_tensor,
                                  len_char=len_char,
                                  words=word_tensor,
                                  len_word=len_word)

            # Format the results
            entities = []
            for tag in self.tags:
                tags = get_tags(path=paths[0], tag=tag, tag_map=self.tag_map)
                entities += format_result(result=tags, text=input_str, tag=tag)
            print(entities)

    def load_word_vector(self):
        """
        Load pre-trained word vectors
        """
        if 'pre_trained' not in globals().keys():
            print("Start to load pre-trained word embeddings!!")
            pre_trained = {}
            for i, line in enumerate(
                    codecs.open(self.model_path + "word_vectors.vec",
                                'r',
                                encoding='utf-8')):
                line = line.rstrip().split()
                if len(line) == self.word_dim + 1:
                    pre_trained[line[0]] = np.array(
                        [float(x) for x in line[1:]]).astype(np.float32)
        else:
            pre_trained = globals().get("pre_trained")
        return pre_trained

    def pad_char_data(self, data: list):
        """
        Pad character data
        """
        c_data = copy.deepcopy(data)
        if np.shape(c_data)[0] < self.char_num:
            c_data = c_data + (self.char_num - np.shape(c_data)[0]) * [0]
        else:
            c_data = c_data[:self.char_num]
        c_data = np.expand_dims(c_data, axis=0)
        return c_data

    def pad_word_data(self, data: list):
        """
        Pad word data
        """
        c_data = copy.deepcopy(data)
        if len(c_data) <= self.word_num:
            c_data = c_data + (self.word_num - len(c_data)) * [[0] *
                                                               self.word_dim]
        else:
            c_data = c_data[:self.word_num, :]
        c_data = np.reshape(c_data,
                            [np.shape(c_data)[0] * np.shape(c_data)[1]])
        c_data = np.expand_dims(c_data, axis=0)
        return c_data

    def restore_model(self):
        """
        Restore and load the model
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!!")
        except Exception as error:
            print("Model Failed to restore!!")
Exemple #23
0
class ChineseNER:
    def __init__(self, entry="train"):
        """
        Read the hyper-parameters from the configuration and set up the model.

        :param entry: mode forwarded to ``main_model``
        """
        config = load_config()

        # Each hyper-parameter is stored on the instance under the same name
        # as its key in the configuration.
        hyper_parameters = (
            "embedding_size", "hidden_size", "batch_size", "model_path",
            "dropout", "tags", "learning_rate", "epochs", "weight_decay",
            "transfer_learning", "lr_decay_step", "lr_decay_rate",
            "max_length",
        )
        for key in hyper_parameters:
            setattr(self, key, config.get(key))

        # Model Initialization
        self.main_model(entry)

    def main_model(self, entry):
        """
        Model Initialization
        """
        # The Testing & Inference Process
        if entry == "predict":
            data_map = load_params(path=self.model_path)
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   dropout=0.0,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size,
                                   max_length=self.max_length)
            self.restore_model()

    def restore_model(self):
        """
        Restore the model if there is one
        """
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("Model Successfully Restored!")
        except Exception as error:
            print("Model Failed to restore! {}".format(error))

    def predict(self, input_str):
        """
        Prediction & Inference Stage
        :param input_str: Input Chinese sentence
        :return entities: Predicted entities
        """
        if len(input_str) != 0:
            # Full-width to half-width
            input_str = strQ2B(input_str)
            input_str = re.sub(pattern='。', repl='.', string=input_str)
            text = cut_text(text=input_str, length=self.max_length)

            cut_out = []
            for cuttext in text:
                # Get the embedding vector (Input Vector) from vocab
                input_vec = [self.vocab.get(i, 0) for i in cuttext]

                # convert it to tensor and run the model
                sentences = torch.tensor(input_vec).view(1, -1)

                length = np.expand_dims(np.shape(sentences)[1], axis=0)
                length = torch.tensor(length, dtype=torch.int64, device=device)

                _, paths = self.model(sentences=sentences,
                                      real_length=length,
                                      lengths=None)

                # Get the entities from the model
                entities = []
                for tag in self.tags:
                    tags = get_tags(paths[0], tag, self.tag_map)
                    entities += format_result(tags, cuttext, tag)

                # Get all the entities
                all_start = []
                for entity in entities:
                    start = entity.get('start')
                    all_start.append([start, entity])

                # Sort the results by the "start" index
                sort_d = [
                    value for index, value in sorted(
                        enumerate(all_start),
                        key=lambda all_start: all_start[1])
                ]

                if len(sort_d) == 0:
                    return print("There was no entity in this sentence!!")
                else:
                    sort_d = np.reshape(
                        np.array(sort_d)[:, 1], [np.shape(sort_d)[0], 1])
                    cut_out.append(sort_d)
            return cut_out
        else:
            return print('Invalid input! Please re-input!!\n')
Exemple #24
0
class ChineseNER(object):
    use_gpu = False

    def __init__(self, entry="train"):
        """
        Load the configuration, build the model and optionally move it to GPU.

        :param entry: mode forwarded to the model initialisation
                      ("train" or "predict")
        """
        self.load_config()
        # GPU auto-detection is disabled; ``use_gpu`` keeps its class default:
        # self.use_gpu = torch.cuda.is_available()
        self.__init_model(entry)

        print(self.use_gpu)
        if self.use_gpu:
            # GPU acceleration
            self.model = self.model.cuda()

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            dev_manager = DataManager(batch_size=30, data_type="dev")
            self.dev_batch = dev_manager.iteration()
            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == "predict":
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")

            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()

    def load_config(self):
        """
        Load the hyper-parameters from ``models/config.yml``.

        If the file cannot be read or does not contain a mapping, a default
        configuration is used and written back to ``models/config.yml``.
        The values are stored on the instance as ``embedding_size``,
        ``hidden_size``, ``batch_size``, ``model_path``, ``tags`` and
        ``dropout``.
        """
        try:
            with open("models/config.yml") as fopen:
                # safe_load only builds plain data and needs no Loader
                # argument. A bare yaml.load(stream) raises on current PyYAML,
                # which would silently replace the user's config below.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                raise ValueError("config.yml does not contain a mapping")
        except Exception as error:
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # Must be "tags": that is the key read back below
                "tags": ["ORG", "PER"]
            }
            with open("models/config.yml", "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    #@torchsnooper.snoop()
    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(ner_model.parameters(), lr=0.01)
        for epoch in range(100):
            index = 0
            for batch in self.train_manager.get_batch():
                index += 1
                self.model.zero_grad()
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences, dtype=torch.long)
                tags_tensor = torch.tensor(tags, dtype=torch.long)
                length_tensor = torch.tensor(length, dtype=torch.long)
                if (self.use_gpu):  # gpu加速
                    sentences_tensor = sentences_tensor.cuda()
                    tags_tensor = tags_tensor.cuda()
                    length_tensor = length_tensor.cuda()
                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                if (self.use_gpu):
                    loss = loss.cuda()
                progress = ("█" * int(index * 25 / self.total_size)).ljust(25)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, index, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)
                loss.backward()
                optimizer.step()
                torch.save(self.model.state_dict(),
                           self.model_path + 'params.pkl')

    def get_string(self, x):
        now = x.split('\n')
        o = now[1].split(' ')
        while '' in o:
            o.remove('')
        return o[1]

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        if (self.use_gpu):
            sentences = torch.tensor(sentences, dtype=torch.long).cuda()
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str="", input_path=None):
        if input_path is not None:
            tests = pd.read_csv(input_path)
            with open('output.txt', 'w', encoding='utf-8') as o:
                #o.write('id,aspect,opinion\n')
                for ids in range(1, 2235):
                    input_str = self.get_string(
                        str(tests.loc[ids - 1:ids - 1, ['Review']]))
                    index = int(
                        self.get_string(str(tests.loc[ids - 1:ids - 1,
                                                      ['id']])))
                    input_vec = [self.vocab.get(i, 0) for i in input_str]
                    # convert to tensor
                    if (self.use_gpu):  # gpu加速
                        sentences = torch.tensor(input_vec).view(1, -1).cuda()
                    else:
                        sentences = torch.tensor(input_vec).view(1, -1)
                    _, paths = self.model(sentences)

                    entities = []
                    for tag in self.tags:
                        tags = get_tags(paths[0], tag, self.tag_map)
                        entities += format_result(tags, input_str, tag)
                    entities = sorted(entities, key=lambda x: x['start'])
                    #print(str(index) + "  " + input_str + " " +str(len(entities)))
                    for entity in entities:
                        #print(entity)
                        o.write(
                            str(index) + ',' + entity['type'] + ',' +
                            entity['word'] + '\n')
        else:
            if not input_str:
                input_str = input("请输入文本: ")
            input_vec = [self.vocab.get(i, 0) for i in input_str]
            # convert to tensor
            if (self.use_gpu):  # gpu加速
                sentences = torch.tensor(input_vec).view(1, -1).cuda()
            else:
                sentences = torch.tensor(input_vec).view(1, -1)
            _, paths = self.model(sentences)

            entities = []
            for tag in self.tags:
                tags = get_tags(paths[0], tag, self.tag_map)
                entities += format_result(tags, input_str, tag)
            return entities
Exemple #25
0
class ChineseNER(object):
    def __init__(self, entry="train"):
        """Load the configuration, then set the instance up for ``entry``.

        :param entry: mode handed to the model initialiser
            (``"train"`` by default).
        """
        self.load_config()
        self.__init_model(entry=entry)

    def __init_model(self, entry):
        """Build ``self.model`` and its supporting data for the given mode.

        * ``"train"``: creates the training ``DataManager``, pickles the
          batch size / input size / vocab / tag map via ``save_params``,
          prepares the dev batch iterator and constructs the network.
        * ``"predict"``: reads that data back via ``load_params`` and
          constructs the network from it.

        Both modes end with ``restore_model``; any other ``entry`` value
        leaves the instance unchanged.
        """
        if entry == "predict":
            saved = self.load_params()
            vocab_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")

            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=vocab_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
            return

        if entry != "train":
            return

        manager = DataManager(batch_size=self.batch_size, tags=self.tags)
        self.train_manager = manager
        self.total_size = len(manager.batch_data)
        # Persist the values that predict mode reads back.
        self.save_params({
            "batch_size": manager.batch_size,
            "input_size": manager.input_size,
            "vocab": manager.vocab,
            "tag_map": manager.tag_map,
        })
        self.dev_batch = DataManager(batch_size=30,
                                     data_type="dev").iteration()

        self.model = BiLSTMCRF(
            tag_map=manager.tag_map,
            batch_size=self.batch_size,
            vocab_size=len(manager.vocab),
            dropout=self.dropout,
            embedding_dim=self.embedding_size,
            hidden_dim=self.hidden_size,
        )
        self.restore_model()

    def parse_argument(self):
        """
        :argument
        :return:
        """
        parser = argparse.ArgumentParser(description="NER")
        parser.add_argument("-c",
                            "--config",
                            dest="config_file",
                            type=str,
                            default="./Config/config.cfg",
                            help="config path")
        parser.add_argument("-device",
                            "--device",
                            dest="device",
                            type=str,
                            default="cuda:0",
                            help="device[‘cpu’,‘cuda:0’,‘cuda:1’,......]")
        parser.add_argument("--train",
                            dest="train",
                            action="store_true",
                            default=True,
                            help="train model")
        parser.add_argument("-p",
                            "--process",
                            dest="process",
                            action="store_true",
                            default=True,
                            help="data process")
        parser.add_argument("-t",
                            "--test",
                            dest="test",
                            action="store_true",
                            default=False,
                            help="test model")
        parser.add_argument("--t_model",
                            dest="t_model",
                            type=str,
                            default=None,
                            help="model for test")
        parser.add_argument("--t_data",
                            dest="t_data",
                            type=str,
                            default=None,
                            help="data[train, dev, test, None] for test model")
        parser.add_argument("--predict",
                            dest="predict",
                            action="store_true",
                            default=False,
                            help="predict model")
        args = parser.parse_args()
        # print(vars(args))
        config = configurable.Configurable(config_file=args.config_file)
        config.device = args.device
        config.train = args.train
        config.process = args.process
        config.test = args.test
        config.t_model = args.t_model
        config.t_data = args.t_data
        config.predict = args.predict
        # config
        if config.test is True:
            config.train = False
        if config.t_data not in [None, "train", "dev", "test"]:
            print("\nUsage")
            parser.print_help()
            print("t_data : {}, not in [None, 'train', 'dev', 'test']".format(
                config.t_data))
            exit()
        print("***************************************")
        print("Device : {}".format(config.device))
        print("Data Process : {}".format(config.process))
        print("Train model : {}".format(config.train))
        print("Test model : {}".format(config.test))
        print("t_model : {}".format(config.t_model))
        print("t_data : {}".format(config.t_data))
        print("predict : {}".format(config.predict))
        print("***************************************")

        return config

    def load_config(self):
        """Load hyper-parameters from ``models/config.yml`` onto the instance.

        When the file cannot be read, cannot be parsed, or does not contain a
        mapping, the failure is printed, a default configuration is written
        to the same path and those defaults are used.

        Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags`` and ``dropout``; a key missing from the
        loaded mapping yields ``None`` for that attribute.
        """
        config_path = "models/config.yml"
        try:
            with open(config_path) as fopen:
                # safe_load instead of yaml.load(): calling load() without a
                # Loader is deprecated and rejected by PyYAML 6, which sent
                # every run into the fallback below and overwrote the file.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                # e.g. an empty file parses to None; treat it as unusable.
                raise ValueError("expected a mapping, got {}".format(
                    type(config).__name__))
        except Exception as error:
            # Deliberate best-effort: any failure falls back to defaults.
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 20,
                "dropout": 0.5,
                "model_path": "models/",
                # Key was misspelled "tasg", which left self.tags as None.
                "tags": ["ORG", "PER"]
            }
            with open(config_path, "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    def load_params(self):
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train for 100 epochs over the batches of ``train_manager``.

        For every batch, in this order: zero the gradients, compute the
        negative log-likelihood loss, print a 25-character progress bar with
        the loss, run ``evaluate`` once, back-propagate, step the Adam
        optimizer and save the model's ``state_dict`` to
        ``model_path + 'params.pkl'``.
        """
        optimizer = optim.Adam(self.model.parameters())
        bar_width = 25
        for epoch in range(100):
            batches = self.train_manager.get_batch()
            for step, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                loss = self.model.neg_log_likelihood(
                    torch.tensor(sentences, dtype=torch.long),
                    torch.tensor(tags, dtype=torch.long),
                    torch.tensor(length, dtype=torch.long),
                )

                filled = int(step * bar_width / self.total_size)
                progress = ("█" * filled).ljust(bar_width)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, step, self.total_size,
                    loss.cpu().tolist()[0]))
                # Evaluation runs before the backward pass, as a progress
                # report for the weights that produced this loss.
                self.evaluate()
                print("-" * 50)

                loss.backward()
                optimizer.step()
                checkpoint = self.model_path + 'params.pkl'
                torch.save(self.model.state_dict(), checkpoint)

    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)

        entities = []
        for tag in self.tags:
            tags = get_tags(paths[0], tag, self.tag_map)
            entities += format_result(tags, input_str, tag)
        return entities
Exemple #26
0
class BiLSTMCRFEnter(object):

    def __init__(self, entry="train"):
        """Load the configuration, then set the instance up for ``entry``.

        :param entry: mode handed to the model initialiser
            (``"train"`` by default).
        """
        # The configuration file supplies the attributes used below.
        self.load_config()
        self.__init_model(entry=entry)

    def __init_model(self, entry):
        """Build ``self.model`` and its supporting data for the given mode.

        * ``"train"``: prints the configured tags and the training batches,
          creates the training ``DataManager``, pickles the batch size /
          input size / vocab / tag map via ``save_params``, prepares the dev
          batch iterator and constructs the network.
        * ``"predict"``: reads that data back via ``load_params`` and
          constructs the network from it.

        Both modes end with ``restore_model``; any other ``entry`` value
        leaves the instance unchanged.
        """
        if entry == "train":
            # Manager for the training data set.
            print(self.tags)
            manager = DataManager(batch_size=self.batch_size, tags=self.tags)
            self.train_manager = manager
            print(manager.batch_data)
            batch_count = len(manager.batch_data)
            print(batch_count)
            self.total_size = batch_count
            # Persist the values that predict mode reads back.
            self.save_params({
                "batch_size": manager.batch_size,
                "input_size": manager.input_size,
                "vocab": manager.vocab,
                "tag_map": manager.tag_map,
            })
            # Manager for the dev data set; its iterator feeds evaluate().
            self.dev_batch = DataManager(batch_size=30,
                                         data_type="dev").iteration()

            self.model = BiLSTMCRF(
                tag_map=manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            # Pick up previously saved weights, if there are any.
            self.restore_model()
        elif entry == "predict":
            saved = self.load_params()
            vocab_size = saved.get("input_size")
            self.tag_map = saved.get("tag_map")
            self.vocab = saved.get("vocab")
            self.model = BiLSTMCRF(
                tag_map=self.tag_map,
                vocab_size=vocab_size,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size
            )
            self.restore_model()

    def load_config(self):
        """Load hyper-parameters from ``models/config.yml`` onto the instance.

        When the file cannot be read, cannot be parsed, or does not contain a
        mapping, the failure is printed, a default configuration is written
        to the same path and those defaults are used.

        Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags``, ``dropout`` and ``epoch``. ``epoch``
        defaults to 100 when the key is absent; any other missing key yields
        ``None``.
        """
        config_path = "models/config.yml"
        # Used when the configuration has no "epoch" entry. Without a default
        # self.epoch was None and train() failed on range(None).
        default_epoch = 100
        try:
            with open(config_path) as fopen:
                # safe_load instead of yaml.load(): calling load() without a
                # Loader is deprecated and rejected by PyYAML 6, which sent
                # every run into the fallback below and overwrote the file.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                # e.g. an empty file parses to None; treat it as unusable.
                raise ValueError("expected a mapping, got {}".format(
                    type(config).__name__))
        except Exception as error:
            # Deliberate best-effort: any failure falls back to defaults.
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 100,
                "hidden_size": 128,
                "batch_size": 50,
                "dropout": 0.5,
                "model_path": "models/",
                "epoch": default_epoch,
                # This key used to be misspelled "tasg".
                "tags": ["Medicinal_Name", "Medicinal_Other_Name", "Medicinal_Function", "Medicinal_Taste", "Medicinal_Use_Num"]
            }
            # Rewrite config.yml so the next run finds a valid file.
            with open(config_path, "w") as fopen:
                yaml.dump(config, fopen)
        # Dimension of the embeddings.
        self.embedding_size = config.get("embedding_size")
        # Dimension of the hidden layer.
        self.hidden_size = config.get("hidden_size")
        # Number of samples per training batch.
        self.batch_size = config.get("batch_size")
        # Prefix under which model weights are saved and restored.
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        # Dropout value handed to the model constructor.
        self.dropout = config.get("dropout")
        # Number of training epochs.
        self.epoch = config.get("epoch", default_epoch)

    # Load previously saved model weights (called at the end of both the train and predict set-up).
    def restore_model(self):
        try:
            # 加载模型字典、
            # 这个load_state_dict函数并没有出现在任何一个文件中,所以这是怎么调用的?
            self.model.load_state_dict(torch.load(self.model_path + "params.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    # Pickle the data parameters (batch size, input size, vocab, tag map) during training set-up.
    def save_params(self, data):
        with open("models/data.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    # Read back the data parameters pickled by save_params.
    def load_params(self):
        # pkl文件的读取
        with open("models/data.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
            # print("*"*50+data_map+"*"*50)
        return data_map

    def train(self):
        """Train for ``self.epoch`` epochs over the batches of ``train_manager``.

        For every batch, in this order: zero the gradients, compute the
        negative log-likelihood loss, print a 25-character progress bar with
        the loss, run ``evaluate`` once, back-propagate, step the Adam
        optimizer (learning rate 0.05) and save the model's ``state_dict``
        to ``model_path + 'params.pkl'``.
        """
        # parameters() is not defined in this project; presumably it is
        # inherited from torch.nn.Module -- confirm against BiLSTMCRF.
        optimizer = optim.Adam(self.model.parameters(), lr=0.05)
        bar_width = 25
        for epoch in range(self.epoch):
            batches = self.train_manager.get_batch()
            for step, batch in enumerate(batches, start=1):
                self.model.zero_grad()

                sentences, tags, length = zip(*batch)
                # Loss of the model on this training batch.
                loss = self.model.neg_log_likelihood(
                    torch.tensor(sentences, dtype=torch.long),
                    torch.tensor(tags, dtype=torch.long),
                    torch.tensor(length, dtype=torch.long),
                )

                # Progress report.
                filled = int(step * bar_width / self.total_size)
                progress = ("█" * filled).ljust(bar_width)
                print("""epoch [{}] |{}| {}/{}\n\tloss {:.2f}""".format(
                    epoch, progress, step, self.total_size,
                    loss.cpu().tolist()[0]))
                self.evaluate()
                print("-" * 50)

                # Backward pass, optimizer step, then checkpoint.
                loss.backward()
                optimizer.step()
                checkpoint = self.model_path + 'params.pkl'
                torch.save(self.model.state_dict(), checkpoint)

    # Score one dev batch per tag with f1_score; called from the training loop.
    def evaluate(self):
        sentences, labels, length = zip(*self.dev_batch.__next__())
        _, paths = self.model(sentences)
        print("\teval")
        for tag in self.tags:
            f1_score(labels, paths, tag, self.model.tag_map)

    # Predict entities with the trained model.
    def predict(self, input_str=""):
        if not input_str:
            input_str = input("请输入文本: ")
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).view(1, -1)
        _, paths = self.model(sentences)
        entities = []
        for tag in self.tags:
            # 这里调用了工具类里面的get_tags用来对数据进行标注,就是标一些B-FUNC什么的
            tags = get_tags(paths[0], tag, self.tag_map)
            print(tag)
            print(self.tag_map)
            print(paths[0])
            print(tags)
            entities += format_result(tags, input_str, tag)

        return entities

    # Predict entities for the sentences stored in a file.
    def predict_file(self, f_r_path, f_w_path):
        # 去除重复预测的实体
        duplication = set()
        with open(f_r_path, encoding='utf-8') as f_r:
            with open(f_w_path, 'ab') as f_w:
                for line in f_r.readlines():
                    sent = line.split('\t')[-3].strip()
                    res = self.predict(sent)
                    for i in range(len(res)-1):
                        entity = res[i]['word']
                        tag=res[i]["type"]
                        if entity not in duplication:
                            # print(entity)
                            duplication.add(tag)
                            duplication.add(entity)
                            f_w.write((tag+" : "+entity + '\n').encode())
                        if res[i]["type"]!=res[i+1]["type"]:
                            f_w.write('\n'.encode())
class ChineseNER(object):
    def __init__(self, entry="train"):
        """Load the configuration, then set the instance up for ``entry``.

        :param entry: mode handed to the model initialiser
            (``"train"`` by default).
        """
        self.load_config()
        self.__init_model(entry=entry)

    def __init_model(self, entry):
        if entry == "train":
            self.train_manager = DataManager(batch_size=self.batch_size,
                                             tags=self.tags)
            self.total_size = len(self.train_manager.batch_data)
            data = {
                "batch_size": self.train_manager.batch_size,
                "input_size": self.train_manager.input_size,
                "vocab": self.train_manager.vocab,
                "tag_map": self.train_manager.tag_map,
            }
            self.save_params(data)
            self.dev_manager = DataManager(batch_size=60, data_type="dev")
            # 验证集
            # self.dev_batch = self.dev_manager.iteration()

            self.model = BiLSTMCRF(
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.model = self.model.cuda()
            self.restore_model()
        elif entry == "predict" or "evaluate":
            # python main.py predict
            data_map = self.load_params()
            input_size = data_map.get("input_size")
            self.tag_map = data_map.get("tag_map")
            self.vocab = data_map.get("vocab")
            print('input_size', input_size)
            print('tag_map', self.tag_map)
            self.model = BiLSTMCRF(tag_map=self.tag_map,
                                   vocab_size=input_size,
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.model = self.model.cuda()
            self.test_manager = DataManager(batch_size=60, data_type="dev")
            self.restore_model()

    # Load the configuration values.
    def load_config(self):
        """Load hyper-parameters from ``models/config.yml`` onto the instance.

        When the file cannot be read, cannot be parsed, or does not contain a
        mapping, the failure is printed, a default configuration is written
        to the same path and those defaults are used.

        Sets ``embedding_size``, ``hidden_size``, ``batch_size``,
        ``model_path``, ``tags`` and ``dropout``; a key missing from the
        loaded mapping yields ``None`` for that attribute.
        """
        config_path = "models/config.yml"
        try:
            with open(config_path) as fopen:
                # safe_load instead of yaml.load(): calling load() without a
                # Loader is deprecated and rejected by PyYAML 6, which sent
                # every run into the fallback below and overwrote the file.
                config = yaml.safe_load(fopen)
            if not isinstance(config, dict):
                # e.g. an empty file parses to None; treat it as unusable.
                raise ValueError("expected a mapping, got {}".format(
                    type(config).__name__))
        except Exception as error:
            # Deliberate best-effort: any failure falls back to defaults.
            print("Load config failed, using default config {}".format(error))
            config = {
                "embedding_size": 300,
                "hidden_size": 128,
                "batch_size": 30,
                "dropout": 0.5,
                "model_path": "models/",
                "tags": ["TREATMENT", "BODY", "SIGNS", "CHECK", "DISEASE"]
            }
            with open(config_path, "w") as fopen:
                yaml.dump(config, fopen)
        self.embedding_size = config.get("embedding_size")
        self.hidden_size = config.get("hidden_size")
        self.batch_size = config.get("batch_size")
        self.model_path = config.get("model_path")
        self.tags = config.get("tags")
        self.dropout = config.get("dropout")

    # Restore previously saved model weights, if any.
    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(self.model_path + "params_6all.pkl"))
            print("model restore success!")
        except Exception as error:
            print("model restore faild! {}".format(error))

    # Save the data parameters (pickled to models/data_6all.pkl).
    def save_params(self, data):
        with open("models/data_6all.pkl", "wb") as fopen:
            pickle.dump(data, fopen)

    # Load the data parameters (unpickled from models/data_6all.pkl).
    def load_params(self):
        with open("models/data_6all.pkl", "rb") as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        """Train on CUDA for epochs 268 through 400.

        Uses Adam (weight decay 0.002, learning rate 4e-7) with a
        ``ReduceLROnPlateau`` scheduler stepped on the average epoch loss.
        Per batch: zero the gradients, compute the negative log-likelihood
        loss, back-propagate, step the optimizer and print a progress line
        showing the current loss, the loss of the same batch position in the
        previous epoch, the CPU time spent and the best average loss so far.

        Per epoch: the weights are saved to
        ``model_path + 'params_6all.pkl'`` whenever the average loss drops
        below the best value seen (initially 240), the average is logged to
        ``writer``, and on every fifth epoch ``evaluate`` runs on the dev
        manager. ``writer`` is defined outside this method (presumably a
        TensorBoard summary writer -- confirm) and is closed at the end.
        """
        optimizer = optim.Adam(self.model.parameters(),
                               weight_decay=0.002,
                               lr=0.0000004)  # 0.000001
        scheduler_lr = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                            mode='min',
                                                            factor=0.5,
                                                            patience=2,
                                                            cooldown=5,
                                                            verbose=True,
                                                            min_lr=1e-8,
                                                            eps=1e-8)
        best_loss = 240
        bar_width = 60
        # Loss of each batch position during the previous epoch.
        previous = [0] * self.total_size
        report = """epoch [{}] |{}| {}/{}\n\tloss {:.3f}\t\tlast_loss {:.3f}\t\ttime {}\t\tbest_avg_loss {:.3f}"""
        for epoch in range(268, 401):
            epoch_losses = []
            epoch_started = time.process_time()
            batches = self.train_manager.get_batch()
            for step, batch in enumerate(batches, start=1):
                batch_started = time.process_time()
                self.model.zero_grad()

                # `length` holds the original length of each sentence.
                sentences, tags, length = zip(*batch)
                sentences_tensor = torch.tensor(sentences,
                                                dtype=torch.long).cuda()
                tags_tensor = torch.tensor(tags, dtype=torch.long).cuda()
                length_tensor = torch.tensor(length, dtype=torch.long).cuda()

                loss = self.model.neg_log_likelihood(sentences_tensor,
                                                     tags_tensor,
                                                     length_tensor)
                batch_loss = loss.cpu().item()
                epoch_losses.append(batch_loss)
                filled = int(step * bar_width / self.total_size)
                progress = ("█" * filled).ljust(bar_width)
                loss.backward()
                optimizer.step()
                elapsed = time.process_time() - batch_started
                print(report.format(epoch, progress, step, self.total_size,
                                    loss.cpu().tolist()[0],
                                    previous[step - 1], str(elapsed),
                                    best_loss))
                previous[step - 1] = batch_loss
                print("-" * 90)
            total_time = time.process_time() - epoch_started
            avg_loss = np.mean(epoch_losses)
            # Keep only the best model seen so far.
            if avg_loss < best_loss:
                best_loss = avg_loss
                checkpoint = self.model_path + 'params_6all.pkl'
                torch.save(self.model.state_dict(), checkpoint)
            writer.add_scalar('BiLstm_CRF:avg_loss-epoch', avg_loss, epoch)
            print('epoch ', epoch, '   avg_loss ', avg_loss, '   total_time ',
                  total_time)
            if epoch % 5 == 0:
                self.evaluate(epoch / 5, manager=self.dev_manager)
            print("-" * 100)
            scheduler_lr.step(avg_loss)
        writer.close()

    # train: BODY 7507, SIGNS 6355, CHECK 6965, DISEASE 474, TREATMENT 805
    # test:
    # Compute recall, precision and F1 to evaluate the model.
    def evaluate(self, epoch, manager, add_scalar=True):
        print('正在开始评估')
        all_origins = all_founds = all_rights = 0
        for tag in self.tags:
            origins = founds = rights = 0
            for batch in manager.get_batch():
                sentences, labels, length = zip(*batch)
                _, paths = self.model(sentences)
                origin, found, right = f1_score(labels, paths, tag,
                                                self.model.tag_map)
                origins += origin
                founds += found
                rights += right
            all_origins += origins
            all_founds += founds
            all_rights += rights
            recall = 0. if origins == 0 else (rights / origins)
            precision = 0. if founds == 0 else (rights / founds)
            f1 = 0. if recall + precision == 0 else (
                2 * precision * recall) / (precision + recall)
            print("\t{}\torigins:{}\t\t\tfounds:{}\t\t\trights:{}".format(
                tag, origins, founds, rights))
            print("\t\t\trecall:{}\tprecision:{}\tf1:{}".format(
                recall, precision, f1))
            if add_scalar:
                tag_epoch = tag + '-5epoch'
                writer.add_scalars(tag_epoch, {
                    'recall': recall,
                    'precision': precision,
                    'f1': f1
                }, epoch)
        all_recall = 0. if all_origins == 0 else (all_rights / all_origins)
        all_precision = 0. if all_founds == 0 else (all_rights / all_founds)
        all_f1 = 0. if all_recall + all_precision == 0 else (
            2 * all_precision * all_recall) / (all_precision + all_recall)
        print("\tall_origins:{}\t\t\tall_founds:{}\t\t\tall_rights:{}".format(
            all_origins, all_founds, all_rights))
        print("\tall_recall:{}\tall_precision:{}\tall_f1:{}".format(
            all_recall, all_precision, all_f1))
        if add_scalar:
            writer.add_scalars(
                "ALL-5epoch", {
                    'all_recall': all_recall,
                    'all_precision': all_precision,
                    'all_f1': all_f1
                }, epoch)
        print('评估结束')
        return all_recall, all_precision, all_f1

    # Prediction method.
    def predict(self, input_str=""):
        """Extract entities from ``input_str`` on CUDA and print them.

        An empty ``input_str`` is replaced by text read interactively from
        stdin. Each character is looked up in ``self.vocab`` (index 0 when
        missing), the indices are fed to the model as a ``(1, n)`` long
        tensor on the GPU, and the first predicted path is decoded once per
        configured tag. The result is printed both raw and as indented JSON.

        :param input_str: text to analyse.
        :return: the concatenated ``format_result`` output for every tag.
        """
        text = input_str or input("请输入文本: ")
        # Vocabulary index of every character of the input.
        char_ids = [self.vocab.get(ch, 0) for ch in text]
        # convert to tensor
        batch = torch.tensor(char_ids, dtype=torch.long).view(1, -1)
        batch = batch.cuda()
        # `paths` holds the predicted tag indices.
        _, paths = self.model(batch)

        entities = []
        for tag in self.tags:
            spans = get_tags(paths[0], tag, self.tag_map)
            entities.extend(format_result(spans, text, tag))
        print(entities)
        print(json.dumps(entities, indent=4, ensure_ascii=False))
        return entities
Exemple #28
0
    #单字个数,包括数据集中所有出现的字符
    vocab_size = 4688

    #词嵌入维度
    embed_size = 128

    #lstm结构中隐藏层维度
    units = 64

    #标签类别
    tag_list = ['b', 'm', 'e', 's']
    num_tags = len(tag_list)

    datafile = "./data/data.txt"

    my_model = BiLSTMCRF(vocab_size, embed_size, units, num_tags)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    lstm_acc = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')

    #optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    #optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    #optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1)
    optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
    #优化器大致效果Adagrad>Adam>RMSprop>SGD

    #设置checkpoint,只保存最新的3个
    ckpt = tf.train.Checkpoint(my_model=my_model, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              './save_checkpoint/',