Example #1
0
    def test_load_from_config(self):
        ww = get_window_data_set(train = True)
        print(ww)

        self.assertTrue(ww.num_tokens > 0)
        self.assertTrue(ww.num_tokens > 0)


        ww2 = get_window_data_set('../data/esp/esp.train')
        # ww2 = get_window_data_set('/home/user/dsf/named_entity_recognition/data/esp/esp.train')
        print(ww2)

        ww3 = get_window_data_set(data = (['wo', 'bu', 'shi', 'zhong', 'wen'],[],[]))
        print(ww3)
Example #2
0
def seq_ner(sentences, model):
    # 默认的输入的格式是list of str
    sentences = [list(sent.strip().replace(' ', '')) for sent in sentences]

    if model == 'mlp':
        # mlp 预测
        model_path = get_model_path('mlp')
        conf = estimators.load_conf(model_path)
        mlp = estimators.MultiLayerPerceptron(conf)
        mlp.load()
        test = get_window_data_set(train=False)
        tags = []
        for sentence in sentences:
            test.set_data(data=(sentence, [], []))
            mlp_pro = mlp.proba(test)
            # del mlp
            rev_dict = {k: v for v, k in conf.tag_dict.items()}
            pred = [np.argmax(mlp_pro[i])
                    for i in range(len(mlp_pro))]  # 获取每个字对应类别概率最大的类别下标
            tag = [rev_dict[tag] for tag in pred]
            tags.append(tag)
        return sentences, tags, uti.get_entity(sentences, tags)
        # return sentences, tags, uti.get_entity_sent(list(chain.from_iterable(sentences)), tags)
    else:
        # crf 预测
        model_path = get_model_path('crf')
        crf = estimators.CRF(model_dir=model_path)
        test = get_crf_data_set(data=(sentences, [], []), train=False)
        # test.set_data(data=(sentences, [], []))
        crf_pro = crf.proba(test)
        tags = crf.predict(test)
        tags = uti.clear_single_tag(tags)
        return sentences, tags, uti.get_entity(sentences, tags)
Example #3
0
def train(path=None, crf_flag=True, mlp_flag=False):
    """
    第一步,根据配置文件获得初始模型参数,并初始化模型
    第二部,数据模型导入输入和接口
    第二步,根据NER的模型,转换数据的格式
    第三步,训练NER模型,并保存到指定的位置。

    :return:
    """

    # 还需要根据配置文件初始化

    if mlp_flag:
        train_set = get_window_data_set(path)
        print(train_set)
        conf = get_default_mlp_config()
        conf.n_classes = len(train_set.labels)
        conf.n_input = train_set.dictionary.vector_size * train_set._windows_size
        conf.labels = train_set.labels
        conf.tag_dict = train_set.tag_dict

        mlp = estimators.MultiLayerPerceptron(config=conf)
        mlp.fit(train_set)

        test_set = get_window_data_set(path, language='zh', train=False)
        # mlp.evaluate_wn(test_set)       # 重写评估方式,按识别出的实体个数统计P、R、F值
        mlp.evaluate(test_set)

        model_dir = get_model_path('mlp')
        mlp.save(model_dir)

    if crf_flag:
        train_set = get_crf_data_set(path)
        print(train_set)
        test_set = get_crf_data_set(path, train=False)
        print(test_set)

        model_dir = get_model_path('crf')
        crf = estimators.CRF(model_dir=model_dir)

        # 如果存在训练好的crf模型,则无须再训练
        # if not os.listdir(model_dir):
        crf.fit(train_set)
        crf.evaluate_wn(test_set)  # 重写评估方式,按识别出的实体个数统计P、R、F值
        crf.evaluate(test_set)

    return
    def test_predict(self):
        model_path = get_model_path('mlp')
        default_conf = load_conf(model_path=model_path)
        mlp = MultiLayerPerceptron(default_conf)
        mlp.load()

        ww = get_window_data_set()
        ww.set_data(data=([list('我来到中国'), list('今天我和张建国去东南大学做报告')], [], []))

        print(mlp.predict(ww))
Example #5
0
    def test_word2vec(self):
        word = '中'
        ww = get_window_data_set()
        fea = ww.word2vec(word)
        print(fea)

        self.assertTrue(isinstance(fea, list))


        fea = ww.word2vec('nn')
        self.assertTrue(isinstance(fea, list))

        fea = ww.word2vec('space')
        self.assertTrue(isinstance(fea, list))

        self.assertTrue(sum(fea) == 0)
Example #6
0
def seq_label(sentence, flags, ensemble=False):
    sent = list(sentence)

    if not flags:
        raise ValueError()

    r1 = []
    r2 = []
    y_true = []
    labels = []
    rev_dict = {}

    if flags['mlp']:
        model_path = get_model_path('mlp')
        conf = estimators.load_conf(model_path)
        labels = conf.labels

        mlp = estimators.MultiLayerPerceptron(conf)
        mlp.load()

        test = get_window_data_set(train=False)
        y_true = test.extract_label(test.tags)
        # test.set_data(data=([sent], [], []))

        rev_dict = {k: v for v, k in conf.tag_dict.items()}
        # print(list(rev_dict[token] for token in mlp.predict(test)))
        r1 = mlp.proba(test)

        # mlp.evaluate(test)
        # mlp.proba(DataSet.w2v_DataSet(data=([sent], [], []), dictionary_path=dic_path, word2vec=True, window_size=7,
        #                               vector_size=100))

    if flags['crf']:
        model_path = get_model_path('crf')

        crf = estimators.CRF(model_dir=model_path)

        test = get_crf_data_set(train=False)
        # test.set_data(data=([sent], [], []))
        # re = crf.predict_sent(DataSet.crf_DataSet(path=None, data=([sent], [], [])))
        r2 = crf.proba(test)
        # crf.evaluate(test)

        r_pred = np.argmax(r2, 1)
        # print(len(r2))
        # print(len(y_true))
        # print(precision_score(r_pred, y_true, len(labels)))

    y_pred = r1 * 0.2 + r2 * 0.8
    y_pred = np.argmax(y_pred, 1)

    print(precision_score(y_pred, y_true, len(labels)))
    print(recall_score(y_pred, y_true, len(labels)))
    print(
        classification_report(y_true,
                              np.argmax(r2, 1),
                              labels=list(range(1, len(labels))),
                              target_names=labels[1:],
                              digits=4))
    print(
        classification_report(y_true,
                              y_pred,
                              labels=list(range(1, len(labels))),
                              target_names=labels[1:],
                              digits=4))
Example #7
0
 def test_ww_extract_sentence(self):
     ww = get_window_data_set()
     tag = ww.extract_label_from_sentence(ww.tags[12])
     print('tag' , tag)
Example #8
0
    def test_extract_sentence(self):

        sent = list('我喜欢中国,但我更爱北京')
        ww = get_window_data_set()
        ww.extract_feature_from_sentence(sent)
Example #9
0
    def test_load_default_ww(self):
        ww = get_window_data_set(train = True)
        print(ww)

        ww = get_window_data_set(train = False)
        print(ww)