Beispiel #1
0
 def process_X(self, data, word2idx, max_sentence_length):
     """Turn raw data into padded sequences of word indices.

     Every sentence is mapped to a list of word ids via ``word2idx``,
     then right-padded with the "PAD" id up to ``max_sentence_length``.
     """
     getter = SentenceGetter(data, label_adapter=get_label)
     indexed = [[word2idx[token[0]] for token in sentence]
                for sentence in getter.sentences]
     return pad_sequences(sequences=indexed,
                          maxlen=max_sentence_length,
                          padding="post",
                          value=word2idx["PAD"])
Beispiel #2
0
 def process_Y(self, data, tag2idx, max_sentence_length, n_tags):
     """Build padded, one-hot encoded label sequences.

     Returns a tuple ``(Y, Y_str)``: ``Y`` is a one-hot array with
     ``n_tags + 1`` classes (the extra class is PAD) and ``Y_str`` is
     the unpadded list of integer tag ids per sentence.
     """
     getter = SentenceGetter(data, label_adapter=get_label)
     tag_ids = [[tag2idx[token[1]] for token in sentence]
                for sentence in getter.sentences]
     # Keep an unpadded copy around before padding mutates the shape.
     Y_str = copy.deepcopy(tag_ids)
     padded = pad_sequences(sequences=tag_ids,
                            maxlen=max_sentence_length,
                            padding="post",
                            value=tag2idx["PAD"])
     # n_tags + 1 classes: one extra slot for the PAD label.
     Y = np.array([to_categorical(row, num_classes=n_tags + 1)
                   for row in padded])
     return Y, Y_str
Beispiel #3
0
import pandas as pd
import numpy as np
from utils import SentenceGetter
from sklearn_crfsuite import CRF
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_predict now lives in sklearn.model_selection (as the
# second example below already does).
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

# Load the NER dataset; the "Sentence #" column is sparse, so
# forward-fill propagates each sentence id down to its tokens.
data = pd.read_csv("./data/ner_dataset.csv", encoding="utf8")
data = data.ffill()  # fillna(method="ffill") is deprecated in pandas >= 2.1
sentences = SentenceGetter(data).sentences

# Features:
# 1. word
# 2. part-of-speech tag
# 3. comparison candidate word (0 or 1)
# 4. heuristic position (0 or 1)
# 5. shallow syntactic chunk

# Templates:
# 1. all features of the three words before and after the word (5*7 = 35)
# 2. pairwise combinations of this word's own features (7)
# 3. same-feature combinations of adjacent words (10*3 = 30)
# 4. same-feature combinations in a window of 1 around the center word (4*3 = 12)
# Total features: 84






# whether the word lies between a preposition and a comparison word
# CRF
import argparse
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23;
# the documented migration is to import joblib directly.
import joblib
from utils import load_data, SentenceGetter, sent2features, sent2labels
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

# 1 Load the data
ner_dataset_dir = '../data/ner_dataset.csv'
data = load_data(ner_dataset_dir)

# 2 Build the dataset: per-sentence feature dicts and label sequences
getter = SentenceGetter(data)
sentences = getter.sentences
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]


# 3 CRF training
def train():
    """Train an L-BFGS CRF tagger with 5-fold CV and print a per-tag report.

    Reads the module-level ``X`` (feature dicts) and ``y`` (label
    sequences) built above; prints a flat classification report.
    """
    tagger = CRF(algorithm='lbfgs',
                 c1=10,
                 c2=0.1,
                 max_iterations=100,
                 all_possible_transitions=True)

    predictions = cross_val_predict(estimator=tagger, X=X, y=y, cv=5)
    print(flat_classification_report(y_pred=predictions, y_true=y))
# Load the previously saved Keras model from disk and print its layers.
# NOTE(review): `tf` and `full_model_name` are defined outside this view.
model = tf.keras.models.load_model(full_model_name)
model.summary()
# Directory of plain-text files to run NER inference on.
file_directory= '../../sample_texts/inputs/'
for file_name in os.listdir(file_directory):
    file_path = f'{file_directory}/{file_name}'
    file = open(file_path, mode='r')
    test_file_content = file.read()
    file.close()
    #print(test_file_content)
    #test_file_content = re.sub(r"[^\w\s.\d]", "", test_file_content)

    test_words = word_tokenize(test_file_content)
    test_data = pd.DataFrame(test_words, index=test_words)
    X_te = model_instance.process_X(test_data, word2idx, max_sentence_length)
    X_char_te = get_char_indices(test_data, max_word_length, max_sentence_length, char2idx)
    sentence_getter = SentenceGetter(test_data, label_adapter=get_label)
    padding_start = [len(s) for s in sentence_getter.sentences]
    X_te = np.asarray(X_te).astype(np.float32)

    sentence_preds = model.predict([X_te, X_char_te])
    sentence_preds = [s[:padding_start_index] for s, padding_start_index in zip(sentence_preds, padding_start)]

    predicted_labels = [idx2tag[np.argmax(prob)] for sentence in sentence_preds for prob in sentence]
    food_entities = []

    all_sentences = list(test_data.index)
    food_pieces = []
    for index, label in enumerate(predicted_labels):
        if label == 'B-FOOD':
            food_pieces=[all_sentences[index]]
        elif label == 'I-FOOD' and len(food_pieces) > 0: