-
Notifications
You must be signed in to change notification settings - Fork 0
/
initial_named_entitiy_recognition.py
83 lines (64 loc) · 2.28 KB
/
initial_named_entitiy_recognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from nltk.corpus import conll2002
import numpy as np
def word2features(sent, i):
    """Return the feature list for the element at position *i* of *sent*.

    ``sent[i][0]`` is read as a pair whose item 0 is the word and whose item 1
    is its POS tag; only that first pair of ``sent[i]`` is inspected.
    (Presumably ``sent[i]`` is an NLTK chunk subtree and the pair is its first
    leaf -- TODO confirm against the corpus reader.)

    The returned list has 11 entries, in this order: the constant ``'bias'``,
    the lower-cased word, the last 3 / 2 / 1 characters of the word, the first
    1 / 2 / 3 characters of the word, ``word.isupper()``, ``word.isdigit()``
    and the POS tag.
    """
    first_leaf = sent[i][0]
    word = first_leaf[0]
    postag = first_leaf[1]

    lowered = word.lower()
    # Suffixes run from longest to shortest, prefixes from shortest to longest.
    suffixes = [word[-width:] for width in (3, 2, 1)]
    prefixes = [word[:width] for width in (1, 2, 3)]
    shape_flags = [word.isupper(), word.isdigit()]

    return ['bias', lowered] + suffixes + prefixes + shape_flags + [postag]
def sent2features(sent):
    """Return ``word2features(sent, i)`` for every position ``i`` of *sent*.

    NOTE: a later ``def sent2features`` in this module rebinds the name, so
    this unfiltered variant is not the one the script below ends up calling.
    """
    rows = []
    for position in range(len(sent)):
        rows.append(word2features(sent, position))
    return rows
def sent2labels(sent):
    """Return the third item of every ``(token, postag, label)`` triple in *sent*.

    Each element of *sent* must unpack into exactly three values.

    NOTE: a later ``def sent2labels`` in this module rebinds the name, so this
    triple-based variant is not the one the script below ends up calling.
    """
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels
def sent2tokens(sent):
    """Return the first item of every ``(token, postag, label)`` triple in *sent*.

    Each element of *sent* must unpack into exactly three values.

    NOTE: ``sent2tokens`` is defined a second time further down in this module
    with the same body; that later definition is the one left bound.
    """
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens
def sent2features(sent):
    """Return one feature list per non-tuple element of *sent*.

    Elements of *sent* that are tuples are skipped; for every other element
    ``word2features(sent, i)`` is called with that element's position.
    (Presumably the tuples are plain ``(word, tag)`` tokens outside any chunk
    and the remaining elements are NLTK chunk subtrees -- TODO confirm.)

    ``sent2labels`` in this module skips tuples in the same way, so the
    features returned here line up one-to-one with its labels.
    """
    return [
        word2features(sent, i)
        for i, element in enumerate(sent)
        # isinstance (rather than an exact type() comparison) also skips
        # tuple subclasses such as namedtuples.
        if not isinstance(element, tuple)
    ]
def sent2labels(sent):
    """Return the label of every non-tuple element of *sent*, in order.

    Elements of *sent* that are tuples are skipped. Every other element must
    provide a ``label()`` method (as ``nltk.tree.Tree`` does); its return
    value is collected.

    ``sent2features`` in this module skips tuples in the same way, so the
    labels returned here line up one-to-one with its feature lists.
    """
    # Use the public label() accessor instead of reaching into the private
    # ``_label`` attribute of the chunk object.
    return [
        element.label()
        for element in sent
        if not isinstance(element, tuple)
    ]
def sent2tokens(sent):
    """Return the first item of every ``(token, postag, label)`` triple in *sent*.

    Each element of *sent* must unpack into exactly three values.

    NOTE(review): unlike ``sent2features`` / ``sent2labels`` defined just above,
    this helper does not skip tuples or read chunk labels, so it presumably
    expects a different sentence layout than they do -- confirm before using
    it on the same data.
    """
    return list(token for token, _postag, _label in sent)
def _numeric_column_stats(rows):
    """Return ``{column index: (mean, std)}`` for the numeric columns of *rows*.

    A column counts as numeric when its value in the first row is an int or a
    float (bool is an int subclass, so boolean flags qualify). String-valued
    feature columns are left out: they have no mean or standard deviation and
    passing them to ``np.mean`` raises.
    """
    if not rows:
        return {}
    stats = {}
    for index, sample in enumerate(rows[0]):
        if not isinstance(sample, (int, float)):
            continue
        column = np.array([row[index] for row in rows], dtype=float)
        stats[index] = (np.mean(column), np.std(column))
    return stats


def _standardize(rows, stats):
    """Z-score the columns listed in *stats*, modifying *rows* in place."""
    for index, (mean, sd) in stats.items():
        # A constant column has sd == 0; dividing by 1 maps it to 0.0 instead
        # of producing NaN from 0 / 0.
        scale = sd if sd > 0 else 1.0
        for row in rows:
            row[index] = (row[index] - mean) / scale


etr = conll2002.chunked_sents('esp.train') # In Spanish
eta = conll2002.chunked_sents('esp.testa') # In Spanish
etb = conll2002.chunked_sents('esp.testb') # In Spanish
dtr = conll2002.chunked_sents('ned.train') # In Dutch
dta = conll2002.chunked_sents('ned.testa') # In Dutch
dtb = conll2002.chunked_sents('ned.testb') # In Dutch

# Only the Spanish train / testb splits are referenced by the code below.
train_sents = etr
test_sents = etb

# One flat list of feature rows (and one flat list of labels) across all
# training sentences.
X_train = [features for s in train_sents for features in sent2features(s)]

# normalizing the values of x:
# The statistics are taken once, from the raw training rows, and reused for
# the test rows. Recomputing them from X_train after it has been rescaled in
# place would just yield mean ~0 / std ~1 and leave the test rows unscaled.
feature_stats = _numeric_column_stats(X_train)
_standardize(X_train, feature_stats)

y_train = [label for s in train_sents for label in sent2labels(s)]

X_test = [features for s in test_sents for features in sent2features(s)]

# normalizing the values of x:
_standardize(X_test, feature_stats)

y_test = [label for s in test_sents for label in sent2labels(s)]