-
Notifications
You must be signed in to change notification settings - Fork 0
/
stanford.py
112 lines (99 loc) · 4.95 KB
/
stanford.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# This function tags a given TSV file with named entities using the Stanford NER tagger.
import os
# Absolute path of the Java executable that NLTK is pointed at below.
java_path = "C:/Program Files (x86)/Common Files/Oracle/Java/javapath/java.exe"
# Expose the executable through the JAVAHOME environment variable ...
os.environ["JAVAHOME"] = java_path
import nltk
# ... and also hand the very same path to NLTK's Java configuration directly.
nltk.internals.config_java(java_path)
from nltk.tag.stanford import StanfordNERTagger
def stanford_ner_tagger(stanford_dir, jarfile, modelfile, tag_this_file, output_file):
    """Tag the first column of a column-formatted file with Stanford NER.

    The first whitespace-separated field of every non-empty line of
    ``tag_this_file`` is taken as a token; every empty line is replaced by the
    placeholder token ``SENEND``. The whole token sequence is tagged with a
    single ``StanfordNERTagger.tag`` call and the result is written to
    ``output_file`` as ``token<TAB>tag`` lines.

    Args:
        stanford_dir: Directory prefix of the Stanford NER distribution. It is
            joined to the two names below by plain string concatenation, so it
            must end with a path separator.
        jarfile: Name of the Stanford NER jar, relative to ``stanford_dir``.
        modelfile: Name of the serialized classifier, relative to
            ``stanford_dir``.
        tag_this_file: Path of the column-formatted input file.
        output_file: Path of the TSV file to write.

    Returns:
        None. The tagged tokens are only written to ``output_file``.
    """
    jar_path = stanford_dir + jarfile
    model_path = stanford_dir + modelfile

    # Creating Tagger Object
    st = StanfordNERTagger(model_filename=model_path, path_to_jar=jar_path, encoding='utf-8')

    # NOTE(review): both files below are opened with the platform default
    # encoding while the tagger itself is configured for UTF-8 -- confirm that
    # the corpora are ASCII or that the default encoding is UTF-8.
    tokens = []
    with open(tag_this_file, "r") as corpus:
        for row in corpus:
            columns = row.split()
            if columns:
                tokens.append(columns[0])
            else:
                # SENEND marks a sentence boundary in the flat token list so
                # that the output keeps the input's line structure; the
                # markers have to be removed from the output file afterwards.
                # Per the original note, some "/" tokens are still dropped by
                # the tagger and are restored in the post-processing step.
                tokens.append("SENEND")

    # Tag the file using Stanford NER
    tagged = st.tag(tokens)

    # Write the results to a tsv file
    with open(output_file, "w") as results:
        for token, label in tagged:
            results.write(str(token) + "\t" + label + "\n")


# Tagging the CoNLL-03 English testa split.
# NOTE(review): these arguments spell out the jar and classifier that were
# actually in effect for this run (stanford-ner.jar with the
# english.all.3class model). The values passed here before
# ('classifiers/stanford-ner-2018-02-27/',
# 'stanford-english-corenlp-2018-02-27-models.jar',
# "classifiers/english.conll.4class.distsim.crf.ser.gz") were never used by
# the function. Switch the model here if the CoNLL-trained one is wanted.
stanford_ner_tagger(
    'D:\\NLP\\ner_evals\\classifiers\\stanford-ner-2018-02-27\\',
    'stanford-ner.jar',
    'classifiers\\english.all.3class.distsim.crf.ser.gz',
    "data/conll03/eng.testa",
    "stanford_trained_with_conll_tested_on_conlltesta",
)
#
## Tagging europarl with conll model
#stanford_ner_tagger('classifiers/stanford-ner-2016-10-31/', \
# 'stanford-german-corenlp-2016-10-31-models.jar', \
# "classifiers/german.conll.hgc_175m_600.crf.ser.gz", \
# "corpora/europarl/ep-96-04-15_pado_annotated.tsv", \
# "stanford_trained_with_conll_tested_on_europarl")
#
## Tagging germeval with conll model
#stanford_ner_tagger('classifiers/stanford-ner-2016-10-31/', \
# 'stanford-german-corenlp-2016-10-31-models.jar', \
# "classifiers/german.conll.hgc_175m_600.crf.ser.gz", \
# "corpora/germaner/NER-de-test-conll-formated.txt", \
# "stanford_trained_with_conll_tested_on_germeval")
#
## Tagging conll with germeval model
#stanford_ner_tagger('classifiers/stanford-ner-2016-10-31/', \
# 'stanford-german-corenlp-2016-10-31-models.jar', \
# "classifiers/ner-model-germaeval.ser.gz", \
# "corpora/conll2003/deuutf.testa", \
# "stanford_trained_with_germeval_tested_on_conlltesta")
#
## Tagging europarl with germeval model
#stanford_ner_tagger('classifiers/stanford-ner-2016-10-31/', \
# 'stanford-german-corenlp-2016-10-31-models.jar', \
# "classifiers/ner-model-germaeval.ser.gz", \
# "corpora/europarl/ep-96-04-15_pado_annotated.tsv", \
# "stanford_trained_with_germeval_tested_on_europarl")
#
## Tagging germeval with germeval model
#stanford_ner_tagger('classifiers/stanford-ner-2016-10-31/', \
# 'stanford-german-corenlp-2016-10-31-models.jar', \
# "classifiers/ner-model-germaeval.ser.gz", \
# "corpora/germaner/NER-de-test-conll-formated.txt", \
# "stanford_trained_with_germeval_tested_on_germeval")
def postprocess(tagged_files, placeholder_lines=(64899, 99279)):
    """Convert raw tagger output into the final TSV under ``classified_output/``.

    Reads ``tagged_files`` (one ``token<TAB>tag`` row per line) and writes
    ``classified_output/<tagged_files>.tsv``. While copying it:

    * turns every ``SENEND`` marker row into an empty line;
    * writes a ``<>``/``O`` placeholder row in front of each 1-based input
      line number listed in ``placeholder_lines``;
    * prints rows that carry no tab-separated tag (together with their line
      number) and leaves them out of the output instead of crashing.

    Args:
        tagged_files: Path of the tagger output to clean up; it is also used
            as the base name of the result file.
        placeholder_lines: 1-based input line numbers before which a
            placeholder row is inserted. The default keeps the two positions
            this script has always patched (per the original comment, tokens
            Stanford NER ignored in the germaner CoNLL-formatted file). Pass
            ``()`` to disable the patching for other corpora.

    Returns:
        None. The cleaned rows are only written to the output file.
    """
    output = "classified_output/" + tagged_files + ".tsv"
    # open(..., "w") does not create missing directories, so make sure the
    # target directory exists before writing.
    os.makedirs(os.path.dirname(output), exist_ok=True)

    with open(tagged_files, "r") as pre, open(output, "w") as post:
        for line_no, raw in enumerate(pre, start=1):
            # Fixing ignored tokens by inserting a placeholder row.
            if line_no in placeholder_lines:
                post.write("<>" + "\t" + "O" + "\n")

            # Strip the line terminator first so that the marker comparison
            # and the written row do not depend on which column carried it.
            fields = raw.rstrip("\n").split("\t")
            if fields[0] == "SENEND":
                post.write("\n")
            elif len(fields) < 2:
                # Row without a tag column: report it rather than indexing
                # past the end of the list.
                print(fields, line_no)
            else:
                post.write(fields[0] + "\t" + fields[1] + "\n")
# Clean up the tagger output that was written for the CoNLL testa run.
postprocess(tagged_files="stanford_trained_with_conll_tested_on_conlltesta")
#postprocess("stanford_trained_with_conll_tested_on_germeval")
#postprocess("stanford_trained_with_conll_tested_on_europarl")
#
#postprocess("stanford_trained_with_germeval_tested_on_conlltesta")
#postprocess("stanford_trained_with_germeval_tested_on_germeval")
#postprocess("stanford_trained_with_germeval_tested_on_europarl")