forked from proycon/valkuil
/
opentaalerrorharvest2folia.py
executable file
·119 lines (100 loc) · 5.35 KB
/
opentaalerrorharvest2folia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python
import sys
import codecs
from pynlpl.formats import folia
from pynlpl.clients.frogclient import FrogClient
# State of the output document currently being filled; the main loop assigns
# these when it starts a new document.
doc = None
outputfile = ''
docnum = 0

# Positional command-line arguments:
#   1. inputfile         path of the file to convert
#   2. frogport          port of the Frog server on localhost (integer)
#   3. outputdir         directory in which the output XML files are written
#   4. stripcorrections  optional 0/1 flag, defaults to off
try:
    inputfile = sys.argv[1]
    frogport = int(sys.argv[2])
    outputdir = sys.argv[3]
    if len(sys.argv) >= 5:
        stripcorrections = bool(int(sys.argv[4]))
    else:
        stripcorrections = False
except (IndexError, ValueError):
    # IndexError: a required argument is missing; ValueError: the port or the
    # flag is not an integer. Anything else (e.g. KeyboardInterrupt) is left
    # to propagate instead of being reported as a usage error.
    print >>sys.stderr ,"Usage: opentaalerrorharvest2folia.py inputfile frogport outputdir [stripcorrections=0/1]\nStart a frog server with: $ frog --skip=mp -S portnum"
    sys.exit(2)
def _parse_sample(line):
    """Extract the sample id, plain text and corrections from one input line.

    Everything before the first '|' becomes the sample id (prefixed with
    'OPENTAAL-s'). After that, a correction is written as
    ``~original|replacement~``: the original form is kept in the returned
    text, and the pair is recorded unless either side contains a space.
    Characters seen before the sample id is known, or inside an open
    correction, are not copied to the text.

    Returns a tuple ``(sample_id, text, corrections, skipsample)``:
    ``sample_id`` is None when the line contains no '|', ``corrections`` maps
    each original form to its replacement, and ``skipsample`` is True when the
    same original form occurred twice (parsing stops at that point).
    """
    sample_id = None
    corrections = {}  # original -> new
    # Both markers are indices into ``line``; 0 doubles as "not set", which is
    # safe because neither can be 0 once the sample id has been seen.
    in_correction = 0   # index of the first character of the open correction
    correction_sep = 0  # index of the '|' inside the open correction
    skipsample = False
    pieces = []  # fragments of the natural text, joined at the end

    for j, c in enumerate(line):
        if c == '|':
            if not sample_id:
                sample_id = 'OPENTAAL-s' + line[:j]
                print >>sys.stderr, sample_id
            if in_correction:
                correction_sep = j
        elif c == '~' and sample_id:
            if in_correction and correction_sep:
                # Closing '~': the correction is complete.
                original = line[in_correction:correction_sep]
                replacement = line[correction_sep+1:j]
                if original in corrections:
                    print >>sys.stderr,"WARNING: Can not deal with two similar corrections ("+original+") in one sample. Skipping sample " + sample_id
                    skipsample = True
                    break
                elif ' ' in original or ' ' in replacement:
                    print >>sys.stderr,"WARNING: Can not deal splits and merges (\"" + original + "\" -> \"" + replacement + "\" ) . This correction will be omitted"
                else:
                    print >>sys.stderr,"Found correction (\"" + original + "\" -> \"" + replacement + "\" )"
                    corrections[original] = replacement
                # The uncorrected form always stays in the text, even when
                # the correction itself was omitted above.
                pieces.append(original)
                in_correction = 0
                correction_sep = 0
            else:
                # Opening '~' (or a '~' before any separator was seen):
                # the correction starts at the next character.
                in_correction = j+1
        elif not in_correction and sample_id:
            pieces.append(c)

    return sample_id, "".join(pieces), corrections, skipsample


frogclient = FrogClient('localhost', frogport)
correctioncount = 0

with codecs.open(inputfile,'r','utf-8','ignore') as f:
    for i, line in enumerate(f):
        print >>sys.stderr,"@" + str(i),

        # Start a new output document every 1000 input lines, saving the
        # previous one first.
        if i % 1000 == 0:
            if doc is not None:
                doc.save(outputfile)
                print >>sys.stderr,"Saved " + outputfile
            docnum += 1
            outputfile = outputdir + '/opentaalerrorharvest' + str(docnum) + '.xml'
            doc = folia.Document(id='opentaalerrorharvest' + str(docnum))
            doc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            doc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            doc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog',annotatortype=folia.AnnotatorType.AUTO)
            if not stripcorrections:
                doc.declare(folia.AnnotationType.CORRECTION, set='opentaal', annotator='unknown',annotatortype= folia.AnnotatorType.MANUAL)
            textbody = doc.append(folia.Text)

        line = line.strip()
        if not line:
            continue

        # Get the natural text string and extract the corrections.
        sample_id, text, corrections, skipsample = _parse_sample(line)

        if "\\" in text:
            print >>sys.stderr,"WARNING: backslash in text, skipping sample to prevent Frog bug."
            continue
        if skipsample:
            continue
        if not (text and corrections and sample_id):
            # Only samples that carry at least one usable correction are kept.
            continue

        print >>sys.stderr,"Invoking Frog and processing text: " + text
        paragraph = folia.Paragraph(doc, id=sample_id)
        sentence = paragraph.append(folia.Sentence)
        for j, (wordtext, lemma, morph, pos) in enumerate(frogclient.process(text)):
            if not wordtext or not wordtext.strip():
                # An empty token in the Frog output is treated as a sentence
                # boundary.
                print >>sys.stderr,"Empty word, moving to next sentence"
                sentence = paragraph.append(folia.Sentence)
            else:
                word = sentence.append(folia.Word, text=wordtext)
                if lemma:
                    word.append(folia.LemmaAnnotation, cls=lemma)
                if pos:
                    word.append(folia.PosAnnotation, cls=pos)
                if wordtext in corrections and not stripcorrections:
                    try:
                        word.correct(new=corrections[wordtext])
                    except ValueError as e:
                        print >>sys.stderr, "Error correcting, ignoring:", e
                    else:
                        # Count and report only corrections that were
                        # actually applied.
                        correctioncount += 1
                        print >>sys.stderr, "Succesfully added a correction (" + str(correctioncount) + ")"
        textbody.append(paragraph)

# Save the last, possibly partially filled, document.
if doc is not None:
    doc.save(outputfile)
    print >>sys.stderr,"Saved " + outputfile