-
Notifications
You must be signed in to change notification settings - Fork 0
/
hfst-tokenise.py
131 lines (122 loc) · 5.28 KB
/
hfst-tokenise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A command-line client for using a HFST automaton to tokenise running text
"""
import libhfst
from argparse import ArgumentParser, FileType
from sys import stderr, stdin, stdout, exit
def take_greedy_lrlm_tokens(paths):
    """Pick one tokenisation by greedy left-to-right longest match (LRLM).

    Each path's ``output`` attribute is a string in which token boundaries
    are marked with ``@TOKEN@``.  Candidates are compared token by token
    from the left; at every step only the paths whose current token ends
    furthest to the right survive.  A path without any further marker is
    treated as having its current token run to the end of its output.

    Args:
        paths: sequence of objects exposing a string ``output`` attribute.

    Returns:
        The list of token strings of the selected path, or ``None`` when
        ``paths`` is empty.  If several paths stay tied all the way to the
        end, the one that comes first in ``paths`` is returned.
    """
    marker = "@TOKEN@"
    candidates = list(paths)
    if not candidates:
        return None
    position = 0
    while len(candidates) > 1:
        # Where does the current token end in every surviving candidate?
        ends = []
        for path in candidates:
            boundary = path.output.find(marker, position)
            if boundary == -1:
                # No more markers: the token extends to the end of output.
                boundary = len(path.output)
            ends.append(boundary)
        longest = max(ends)
        candidates = [path for path, end in zip(candidates, ends)
                      if end == longest]
        # Every survivor has been read to its end: the tie cannot be broken
        # any further, so stop instead of looping forever.
        if all(longest >= len(path.output) for path in candidates):
            break
        # Continue after the marker just matched; searching from the marker
        # itself would find the same boundary again.
        position = longest + len(marker)
    return candidates[0].output.split(marker)
def main():
    """Command-line entry point: tokenise text with HFST automata.

    Parses the command line, loads the tokeniser automata given with
    ``--tokeniser`` (or ``tokeniser-unicode.openfst.hfst`` when none is
    given), and writes one token per line to the output stream for every
    line of every input.  Each input is preceded by a small comment/header
    preamble, and a literal ``\\n`` line is written after every input line.

    For each line the tokenisers are tried in order and the first one that
    yields tokens is used; if none does, the line is split on white space.

    Exits with status 2 if a loaded automaton is not of type
    ``TROPICAL_OPENFST_TYPE``, otherwise with status 0 when all inputs have
    been processed.
    """
    a = ArgumentParser(
        description="Tokeniser for plain text data using HFST automata. "
        "Takes a text stream input and outputs TSV token stream where "
        "one line is one token. Tokens should include white-space tokens, "
        "but this decision is solely up to output of used automata. "
        "Some automata may be able to parse non-plain marked up text.",
        epilog="If INFILE or OFILE is omitted, standard streams will be "
        "used.\n"
        "If DISAMB is omitted, greedy LRLM will be used.")
    a.add_argument('inputs', metavar='INFILE', type=open,
                   nargs='*', help="Files to process with corpus tool")
    a.add_argument('--output', '-o', metavar='OFILE',
                   type=FileType('w'), help="store result in OFILE")
    a.add_argument('--tokeniser', '-t', action='append', metavar='TFILE',
                   help="Pre-process input stream with automata from TFILE")
    a.add_argument('--disambiguation', '-d', metavar='DISAMB',
                   choices=['LRLM'], default='LRLM',
                   help="use DISAMB tactic to select from multiple paths")
    a.add_argument("--verbose", '-v', action='store_true',
                   help="print verbosely while processing")
    opts = a.parse_args()

    if not opts.output:
        if opts.verbose:
            # Verbose messages go to stdout, so they would be mixed into the
            # token stream; tell the user on stderr and switch them off.
            print("printing output to stdout, disabling verbose",
                  file=stderr)
            opts.verbose = False
        opts.output = stdout

    tokenisers = []
    if not opts.tokeniser:
        if opts.verbose:
            print("Using Unicode tokeniser with character classes")
        tokeniserstream = libhfst.HfstInputStream(
            "tokeniser-unicode.openfst.hfst")
        tokenisers.append(libhfst.HfstTransducer(tokeniserstream))
    else:
        for tokeniserfile in opts.tokeniser:
            if opts.verbose:
                print("Reading from", tokeniserfile)
            tokeniserstream = libhfst.HfstInputStream(tokeniserfile)
            t = libhfst.HfstTransducer(tokeniserstream)
            if opts.verbose:
                print("Read tokeniser", t.get_property('name'))
            tokenisers.append(t)

    if len(opts.inputs) < 1:
        if opts.verbose:
            print("Reading corpus data from <stdin>")
        opts.inputs = [stdin]
    if opts.verbose:
        print("Creating UTF-8 character tokeniser for HFST")
    hfst_tokeniser = libhfst.HfstTokenizer()

    for inputfile in opts.inputs:
        print("# hfst-tokenise.py TSV token stream 1", file=opts.output)
        print("# From input file", inputfile, file=opts.output)
        print("# Next line is a header line", file=opts.output)
        print("Token", file=opts.output)
        for line in inputfile:
            line = line.strip('\n')
            if not line:
                # Empty input line: emit only the line marker.
                print('\\n', file=opts.output)
                continue
            could_tokenise = False
            for tokeniser in tokenisers:
                if tokeniser.get_type() != libhfst.TROPICAL_OPENFST_TYPE:
                    print("Not impl !OFST", file=stderr)
                    exit(2)
                # Turn the line into an automaton, compose it with the
                # tokeniser and read the candidate tokenisations back.
                pathmaton = libhfst.HfstTransducer(
                    line, hfst_tokeniser, libhfst.TROPICAL_OPENFST_TYPE)
                tokenisation = libhfst.extract_paths_fd(
                    pathmaton.compose(tokeniser))
                paths = libhfst.detokenize_paths(tokenisation)
                tokens = None
                if opts.disambiguation == 'LRLM':
                    tokens = take_greedy_lrlm_tokens(paths)
                else:
                    print("What is this DISAMB?", opts.disambiguation,
                          file=stderr)
                if tokens:
                    for token in tokens:
                        # Tokens belong in the selected output stream, not
                        # unconditionally on stdout.
                        print(token.replace('@_EPSILON_SYMBOL_@', ''),
                              file=opts.output)
                    could_tokenise = True
                    break
                if opts.verbose:
                    print("Got no tokens with FOO using",
                          opts.disambiguation)
            if not could_tokenise:
                # Fallback when no automaton produced a tokenisation.
                for token in line.split():
                    print(token, file=opts.output)
            print("\\n", file=opts.output)
        if inputfile is not stdin:
            inputfile.close()
    exit(0)
# Run the command-line client only when executed as a script, not on import.
if __name__ == "__main__":
    main()