forked from miotto/treetagger-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
treetagger.py
143 lines (115 loc) · 4.97 KB
/
treetagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the TreeTagger POS-tagger
#
# Copyright (C) Mirko Otto
# Author: Mirko Otto <dropsy@gmail.com>
"""
A Python module for interfacing with the Treetagger by Helmut Schmid.
"""
import os
from subprocess import Popen, PIPE
from nltk.internals import find_binary, find_file
from nltk.tag.api import TaggerI
from sys import platform as _platform
_treetagger_url = 'http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/'
_treetagger_languages = ['bulgarian', 'dutch', 'english', 'estonian', 'finnish', 'french', 'galician', 'german', 'italian', 'polish', 'russian', 'slovak', 'slovak2', 'spanish']
class TreeTagger(TaggerI):
    r"""
    A class for pos tagging with TreeTagger. Text is exchanged with
    TreeTagger as UTF-8. The input is:

    - a language for which a TreeTagger launcher script is installed
    - (optionally) the path to the TreeTagger binary

    This class communicates with the TreeTagger binary via pipes.

    Example:

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTagger
        >>> tt = TreeTagger(language='english')
        >>> tt.tag('What is the airspeed of an unladen swallow ?')
        [['What', 'WP', 'What'],
         ['is', 'VBZ', 'be'],
         ['the', 'DT', 'the'],
         ['airspeed', 'NN', 'airspeed'],
         ['of', 'IN', 'of'],
         ['an', 'DT', 'an'],
         ['unladen', 'JJ', '<unknown>'],
         ['swallow', 'NN', 'swallow'],
         ['?', 'SENT', '?']]

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTagger
        >>> tt = TreeTagger(language='german')
        >>> tt.tag('Das Haus hat einen großen hübschen Garten.')
        [['Das', 'ART', 'die'],
         ['Haus', 'NN', 'Haus'],
         ['hat', 'VAFIN', 'haben'],
         ['einen', 'ART', 'eine'],
         ['großen', 'ADJA', 'groß'],
         ['hübschen', 'ADJA', 'hübsch'],
         ['Garten', 'NN', 'Garten'],
         ['.', '$.', '.']]
    """

    def __init__(self, path_to_home=None, language='german',
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param path_to_home: Explicit location of the TreeTagger launcher,
            forwarded to NLTK's ``find_binary``. When omitted, the
            ``TREETAGGER`` / ``TREETAGGER_HOME`` environment variables and
            a list of common install directories are searched.
        :param language: One of ``_treetagger_languages``. Default
            language is german.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Optional value passed to the launcher
            after the ``-a`` option on every ``tag()`` call.
        :raises LookupError: If ``language`` is not in the language list.

        If the launcher cannot be located, a message is printed and the
        instance is still created; ``tag()`` then raises ``LookupError``.
        """
        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')

        self._abbr_list = abbreviation_list

        # The launcher script is named differently on Windows.
        if _platform == "win32":
            treetagger_bin_name = 'tag-' + language
        else:
            treetagger_bin_name = 'tree-tagger-' + language

        treetagger_paths = [
            os.path.expanduser(path)
            for path in (
                '.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                '/Applications/bin', '~/bin', '~/Applications/bin',
                '~/work/tmp/treetagger/cmd', '~/tree-tagger/cmd',
            )
        ]

        try:
            self._treetagger_bin = find_binary(
                treetagger_bin_name, path_to_home,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            # Construction stays best-effort, but record the failure so
            # tag() can report it instead of hitting a missing attribute.
            self._treetagger_bin = None
            print('NLTK was unable to find the TreeTagger bin!')

    def tag(self, sentences):
        """Tag a text with TreeTagger.

        :param sentences: Either a single string, sent to the tagger
            as-is, or an iterable of string tokens, sent one per line.
            The tokens should not contain any newline characters.
        :return: A list with one entry per output line of the tagger,
            each entry being that line split on tab characters (for
            example ``['is', 'VBZ', 'be']``). Empty tagger output gives
            an empty list.
        :raises LookupError: If the TreeTagger launcher was not found
            when the instance was created.
        :raises OSError: If the TreeTagger process exits with a non-zero
            return code.
        """
        if self._treetagger_bin is None:
            raise LookupError('NLTK was unable to find the TreeTagger bin!')

        # Any non-string iterable of tokens is sent one token per line.
        if isinstance(sentences, str):
            text = sentences
        else:
            text = '\n'.join(sentences)

        command = [self._treetagger_bin]
        if self._abbr_list is not None:
            command += ['-a', self._abbr_list]

        # Run the tagger and get the output
        process = Popen(command, shell=False,
                        stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate(text.encode('utf-8'))

        # Check the return code.
        if process.returncode != 0:
            print(stderr.decode('utf-8', errors='replace'))
            raise OSError('TreeTagger command failed!')

        # The pipe is read as raw bytes, so CRLF line endings (expected
        # from the Windows launcher) would otherwise leave a stray '\r'
        # on the last column of every row.
        output = stdout.decode('utf-8').replace('\r\n', '\n').strip()
        if not output:
            return []
        return [line.split('\t') for line in output.split('\n')]
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module,
    # comparing expected output with whitespace differences ignored.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)