Example no. 1
import os
import pickle
import re
from os.path import join

from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer


def setup_tokenizers(terminal_punctuation):
	PunktLanguageVars.sent_end_chars = terminal_punctuation
	PunktLanguageVars.re_boundary_realignment = re.compile(r'[›»》’”\'\")\)\]\}\>]+?(?:\s+|(?=--)|$)', re.MULTILINE)
	global word_tokenizer
	global sentence_tokenizers

	#Accessing private variables of PunktLanguageVars because nltk has a faulty design pattern that necessitates it.
	#Issue reported here: https://github.com/nltk/nltk/issues/2068
	word_tokenizer = PunktLanguageVars()
	word_tokenizer._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
	    'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
	    'MultiChar': PunktLanguageVars._re_multi_char_punct,
	    'WordStart': r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
	}, re.UNICODE | re.VERBOSE)
	word_tokenizer._re_period_context = re.compile(PunktLanguageVars._period_context_fmt % {
		'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
		'SentEndChars': word_tokenizer._re_sent_end_chars, 
	}, re.UNICODE | re.VERBOSE)

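	# This second PunktLanguageVars variant omits digits and the period from the non-word
	# set, so the sentence tokenizers below keep those characters inside word tokens.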
	x = PunktLanguageVars()
	x._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
	    'NonWord': r"(?:[\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
	    'MultiChar': PunktLanguageVars._re_multi_char_punct,
	    'WordStart': r"[^\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
	}, re.UNICODE | re.VERBOSE)
	x._re_period_context = re.compile(PunktLanguageVars._period_context_fmt % {
		'NonWord': r"(?:[\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
		'SentEndChars': x._re_sent_end_chars, 
	}, re.UNICODE | re.VERBOSE)

	# Read trained sentence tokenizers from pickle files and include an untrained fallback; maps language name to tokenizer.
	sentence_tokenizers = dict({None: PunktSentenceTokenizer(lang_vars=PunktLanguageVars())}, **{
		current_file_name[:current_file_name.index('.')]: pickle.load(open(join(current_path, current_file_name), mode='rb'))
		for current_path, current_dir_names, current_file_names in os.walk(sentence_tokenizer_dir) 
		for current_file_name in current_file_names if current_file_name.endswith('.pickle')
	})
	for s in sentence_tokenizers.values():
		s._lang_vars._re_period_context = x._re_period_context
		s._lang_vars._re_word_tokenizer = x._re_word_tokenizer
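
# A minimal usage sketch of the globals this function populates, assuming the module's
# sentence_tokenizer_dir is configured and the pickled tokenizers exist; the arguments
# mirror Example no. 2 and the sample strings below are made up.
setup_tokenizers(terminal_punctuation=('.', ';', ';'))

# The None key holds the untrained PunktSentenceTokenizer fallback.
print(sentence_tokenizers[None].tokenize('test test. test test; test test.'))

# The rebuilt word-token regex splits digits and the listed punctuation into separate tokens.
print(word_tokenizer.word_tokenize('test, test. test'))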
Example no. 2
import re
import unittest

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars

import context  #pylint: disable=unused-import
from qcrit import textual_feature

#[^\s\d’”\'\")\)\]\}\.,:;]
#[“‘—\-†&vâ\*\^(α-ωΑ-Ὠ`̔]
#΄´´``′″‴
textual_feature.setup_tokenizers(terminal_punctuation=('.', ';', ';'))
p = PunktLanguageVars()
#TODO don't mess with the PunktLanguageVars instance variables, mess with the class variables
p._re_word_tokenizer = re.compile(
    PunktLanguageVars._word_tokenize_fmt % {
        'NonWord':
        r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'MultiChar':
        PunktLanguageVars._re_multi_char_punct,
        'WordStart':
        r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]",
    }, re.UNICODE | re.VERBOSE)
p._re_period_context = re.compile(
    PunktLanguageVars._period_context_fmt % {
        'NonWord':
        r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])",
        'SentEndChars': p._re_sent_end_chars,
    }, re.UNICODE | re.VERBOSE)
test_sentence_tokenizer = PunktSentenceTokenizer(lang_vars=p)


class TestParsers(unittest.TestCase):
    def setUp(self):
Example no. 3
from io import StringIO

from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.word import WordTokenizer
import re

p = PunktLanguageVars()
w = WordTokenizer('greek')

p._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
    'NonWord': r"(?:[?!.)\";;}\]\*:@\'\({\[])", #Adds the period and the Greek question mark so they are excluded from word tokens (the default PunktLanguageVars._re_non_word_chars keeps them inside word tokens)
    'MultiChar': PunktLanguageVars._re_multi_char_punct,
    'WordStart': PunktLanguageVars._re_word_start,
}, re.UNICODE | re.VERBOSE)


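# The customized Punkt tokenizer should now yield the same tokens as CLTK's Greek WordTokenizer.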
s = 'test test test. test test; test test? test test test test test. test. test test. test? test test.'
assert p.word_tokenize(s) == w.tokenize(s)

d = [('tesserae/texts/grc/achilles_tatius.leucippe_et_clitophon.tess', 'feature_data/a.txt', 'feature_data/b.txt'), 
	('tesserae/texts/grc/bacchylides.epinicians.tess', 'feature_data/m.txt', 'feature_data/n.txt'), 
	('tesserae/texts/grc/polybius.histories.tess', 'feature_data/x.txt', 'feature_data/y.txt')]

for t in d:
	with open(t[0], mode='r', encoding='utf-8') as f:
		file_text = StringIO()
		for line in f:
			#Skip lines that lack a tess tag; otherwise remove the tag and strip surrounding whitespace
			if not line.startswith('<'):
				continue
			assert '>' in line
			file_text.write(line[line.index('>') + 1:].strip())
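
# For reference, a sketch of what the tag handling above does to one line; the
# angle-bracket tag is hypothetical and the quotation is taken from Example no. 4.
line = '<ach. tat. 1.1.1>\tΣιδὼν ἐπὶ θαλάττῃ πόλις, ̓Ασσυρίων ἡ θάλασσα.'
if line.startswith('<'):
	print(line[line.index('>') + 1:].strip())  # tag and surrounding whitespace removed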
Example no. 4
from textual_feature import sentence_tokenizers
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer
import re

p = PunktLanguageVars()

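# Rebuild Punkt's private word-token regex so digits, quotes, and the other listed
# punctuation marks are split off from word tokens rather than kept attached.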
p._re_word_tokenizer = re.compile(
    PunktLanguageVars._word_tokenize_fmt % {
        'NonWord': r"(?:[0-9\.?!\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[])",
        'MultiChar': PunktLanguageVars._re_multi_char_punct,
        'WordStart': r"[^0-9\.?!\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[]",
    }, re.UNICODE | re.VERBOSE)

s = 'Σιδὼν ἐπὶ θαλάττῃ πόλις, ̓Ασσυρίων ἡ θάλασσα. test ""test test? test test. test test! test test. test. test test; test: test; test; test23 test45 89test. test test” test test “test test“. "test test". test test... test‘ test’. 00000test. test.test. .....test. test-test test- test. test) test test( test test. «test test» test.'
print(p.word_tokenize(s))
print()

s = '"ss ss". "ss ss." «s s. abc 123409 abc. 5ff5g s. ~ab cd~ ab~cd s.'
print(p.word_tokenize(s))
print()

# From polybius.histories.tess
s = '1συμμάχοις. ἀποδοῦναι Καρχηδονίους ̔Ρωμαίοις "1χωρὶς λύτρων ἅπαντας τοὺς αἰχμαλώτους. ἀργυ"1ρίου κατενεγκεῖν Καρχηδονίους ̔Ρωμαίοις ἐν ἔτεσιν "1εἴκοσι δισχίλια καὶ διακόσια τάλαντα Εὐβοϊκά."2'
print(p.word_tokenize(s))
print()

# From polybius.histories.tess
s = "διόπερ οὐχ ὁρῶν ποίαν ἄν τις ὀξυτέραν ἢ μείζονα λάβοι μεταβολὴν τῶν καθ' ἡμᾶς τῆς γε ̔Ρωμαίοις συμβάσης, εἰς τοῦτον ἀπεθέμην τὸν καιρὸν τὸν ὑπὲρ τῶν προειρημένων ἀπολογισμόν: γνοίη δ' ἄν τις τὸ μέγεθος τῆς μεταβολῆς ἐκ τούτων. ζήτει ἐν τῷ περὶ στρατηγίας. [εχξ. Vατ. π. 369 μαι. 24, 4 ηεψς.]"
print(p.word_tokenize(s))
print()