def test_compounds(self):
    tokens = es.tokenize(decode(
        "Desde allí, consigna el documento, los aviones C-17 podrían "
        "alcanzar casi todo el continente sudamericano sin necesidad de "
        "reabastecimiento de combustible."), as_unicode=True)
    self.failUnless(u'C-17' in tokens)
    self.failUnless(decode('allí') in tokens)
    tokens = es.tokenize(decode(
        "La empresa dueña del proyecto es DE-LIO Company"), as_unicode=True)
    self.failUnless(u'DE-LIO' in tokens)

def test_numerics(self):
    tokens = es.tokenize(decode(
        "En total 2.113 civiles perdieron la vida en 2008."), as_unicode=True)
    self.failUnless(u'2.113' in tokens)
    self.failUnless(u'2008' in tokens)
    tokens = es.tokenize(decode(
        r"registró un aumento del 24% en 2009"), as_unicode=True)
    self.failUnless(u'24%' in tokens)
    tokens = es.tokenize(decode(
        "Es vergonzoso que se haya gastado US$38.000 millones en armas"),
        as_unicode=True)
    self.failUnless(u'US$38.000' in tokens)

def test_simple(self):
    tokens = es.tokenize(decode("¿A qué le temen los afganos?"),
                         as_unicode=True)
    self.assertEqual(tokens, [
        u'\xbf', u'A', u'qu\xe9', u'le', u'temen', u'los', u'afganos', u'?'])

def test_compounds_regression(self):
    tokens = es.tokenize(decode("3M y McDonald´s."), as_unicode=True)
    self.assertEqual(len(tokens), 4)

def test_punctuation(self):
    tokens = es.tokenize(decode(
        "Hay juegos bélicos en los que se ve gente mutilada, disparos, "
        "choques con carro, es una violencia fuerte..."), as_unicode=True)
    self.failUnless(u'...' in tokens)

def test_abbreviations(self):
    tokens = es.tokenize(decode(
        "¿deben Colombia y EE.UU. explicar los detalles del acuerdo?"),
        as_unicode=True)
    self.failUnless(u'EE.UU.' in tokens)

def test_urls(self):
    tokens = es.tokenize(decode(
        "que se desarrolló \"debido a la saturación de Twitter.com\""),
        as_unicode=True)
    self.failUnless(u'Twitter.com' in tokens)
    tokens = es.tokenize(decode(
        "http://www.bbc.co.uk/mundo/lg/internacional/2009/08/090723_vida_afganistan_jp.shtml"),
        as_unicode=True)
    self.assertEqual(len(tokens), 1)
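
# A minimal usage sketch, not part of the original test suite; the function
# name demo_ignore_abbreviations is hypothetical, and it reuses the `es`
# module and `decode` helper from the tests above. It illustrates the
# ignore_abbreviations flag of tokenize(), which swaps in TOKEN_RE_NO_ABBR
# (defined below), a pattern with no abbreviation alternative.
def demo_ignore_abbreviations():
    text = decode("Colombia y EE.UU. firmaron el acuerdo.")
    tokens = es.tokenize(text, as_unicode=True)
    assert u'EE.UU.' in tokens  # matched by the abbreviation rule
    # With the flag set, the abbreviation rule is skipped entirely, so
    # 'EE.UU.' has to be matched by the remaining alternatives instead:
    tokens = es.tokenize(text, ignore_abbreviations=True, as_unicode=True)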
    punctuation.PUNCTUATION_PATTERN
), re.U | re.X | re.I)

TOKEN_RE_NO_ABBR = re.compile(r"""
    %s|%s|                                  # URLs and tags
    ((US)?\$|\#)?\d+([\.\,\-]\d+)?(%%|\b)|  # Numerals, money and percentages
    \w+(-\w+)*(\'s|\xb4s)?|                 # Words (possibly hyphenated)
    \n|\r\n|%s                              # Newlines and punctuation
""" % (
    URL_PATTERN, TAG_PATTERN, punctuation.PUNCTUATION_PATTERN
), re.U | re.X | re.I)

# Spanish adds the inverted question and exclamation marks to the base set
# of paired punctuation.
STACK_PUNCTUATION = punctuation.STACK_PUNCTUATION + (
    (decode('¿'), u'?'),
    (decode('¡'), u'!'),
)


class SpanishPunctuationStack(punctuation.PunctuationStack):

    def __init__(self, punctuation=STACK_PUNCTUATION):
        super(SpanishPunctuationStack, self).__init__(punctuation=punctuation)


def tokenize(text, ignore_abbreviations=False, as_unicode=False):
    regex = TOKEN_RE_NO_ABBR if ignore_abbreviations else TOKEN_RE
    tokens, last = [], EMPTY_TOKEN
    for m in regex.finditer(text):
        t = Token(m.group(0), m.start(0), m.end(0))
import re
import unicodedata

BR = u'<br>'

ELLIPSES_PATTERN = r'\.\.+'
ELLIPSES_RE = re.compile(ELLIPSES_PATTERN)

NON_UNICODE_PUNCTUATION = [u'`', u'\xb4', u'\xa9', u'\xa3', u'$', u'=', u'+']
# Every BMP codepoint in a Unicode punctuation category (P*), plus the extra
# symbols above.
PUNCTUATION = [
    unicode(unichr(x)) for x in range(65536)
    if unicodedata.category(unichr(x)).startswith('P')
] + NON_UNICODE_PUNCTUATION
PUNCTUATION_PATTERN = r'%s|%s' % (
    ELLIPSES_PATTERN, u'|'.join([re.escape(p) for p in PUNCTUATION])
)
PUNCTUATION_RE = re.compile(PUNCTUATION_PATTERN, re.U)
PUNCTUATION_ES = decode("—»«¿¡•°^><|£~§„–™“”©■€®№±―…・¥♦,□►。·´▼▲")

NEWLINE_PATTERN = r'\r\n|\n|\<br\>'
NEWLINE_RE = re.compile(NEWLINE_PATTERN)

# Segment boundaries: ellipses, sentence-final punctuation, or newlines.
SEGMENT_PATTERN = r'%s|\.|\?|\!|%s' % (ELLIPSES_PATTERN, NEWLINE_PATTERN)
SEGMENT_RE = re.compile(SEGMENT_PATTERN)

# A leading word character that is not a digit (or hyphen).
ALPHA_START_PATTERN = r'^\w(?<=[^\d\-])'
ALPHA_START_RE = re.compile(ALPHA_START_PATTERN, re.U)

# Intended to match soft line breaks: newlines that do not follow
# sentence-final punctuation.
CONTINUATION_PATTERN = r'(?<!\.|\?|\!)\ ?\n'
CONTINUATION_RE = re.compile(CONTINUATION_PATTERN)

STACK_PUNCTUATION = (
    (u'(', u')'),