Beispiel #1
0
    def test_compounds(self):
        """Hyphenated compounds (C-17, DE-LIO) must survive as single tokens."""
        tokens = es.tokenize(decode("Desde allí, consigna el documento, los aviones C-17 podrían alcanzar casi todo el continente sudamericano sin necesidad de reabastecimiento de combustible."), as_unicode=True)
        # assertIn replaces the long-deprecated failUnless alias and gives a
        # useful failure message (shows the token list on mismatch).
        self.assertIn(u'C-17', tokens)
        self.assertIn(decode('allí'), tokens)

        tokens = es.tokenize(decode("La empresa dueña del proyecto es DE-LIO Company"), as_unicode=True)
        self.assertIn(u'DE-LIO', tokens)
Beispiel #2
0
    def test_numerics(self):
        """Numbers, percentages and money amounts tokenize as single units."""
        tokens = es.tokenize(decode("En total 2.113 civiles perdieron la vida en 2008."), as_unicode=True)
        # assertIn replaces the long-deprecated failUnless alias.
        self.assertIn(u'2.113', tokens)
        self.assertIn(u'2008', tokens)

        tokens = es.tokenize(decode(r"registró un aumento del 24% en 2009"), as_unicode=True)
        self.assertIn(u'24%', tokens)

        tokens = es.tokenize(decode("Es vergonzoso que se haya gastado US$38.000 millones en armas"), as_unicode=True)
        self.assertIn(u'US$38.000', tokens)
Beispiel #3
0
 def test_simple(self):
     """A plain interrogative sentence tokenizes into the expected sequence."""
     expected = [
         u'\xbf', u'A', u'qu\xe9',
         u'le', u'temen', u'los', u'afganos', u'?']
     observed = es.tokenize(decode("¿A qué le temen los afganos?"), as_unicode=True)
     self.assertEqual(observed, expected)
Beispiel #4
0
 def test_compounds_regression(self):
     """Regression check: the sample sentence must yield exactly four tokens."""
     sentence = decode("3M y McDonald´s.")
     token_count = len(es.tokenize(sentence, as_unicode=True))
     self.assertEqual(token_count, 4)
Beispiel #5
0
 def test_punctuation(self):
     """An ellipsis ("...") must come out as one punctuation token."""
     tokens = es.tokenize(decode("Hay juegos bélicos en los que se ve gente mutilada, disparos, choques con carro, es una violencia fuerte..."), as_unicode=True)
     # assertIn replaces the long-deprecated failUnless alias.
     self.assertIn(u'...', tokens)
Beispiel #6
0
 def test_abbreviations(self):
     """Dotted abbreviations such as EE.UU. must remain a single token."""
     tokens = es.tokenize(decode("¿deben Colombia y EE.UU. explicar los detalles del acuerdo?"), as_unicode=True)
     # assertIn replaces the long-deprecated failUnless alias.
     self.assertIn(u'EE.UU.', tokens)
Beispiel #7
0
    def test_urls(self):
        """Domains and full URLs must each tokenize as a single token."""
        tokens = es.tokenize(decode("que se desarrolló \"debido a la saturación de Twitter.com\""), as_unicode=True)
        # assertIn replaces the long-deprecated failUnless alias.
        self.assertIn(u'Twitter.com', tokens)

        tokens = es.tokenize(decode("http://www.bbc.co.uk/mundo/lg/internacional/2009/08/090723_vida_afganistan_jp.shtml"), as_unicode=True)
        self.assertEqual(len(tokens), 1)
Beispiel #8
0
    punctuation.PUNCTUATION_PATTERN
), re.U | re.X | re.I)

# Token pattern compiled WITHOUT the abbreviation alternative: it matches
# URLs/tags, numerics (money, percentages), hyphenated words with 's/´s
# possessives, newlines and punctuation.  The %s slots are filled below;
# %% survives the string formatting as a literal '%' in the pattern.
# NOTE(review): the first in-pattern comment still mentions "Abbreviations"
# -- it appears copied from TOKEN_RE (not fully visible here); confirm and
# fix that regex-internal comment separately.
TOKEN_RE_NO_ABBR = re.compile(r"""
%s|%s|                                   # Abbreviations, URLs, and tags
((US)?\$|\#)?\d+([\.\,\-]\d+)?(%%|\b)|   # Numerals, money and percentages
\w+(-\w+)*(\'s|\xb4s)?|                  # Words (possibly hyphenated)
\n|\r\n|%s                               # Newlines and punctuation
""" % (
    URL_PATTERN,
    TAG_PATTERN,
    punctuation.PUNCTUATION_PATTERN
), re.U | re.X | re.I)

# Spanish opens questions and exclamations with inverted marks, so the
# base (open, close) punctuation pairs are extended with (¿ -> ?) and
# (¡ -> !).
_SPANISH_PAIRS = (
    (decode('¿'), u'?'),
    (decode('¡'), u'!'),
)
STACK_PUNCTUATION = punctuation.STACK_PUNCTUATION + _SPANISH_PAIRS


class SpanishPunctuationStack(punctuation.PunctuationStack):
    """Punctuation stack preloaded with the Spanish pairs (¿ ?, ¡ !).

    :param punctuation: (open, close) pairs to track; defaults to the
        module-level STACK_PUNCTUATION.
    """

    def __init__(self, punctuation=STACK_PUNCTUATION):
        # Bug fix: forward the caller-supplied pairs.  The original passed
        # the module-level STACK_PUNCTUATION unconditionally, silently
        # ignoring the `punctuation` argument.
        super(SpanishPunctuationStack, self).__init__(
            punctuation=punctuation)


def tokenize(text, ignore_abbreviations=False, as_unicode=False):
    regex = TOKEN_RE_NO_ABBR if ignore_abbreviations else TOKEN_RE
    tokens, last = [], EMPTY_TOKEN
    for m in regex.finditer(text):
        t = Token(m.group(0), m.start(0), m.end(0))
Beispiel #9
0
# HTML line-break tag, used alongside literal newlines (see NEWLINE_PATTERN).
BR = u'<br>'

# Two or more consecutive dots ("..", "...", ...) count as one ellipsis.
ELLIPSES_PATTERN = r'\.\.+'
ELLIPSES_RE = re.compile(ELLIPSES_PATTERN)

# Characters treated as punctuation here even though their Unicode
# category is not P* (e.g. \xb4 acute accent, currency/math signs).
NON_UNICODE_PUNCTUATION = [u'`', u'\xb4', u'\xa9', u'\xa3', u'$', u'=', u'+']
# Every BMP code point (range(65536)) whose Unicode category starts with
# 'P' (punctuation), plus the extras above.  Python 2: unicode/unichr.
PUNCTUATION = [
    unicode(unichr(x)) for x in range(65536)
    if unicodedata.category(unichr(x)).startswith('P')
] + NON_UNICODE_PUNCTUATION
# Ellipsis alternative comes first so "..." matches as a unit before the
# single-character '.' alternative can split it.
PUNCTUATION_PATTERN = r'%s|%s' % (
    ELLIPSES_PATTERN,
    u'|'.join([re.escape(p) for p in PUNCTUATION])
)
PUNCTUATION_RE = re.compile(PUNCTUATION_PATTERN, re.U)
# Punctuation characters seen in Spanish-language text.  NOTE(review):
# decode() is presumably the project helper turning this UTF-8 source
# literal into unicode -- confirm against its definition.
PUNCTUATION_ES = decode("—»«¿¡•°^><|£~§„–™“”©■€®№±―…・¥♦,□►。·´▼▲")

# Line breaks: CRLF, LF, or a literal <br> tag.
NEWLINE_PATTERN = r'\r\n|\n|\<br\>'
NEWLINE_RE = re.compile(NEWLINE_PATTERN)

# Segment (sentence) boundaries: ellipsis, '.', '?', '!', or a line break.
SEGMENT_PATTERN = r'%s|\.|\?|\!|%s' % (ELLIPSES_PATTERN, NEWLINE_PATTERN)
SEGMENT_RE = re.compile(SEGMENT_PATTERN)

# Matches a word character at the start of the string that is not a digit
# (the lookbehind excludes [\d\-]; '-' is redundant for \w but harmless).
ALPHA_START_PATTERN = r'^\w(?<=[^\d\-])'
ALPHA_START_RE = re.compile(ALPHA_START_PATTERN, re.U)

# A newline (optionally preceded by one space) NOT following sentence-final
# punctuation, i.e. a line break in the middle of a sentence.
CONTINUATION_PATTERN = r'(?<!\.|\?|\!)\ ?\n'
CONTINUATION_RE = re.compile(CONTINUATION_PATTERN)

STACK_PUNCTUATION = (
    (u'(', u')'),