def _split_contractions(tokens):
    """
    Split apostrophe contractions at the end of alphanumeric (and hyphenated) tokens.

    Takes the output of any of the tokenizer functions and produces an updated
    list. The list is modified in place and the same list object is returned.

    Every token longer than one character that segtok's ``IS_CONTRACTION``
    matches is cut in front of its last apostrophe (``"I've"`` becomes ``"I"``,
    ``"'ve"``). Tokens ending in ``n`` + apostrophe + ``t`` are cut one
    character earlier so the ``n`` stays with the suffix (``"don't"`` becomes
    ``"do"``, ``"n't"``). A token with nothing in front of the cut point (for
    example a bare ``"n't"``) is left as it is, so an empty-string token is
    never inserted.

    :param tokens: a list of tokens
    :returns: an updated list if a split was made or the original list otherwise

    Credit: (adapted from 'segtok/tokenizer.py')
    """
    from segtok.tokenizer import IS_CONTRACTION, APOSTROPHES

    repeat = True
    while repeat:
        repeat = False
        idx = -1
        # Iterate over a snapshot: `tokens` itself grows while splitting, and
        # `idx` tracks the position of the current token in the live list.
        for token in list(tokens):
            idx += 1
            if IS_CONTRACTION.match(token) is None:
                continue
            length = len(token)
            if length <= 1:
                continue
            # Scan backwards for the last apostrophe-like character.
            for pos in range(length - 1, -1, -1):
                if token[pos] not in APOSTROPHES:
                    continue
                is_negation = (
                    2 < length
                    and pos + 2 == length
                    and token[-1] == 't'
                    and token[pos - 1] == 'n'
                )
                # For "...n't" the cut goes in front of the "n".
                split_at = pos - 1 if is_negation else pos
                # Guard: with split_at == 0 there is no prefix to split off.
                # Splitting anyway would insert "" into the list (e.g. for a
                # bare "n't" met on a rescan, if IS_CONTRACTION matches it)
                # and, in the non-negation case, request rescans forever.
                if split_at > 0:
                    tokens.insert(idx, token[:split_at])
                    idx += 1
                    tokens[idx] = token[split_at:]
                    # As in the adapted original, only a non-negation split
                    # schedules another pass over the list.
                    if not is_negation:
                        repeat = True
                # Only the last apostrophe of a token is considered.
                break
    return tokens
def test_unicode(self):
    """Check that the patterns accept non-ASCII apostrophe-like marks.

    One contraction sample is matched against IS_CONTRACTION and two
    possessive samples against IS_POSSESSIVE; every match must succeed.
    """
    contraction_hit = IS_CONTRACTION.match("Frank\u02BCs")
    self.assertIsNotNone(contraction_hit)

    for possessive in ("Charles\u2019", "home-less\u2032"):
        possessive_hit = IS_POSSESSIVE.match(possessive)
        self.assertIsNotNone(possessive_hit)
def test_matches(self):
    """Check that IS_CONTRACTION matches ordinary ASCII contractions."""
    for word in ("I've", "don't"):
        hit = IS_CONTRACTION.match(word)
        self.assertIsNotNone(hit)
def test_misses(self):
    """Check that IS_CONTRACTION rejects the two non-contraction samples.

    The samples are a word with the ending 'r after the apostrophe and a
    token that begins with the apostrophe itself.
    """
    for word in ("don'r", "'ve"):
        hit = IS_CONTRACTION.match(word)
        self.assertIsNone(hit)