Exemple #1
0
def _split_contractions(tokens):
    """
    Split apostrophe contractions at the end of alphanumeric (and hyphenated) tokens.

    Takes the output of any of the tokenizer functions and produces an updated list.
    Every token matched by ``IS_CONTRACTION`` is cut at its last apostrophe
    (e.g. ``"I've"`` becomes ``"I"``, ``"'ve"``).  When the token ends in
    ``n`` + apostrophe + ``t`` the cut is moved one character to the left so
    the ``n`` stays with the suffix (e.g. ``"don't"`` becomes ``"do"``, ``"n't"``).

    The list is modified in place; the same list object is returned.

    :param tokens: a list of tokens
    :returns: ``tokens`` itself, with contractions split where applicable

    Credit: (adapted from 'segtok/tokenizer.py')
    """

    from segtok.tokenizer import IS_CONTRACTION, APOSTROPHES

    repeat = True

    while repeat:
        repeat = False
        idx = -1

        # Iterate over a snapshot: ``tokens`` grows while splitting, and
        # ``idx`` tracks the live position of the current snapshot token.
        for token in list(tokens):
            idx += 1

            if IS_CONTRACTION.match(token) is None:
                continue

            length = len(token)
            if length <= 1:
                continue

            # Scan right-to-left so the split happens at the last apostrophe.
            for pos in range(length - 1, -1, -1):
                if token[pos] not in APOSTROPHES:
                    continue

                is_negation = (2 < length and pos + 2 == length
                               and token[-1] == 't' and token[pos - 1] == 'n')
                cut = pos - 1 if is_negation else pos

                # A cut at position 0 would insert an empty token and leave
                # the token itself unchanged.  This happens for a bare "n't"
                # (for instance the suffix produced by an earlier pass), so
                # such tokens are left untouched.
                if cut > 0:
                    tokens.insert(idx, token[:cut])
                    idx += 1
                    tokens[idx] = token[cut:]

                    # Only non-"n't" splits trigger another pass.
                    if not is_negation:
                        repeat = True

                break
    return tokens
Exemple #2
0
 def test_unicode(self):
     # Each token uses a non-ASCII apostrophe character (U+02BC, U+2019,
     # U+2032) and must still be matched by the paired pattern.
     cases = (
         (IS_CONTRACTION, "Frank\u02BCs"),
         (IS_POSSESSIVE, "Charles\u2019"),
         (IS_POSSESSIVE, "home-less\u2032"),
     )
     for pattern, token in cases:
         self.assertIsNotNone(pattern.match(token))
Exemple #3
0
 def test_matches(self):
     # Tokens that IS_CONTRACTION is expected to match.
     for token in ("I've", "don't"):
         self.assertIsNotNone(IS_CONTRACTION.match(token))
Exemple #4
0
 def test_misses(self):
     # Tokens that IS_CONTRACTION is expected to reject: an unknown suffix
     # after the apostrophe, and a suffix with nothing before the apostrophe.
     for token in ("don'r", "'ve"):
         self.assertIsNone(IS_CONTRACTION.match(token))
Exemple #5
0
 def test_unicode(self):
     # Each token uses a non-ASCII apostrophe character (U+02BC, U+2019,
     # U+2032) and must still be matched by the paired pattern.
     cases = (
         (IS_CONTRACTION, "Frank\u02BCs"),
         (IS_POSSESSIVE, "Charles\u2019"),
         (IS_POSSESSIVE, "home-less\u2032"),
     )
     for pattern, token in cases:
         self.assertIsNotNone(pattern.match(token))
Exemple #6
0
 def test_matches(self):
     # Tokens that IS_CONTRACTION is expected to match.
     for token in ("I've", "don't"):
         self.assertIsNotNone(IS_CONTRACTION.match(token))
Exemple #7
0
 def test_misses(self):
     # Tokens that IS_CONTRACTION is expected to reject: an unknown suffix
     # after the apostrophe, and a suffix with nothing before the apostrophe.
     for token in ("don'r", "'ve"):
         self.assertIsNone(IS_CONTRACTION.match(token))