Exemple #1
0
    def test_parse_tokens(self):
        """Run parse_tokens() on the walled and wall-less fixtures under two option sets."""
        # Expected result for the wall-less fixture; copied into a fresh list
        # for every comparison so each cmp_lists() call gets its own object.
        plain_expected = ('###LEFT-WALL###', 'eagle', 'has', 'wing', '.')

        # Option set 1: STRIP only (no RIGHT-WALL, no CAPS).
        strip_only = 0 | BIT_STRIP

        # Disabled check, kept for reference: the walled fixture parsed with
        # this option set was compared against
        # ['###LEFT-WALL###', 'dad', 'was', 'not', 'a', 'parent', 'before', '.'].

        # Wall-less fixture, option set 1.
        parsed = parse_tokens(self.tokens_no_walls, strip_only)[0]
        self.assertTrue(self.cmp_lists(parsed, list(plain_expected)))

        # Option set 2: RIGHT-WALL and CAPS added, STRIP cleared.
        rwall_caps = (strip_only | (BIT_RWALL | BIT_CAPS)) & ~BIT_STRIP

        # Walled fixture, option set 2.
        walled_expected = [
            '###LEFT-WALL###', 'Dad[!]', 'was.v-d', 'not.e', 'a',
            'parent.n', 'before', '.', '###RIGHT-WALL###',
        ]
        parsed = parse_tokens(self.tokens_all_walls, rwall_caps)[0]
        self.assertTrue(self.cmp_lists(parsed, walled_expected))

        # Wall-less fixture, option set 2.
        parsed = parse_tokens(self.tokens_no_walls, rwall_caps)[0]
        self.assertTrue(self.cmp_lists(parsed, list(plain_expected)))
Exemple #2
0
    def test_parse_gutenchildren_bug_002(self):
        """Regression check on the number of tokens (bug from the Gutenberg Children corpus)."""
        flags = BIT_NO_LWALL | BIT_NO_PERIOD | BIT_STRIP

        # Only the first element of the parse_tokens() result is compared.
        result = parse_tokens(gutenberg_children_bug_002t, flags)
        parsed = result[0]

        self.assertEqual(parsed, gutenberg_children_bug_002tr)
Exemple #3
0
    def test_parse_no_period_if_no_period(self):
        """Parse the fixture that has neither walls nor a period."""
        flags = 0 | (BIT_STRIP | BIT_NO_PERIOD | BIT_RWALL)
        expected = ['###LEFT-WALL###', 'eagle', 'has', 'wing']

        # Only the first element of the parse_tokens() result is checked.
        parsed = parse_tokens(self.tokens_no_walls_no_period, flags)[0]

        self.assertTrue(self.cmp_lists(parsed, expected))
Exemple #4
0
    def test_parse_tokens_no_period(self):
        """Parse the wall-less fixture with STRIP, NO_PERIOD and RWALL set.

        The expected token list contains the left wall and the words only;
        it has no trailing period entry.
        """
        options = BIT_STRIP | BIT_NO_PERIOD | BIT_RWALL

        # Only the first element of the parse_tokens() result is checked.
        tokens = parse_tokens(self.tokens_no_walls, options)[0]

        # The stray debug print was removed so the test no longer writes to
        # stdout on every run; the assertion below is the only check needed.
        self.assertTrue(self.cmp_lists(tokens, ['###LEFT-WALL###', 'eagle', 'has', 'wing']))
Exemple #5
0
    def test_parse_tokens_no_walls_no_period(self):
        """Parse the walled fixture with STRIP, NO_PERIOD and NO_LWALL set."""
        flags = 0 | (BIT_STRIP | BIT_NO_PERIOD | BIT_NO_LWALL)
        expected = ['dad', 'was', 'not', 'a', 'parent', 'before']

        # Only the first element of the parse_tokens() result is checked.
        parsed = parse_tokens(self.tokens_all_walls, flags)[0]

        self.assertTrue(self.cmp_lists(parsed, expected))
Exemple #6
0
    def test_parse_tokens_no_left_wall(self):
        """Parse the walled fixture with NO_LWALL and CAPS set (no STRIP)."""
        flags = 0 | (BIT_CAPS | BIT_NO_LWALL)
        expected = [
            'Dad[!]', 'was.v-d', 'not.e', 'a',
            'parent.n', 'before', '.',
        ]

        # Only the first element of the parse_tokens() result is checked.
        parsed = parse_tokens(self.tokens_all_walls, flags)[0]

        self.assertTrue(self.cmp_lists(parsed, expected))
Exemple #7
0
    def test_parse_tokens_alice_004(self):
        """Square-bracket parsing regression revealed by the Alice in Wonderland corpus."""
        flags = BIT_STRIP | BIT_NO_LWALL | BIT_NO_PERIOD

        # Input string in parenthesised-token form.
        postscript = (
            "(LEFT-WALL)(posting.g)(date.n)(:.j)(@date@[?].a)([)(ebook[?].a)([#])(@number@[?].n)(])(release.n)"
            "(date.n)(:.j)([@date@])(last.ord)(updated.v-d)(:.v)(@date@[?].n)"
        )

        # Token list the parser is expected to produce for the input above.
        expected = [
            "###LEFT-WALL###", "posting", "date", ":", "@date@", "[", "ebook", "[#]", "@number@", "]", "release",
            "date", ":", "[@date@]", "last", "updated", ":", "@date@",
        ]

        parsed = parse_tokens(postscript, flags)[0]
        self.assertEqual(expected, parsed)
Exemple #8
0
    def test_parse_tokens_alice_003(self):
        """Regression for parsing of '[(]' revealed by the Alice in Wonderland corpus."""
        flags = BIT_STRIP | BIT_NO_LWALL | BIT_NO_PERIOD

        # Source sentence:
        # "(alice had no idea what latitude was, or longitude either, but
        #   thought they were nice grand words to say.)"
        postscript = (
            "(LEFT-WALL)([(])(alice[?].n)(had.v-d)(no.misc-d)(idea.n)(what)(latitude.n-u)(was.v-d)(,)(or.ij)"
            "(longitude.n-u)(either.r)(,)([but])(thought.q-d)(they)(were.v-d)(nice.a)(grand.a)(words.n)(to.r)(say.v)"
            "(.)([)])"
        )

        # Token list the parser is expected to produce for the input above.
        expected = [
            "###LEFT-WALL###", "[(]", "alice", "had", "no", "idea", "what", "latitude", "was", ",", "or", "longitude",
            "either", ",", "[but]", "thought", "they", "were", "nice", "grand", "words", "to", "say", ".", "[)]",
        ]

        parsed = parse_tokens(postscript, flags)[0]
        self.assertEqual(expected, parsed)