Example 1
import pickle
import unittest

# Import path inferred from the dotted token examples below; treat it as an assumption.
from sourced.ml.algorithms import TokenParser


class TokenParserTests(unittest.TestCase):
    def setUp(self):
        self.tp = TokenParser(stem_threshold=4, max_token_length=20)
        self.tp._single_shot = False

    def test_process_token(self):
        self.tp.max_token_length = 100

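        # Each pair maps an identifier to its expected subtokens: the parser
        # splits on case changes, digits and separators, stems the pieces,
        # and (outside single-shot mode) also emits short neighbours merged,
        # e.g. "wdsize" alongside "wd" and "size".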
        tokens = [
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sqlt", "hing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalm", "bug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size", "wdsize"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag", [
                "sourc", "sourcedml", "algorithm", "mlalgorithm", "uast",
                "ids", "idsto", "bag", "tobag"
            ]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Known-bad case: the parser does not split this correctly.
            ("SmallIdsToFoOo", ["small", "ids", "idsto", "fo", "oo"]),
            ("SmallIdFooo", ["small", "smallid", "fooo", "idfooo"]),
            ("ONE_M0re_.__badId.example", [
                "one", "onem", "re", "bad", "rebad", "badid", "exampl",
                "idexampl"
            ]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh",
             ["looong", "looongsh", "loooong", "shloooong", "loooongsh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"])
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)

    def test_process_token_single_shot(self):
        self.tp.max_token_length = 100
        self.tp._single_shot = True
        self.tp.min_split_length = 1
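        # In single-shot mode each subtoken is emitted exactly once, without
        # the merged-neighbour variants produced in the test above.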
        tokens = [
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sqlt", "hing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalm", "bug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourc", "ml", "algorithm", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Known-bad case: the parser does not split this correctly.
            ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
            ("SmallIdFooo", ["small", "id", "fooo"]),
            ("ONE_M0re_.__badId.example",
             ["one", "m", "re", "bad", "id", "exampl"]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"])
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)

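        # Raising min_split_length should drop every subtoken shorter than
        # the threshold from the same expected outputs.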
        min_split_length = 3
        self.tp.min_split_length = min_split_length
        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(
                res, [c for c in correct if len(c) >= min_split_length])

    def test_split(self):
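        # split() accepts free-form text; punctuation and separators are
        # discarded before subtoken extraction.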
        self.assertEqual(list(self.tp.split("set for")), ["set", "for"])
        self.assertEqual(list(self.tp.split("set /for.")), ["set", "for"])
        self.assertEqual(list(self.tp.split("NeverHav")), ["never", "hav"])
        self.assertEqual(list(self.tp.split("PrintAll")), ["print", "all"])
        self.assertEqual(list(self.tp.split("PrintAllExcept")),
                         ["print", "all", "except"])
        self.assertEqual(
            list(self.tp.split("print really long line")),
            # 'longli' is an expected artifact due to edge effects
            ["print", "really", "long", "longli"])
        self.assertEqual(list(self.tp.split("set /for. *&PrintAll")),
                         ["set", "for", "print", "all"])
        self.assertEqual(list(self.tp.split("JumpDown not Here")),
                         ["jump", "down", "not", "here"])

        self.assertEqual(list(self.tp.split("a b c d")), ["a", "b", "c", "d"])
        self.assertEqual(list(self.tp.split("a b long c d")),
                         ["a", "b", "long", "blong", "longc", "d"])
        self.assertEqual(list(self.tp.split("AbCd")), ["ab", "cd"])

    def test_split_single_shot(self):
        self.tp._single_shot = True
        self.tp.min_split_length = 1
        self.assertEqual(
            list(self.tp.split("print really long line")),
            # 'li' is an expected artifact due to edge effects
            ["print", "really", "long", "li"])
        self.assertEqual(list(self.tp.split("a b c d")), ["a", "b", "c", "d"])
        self.assertEqual(list(self.tp.split("a b long c d")),
                         ["a", "b", "long", "c", "d"])
        self.assertEqual(list(self.tp.split("AbCd")), ["ab", "cd"])

    def test_stem(self):
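        # Words below the stem threshold and capitalised words are expected
        # to pass through unchanged.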
        self.assertEqual(self.tp.stem("lol"), "lol")
        self.assertEqual(self.tp.stem("apple"), "appl")
        self.assertEqual(self.tp.stem("orange"), "orang")
        self.assertEqual(self.tp.stem("embedding"), "embed")
        self.assertEqual(self.tp.stem("Alfred"), "Alfred")
        self.assertEqual(self.tp.stem("Pluto"), "Pluto")

    def test_pickle(self):
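        # The parser, including its stemmer, must survive a pickle round-trip.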
        tp = pickle.loads(pickle.dumps(self.tp))
        self.assertEqual(tp.stem("embedding"), "embed")
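
A minimal usage sketch of the parser exercised above. The import path is
inferred from the dotted token example in the test data, and the printed
outputs are copied from the assertions, so treat both as assumptions about
this particular TokenParser version:

from sourced.ml.algorithms import TokenParser  # assumed import path

parser = TokenParser(stem_threshold=4, max_token_length=100)
parser._single_shot = False

# Identifiers are split on case changes, digits and separators, each piece
# is stemmed, and short neighbouring pieces are also emitted merged.
print(list(parser.process_token("FooBar100500Bingo")))  # ['foo', 'bar', 'bingo']
print(list(parser.process_token("wdSize")))             # ['wd', 'size', 'wdsize']
print(parser.stem("embedding"))                         # 'embed'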
Example 2
import pickle
import unittest

# Import path inferred from the dotted token examples below; treat it as an assumption.
from sourced.ml.algorithms import TokenParser


class TokenParserTests(unittest.TestCase):
    def setUp(self):
        self.tp = TokenParser(stem_threshold=4, max_token_length=20)

    def test_process_token(self):
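        # Preserve the configured limit and restore it after the test.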
        _max_token_length = self.tp.max_token_length
        self.tp.max_token_length = 100

        tokens = [
            ("sourced.ml.algorithms.uast_ids_to_bag", [
                "sourc", "sourcedml", "algorithm", "mlalgorithm", "uast",
                "ids", "idsto", "bag", "tobag"
            ]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Known-bad case: the parser does not split this correctly.
            ("SmallIdsToFoOo", ["small", "ids", "idsto", "fo", "oo"]),
            ("SmallIdFooo", ["small", "smallid", "fooo", "idfooo"]),
            ("ONE_M0re_.__badId.example", [
                "one", "onem", "re", "bad", "rebad", "badid", "exampl",
                "idexampl"
            ]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh",
             ["looong", "looongsh", "loooong", "shloooong", "loooongsh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"])
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)
        self.tp.max_token_length = _max_token_length

    def test_split(self):
        self.assertEqual(list(self.tp.split("set for")), ["set", "for"])
        self.assertEqual(list(self.tp.split("set /for.")), ["set", "for"])
        self.assertEqual(list(self.tp.split("NeverHav")), ["never", "hav"])
        self.assertEqual(list(self.tp.split("PrintAll")), ["print", "all"])
        self.assertEqual(list(self.tp.split("PrintAllExcept")),
                         ["print", "all", "except"])
        self.assertEqual(
            list(self.tp.split("print really long line")),
            # 'longli' is an expected artifact due to edge effects
            ["print", "really", "long", "longli"])
        self.assertEqual(list(self.tp.split("set /for. *&PrintAll")),
                         ["set", "for", "print", "all"])
        self.assertEqual(list(self.tp.split("JumpDown not Here")),
                         ["jump", "down", "not", "here"])

    def test_stem(self):
        self.assertEqual(self.tp.stem("lol"), "lol")
        self.assertEqual(self.tp.stem("apple"), "appl")
        self.assertEqual(self.tp.stem("orange"), "orang")
        self.assertEqual(self.tp.stem("embedding"), "embed")
        self.assertEqual(self.tp.stem("Alfred"), "Alfred")
        self.assertEqual(self.tp.stem("Pluto"), "Pluto")

    def test_pickle(self):
        tp = pickle.loads(pickle.dumps(self.tp))
        self.assertEqual(tp.stem("embedding"), "embed")