Example #1
    def test_tokenize_case_sensitive(self):
        """obtain tokens in case sensitive manner"""

        tokenizer = Kmerizer(k=10, case_sensitive=True)
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        result = tokens["U"]["data"]
        self.assertTrue("ABCDEFG" in result)
        self.assertEqual(result.data["ABCDEFG"], 1)
Example #2
    def test_tokenize_file(self):
        """obtain tokens for all entries in a dataset"""

        tokenizer = Kmerizer()
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        self.assertTrue("A" in tokens)
        self.assertTrue("ZZ" in tokens)
        self.assertGreater(len(tokens), 3)
Example #3
    def test_tokenize_entry_structure_aux_neg(self):
        """obtain tokens also from aux_neg fields"""

        tokenizer = Kmerizer()
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        result = tokens["C"]
        self.assertTrue("aux_neg" in result)
        self.assertTrue("bob" in result["aux_neg"])
        self.assertEqual(result["aux_neg"].data["bob"], 1)
Example #4
    def test_tokenize_entry_structure(self):
        """obtain tokens for every component of a document"""

        tokenizer = Kmerizer()
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        result = tokens["A"]
        self.assertTrue("data" in result)
        # data_neg is not in the result because the dataset does not define data_neg
        self.assertFalse("data_neg" in result)
Example #5
    def test_count_all_tokens(self):
        """obtain summary of tokens in all documents"""

        tokenizer = Kmerizer(k=10)
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        result = token_counts(tokens)
        self.assertTrue("data" in result)
        self.assertGreater(result.data["with"], 1)
        self.assertEqual(result.data["abcdefg"], 1)
Example #6
    def test_tokenize_case_insensitive(self):
        """obtain tokens, all in lowercase"""

        tokenizer = Kmerizer(k=10, case_sensitive=False)
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        result = tokens["U"]["data"]
        self.assertFalse("ABCDEFG" in result)
        self.assertTrue("abcdefg" in result)
        self.assertEqual(result.data["abcdefg"], 1)
Example #7
    def test_tokenize_documents(self):
        """obtain tokens from documents"""

        tokenizer = Kmerizer(k=5)
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        result = tokens["D"]
        # data component should only be based on:
        #  Daniel (2 tokens), starts (2 tokens), with (1 token), D (1 token)
        self.assertEqual(len(result["data"]), 6)
        self.assertTrue("danie" in result["data"])
        self.assertTrue("with" in result["data"])
        # weights of short tokens should have value 1,
        # parts of longer words should have lower values
        self.assertEqual(result["data"].data["with"], 1.0)
        self.assertLess(result["data"].data["start"], 1.0)
        self.assertLess(result["data"].data["tarts"], 1.0)
Example #8
class KmerizerSpecialCasesTests(unittest.TestCase):
    """Tokenize when yaml data has special cases"""

    tokenizer = Kmerizer(k=20)

    @classmethod
    def setUpClass(cls):
        tokens = dict()
        for id, data in cls.tokenizer.tokenize_path(specialcases_file):
            tokens[id] = data
        cls.tokens = tokens

    def test_tokenize_arrays_without_quotes(self):
        """tokenize when yaml array does not have quotes"""

        result = self.tokens["array_without_quotes"]["data"]
        self.assertTrue("abcdef" in result)
        self.assertTrue("mnopqr" in result)

    def test_tokenize_arrays_with_quotes(self):
        """tokenize when yaml array has quote around each item"""

        result = self.tokens["array_with_quotes"]["data"]
        self.assertTrue("abc123" in result)
        self.assertTrue("abc789" in result)

    def test_tokenize_unicode(self):
        """tokenize when yaml array does not have quotes"""

        result = self.tokens["alpha"]
        self.assertTrue("alpha" in result["data"])

    def test_tokenize_multiline(self):
        """tokenize when yaml has empty lines within items"""

        result = self.tokens["multiline"]
        self.assertTrue("multiple" in result["data"])

    def test_tokenize_data_dictionaries(self):
        """tokenize when data is itself a dictionary"""

        result = self.tokens["dictionary"]
        # content of dictionary should be tokenized
        self.assertTrue("alpha" in result["data"])
        # keys to the dictionary should not be tokenized
        self.assertFalse("short" in result["data"])
        self.assertFalse("long" in result["data"])

    def test_tokenize_data_with_slashes(self):
        """tokenize when data has slashes"""

        result = self.tokens["slashes"]
        self.assertEqual(len(result["data"]), 3)
        self.assertTrue("a" in result["data"])
        self.assertTrue("b" in result["data"])
        self.assertTrue("c" in result["data"])
Example #9
    def test_tokenize_with_nondefault_alphanet(self):
        """tokenizing with alphabet with missing letters introduces spaces"""

        # custom tokenizer that does not allow the letter "a"
        tokenizer = Kmerizer(k=5, alphabet="bcdefghijklmnopqrstuvwxyz")
        tokens = dict()
        for id, data in tokenizer.tokenize_path(dataset_file):
            tokens[id] = data
        # item Alice - has a word just with letter A
        dataA = tokens["A"]["data"].data
        # letter "A" can turn into " " and reduce to ""
        # avoid tokens that are empty
        self.assertFalse("a" in dataA)
        keysA = list(dataA.keys())
        for k in keysA:
            self.assertNotEqual(k, "")
        # item Daniel - has letter "a" on a boundary of k=5
        keysD = list(tokens["D"]["data"].data.keys())
        # "Daniel" can create "D niel" which can kmerize to " niel"
        # avoid tokens that start with a space
        self.assertEqual(keysD, [_.strip() for _ in keysD])
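The space-substitution behaviour described in the comments can be mimicked in a few lines. This sketch is an assumption, not the crossmap source: characters outside the alphabet are replaced by spaces, and the resulting empty fragments are discarded, which is exactly what the assertions above guard against.

def apply_alphabet(text, alphabet="bcdefghijklmnopqrstuvwxyz"):
    """Replace characters outside the alphabet by spaces, then drop empty
    fragments so that no token is "" and none starts with a space."""
    cleaned = "".join(c if c in alphabet else " " for c in text.lower())
    return [fragment for fragment in cleaned.split() if fragment]

print(apply_alphabet("A"))       # [] - the single-letter word disappears
print(apply_alphabet("Daniel"))  # ['d', 'niel'] - split where 'a' was removed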
Example #10
    def test_init_default(self):
        """Configure a tokenizer with default alphabet"""

        tokenizer = Kmerizer()
        self.assertTrue("a" in tokenizer.alphabet)
        self.assertTrue("0" in tokenizer.alphabet)
Example #11
    def test_init_small_k_int(self):
        """Configure a tokenizer with custom k (one value)"""

        tokenizer = Kmerizer(k=2)
        self.assertEqual(tokenizer.k, (2, 4))
Example #12
    def test_init_small_k_array(self):
        """Configure a tokenizer with custom k (one value)"""

        tokenizer = Kmerizer(k=[2, 3])
        self.assertEqual(tokenizer.k, (2, 3))
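Taken together, the two constructor tests above suggest how the k argument is normalized. The helper below is only a sketch of that assumed behaviour, not the actual Kmerizer code: a single integer k expands to the pair (k, 2*k), while a sequence is used directly as a (min, max) pair.

def normalize_k(k):
    """Assumed normalization of the k argument into a (min, max) tuple."""
    if isinstance(k, int):
        return (k, 2 * k)
    return tuple(k)

print(normalize_k(2))       # (2, 4)
print(normalize_k([2, 3]))  # (2, 3)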
Example #13
    def setUp(self):
        self.encoder = CrossmapEncoder(test_map, Kmerizer(k=4))
Example #14
    def setUp(self):
        self.builder = CrossmapEncoder(alphabet_map, Kmerizer(k=4))