Beispiel #1
0
    def toDB(self, fileDB, fileTxt):
        """
        Read an input file, tokenize it and index its tokens in a database.

        Only tokens whose category is "word" or "digit" are recorded. The
        database maps ``token -> {base name of fileTxt -> [positions]}``; a
        token that is already present gets the new position appended.
        The resulting database content is printed before it is closed.

        :param fileDB: name of database files that will be created
        :param fileTxt: name of an input text file (read as UTF-8)
        """
        # The input is read and tokenized BEFORE the database is opened, so a
        # missing/unreadable file does not leave an empty database behind.
        # `with` guarantees the file handle is released even if read() fails.
        with open(fileTxt, "r", encoding="UTF-8") as myFile:
            toTok = myFile.read()
        data = fullTok.Tokenizer().tokenizeCat(toTok)

        # loop-invariant: every token of this call belongs to the same file
        fileName = path.basename(fileTxt)

        # writeback=True is required: the nested dict/list are mutated in
        # place, and only cached entries are written back when the shelf closes.
        # The context manager closes the shelf even if indexing raises.
        with shelve.open(fileDB, writeback=True) as db:
            for token in data:
                # only alphabetical and numerical tokens are indexed
                if token.cat in ("digit", "word"):
                    positions = db.setdefault(token.tok, {}).setdefault(fileName, [])
                    positions.append(token.pos)

            print(dict(db))
Beispiel #2
0
 def test_multipunct_str(self):
     """Runs of repeated punctuation ("!!!", "...") are expected as single tokens."""
     text = "Мама, помой раму!!! А я мыла пол, глядя на кота...эх"
     tokens = fullTok.Tokenizer().tokenizeCat(text)
     self.assertIsInstance(tokens, list)
     self.assertEqual(len(tokens), 24)
     # (index in the token list, expected token text, expected position)
     expected_punctuation = ((6, "!!!", 16), (22, "...", 47))
     for index, tok, pos in expected_punctuation:
         self.assertEqual(tokens[index].tok, tok)
         self.assertEqual(tokens[index].pos, pos)
         self.assertEqual(tokens[index].cat, "punctuation")
Beispiel #3
0
 def test_multicategory_string(self):
     """A string mixing words, a space and punctuation is split into four tokens."""
     tokens = fullTok.Tokenizer().tokenizeCat("Мама дома!")
     self.assertIsInstance(tokens, list)
     self.assertEqual(len(tokens), 4)
     # (expected token text, expected position, expected category), in order
     # NOTE(review): the first expectation is lowercase although the input is
     # capitalised - presumably the tokenizer lowercases words; confirm.
     expected = (
         ("мама", 0, "word"),
         (" ", 4, "space"),
         ("дома", 5, "word"),
         ("!", 9, "punctuation"),
     )
     for index, (tok, pos, cat) in enumerate(expected):
         self.assertEqual(tokens[index].tok, tok)
         self.assertEqual(tokens[index].pos, pos)
         self.assertEqual(tokens[index].cat, cat)
Beispiel #4
0
 def test_one_symbol_str(self):
     """A one-character string is expected to produce a list with one token."""
     tokenizer = fullTok.Tokenizer()
     tokens = tokenizer.tokenizeCat("5")
     self.assertIsInstance(tokens, list)
     self.assertEqual(len(tokens), 1)
Beispiel #5
0
 def test_not_str(self):
     """Passing a non-string (a list) is expected to raise TypeError."""
     not_a_string = [1, 2, 3]
     # the tokenizer is built inside the context, exactly where the
     # original expression evaluated it
     with self.assertRaises(TypeError):
         tokenizer = fullTok.Tokenizer()
         tokenizer.tokenizeCat(not_a_string)
Beispiel #6
0
 def test_empty_str(self):
     """An empty string is expected to produce an empty list."""
     tokenizer = fullTok.Tokenizer()
     tokens = tokenizer.tokenizeCat("")
     self.assertIsInstance(tokens, list)
     self.assertEqual(len(tokens), 0)
Beispiel #7
0
 def test_type_of_symbol_other(self):
     """checkCategory('=') is expected to return "other"."""
     tokenizer = fullTok.Tokenizer()
     category = tokenizer.checkCategory('=')
     self.assertEqual(category, "other")
Beispiel #8
0
 def test_type_of_symbol_punct(self):
     """checkCategory(',') is expected to return "punctuation"."""
     tokenizer = fullTok.Tokenizer()
     category = tokenizer.checkCategory(',')
     self.assertEqual(category, "punctuation")
Beispiel #9
0
 def test_type_of_symbol_space(self):
     """checkCategory(' ') is expected to return "space"."""
     tokenizer = fullTok.Tokenizer()
     category = tokenizer.checkCategory(' ')
     self.assertEqual(category, "space")
Beispiel #10
0
 def test_type_of_symbol_alpha(self):
     """checkCategory('Я') is expected to return "word"."""
     tokenizer = fullTok.Tokenizer()
     category = tokenizer.checkCategory('Я')
     self.assertEqual(category, "word")
Beispiel #11
0
 def test_type_of_symbol_digit(self):
     """checkCategory('1') is expected to return "digit"."""
     tokenizer = fullTok.Tokenizer()
     category = tokenizer.checkCategory('1')
     self.assertEqual(category, "digit")