def toDB(self, fileDB, fileTxt):
    """
    Reads an input file, tokenizes it and records its tokens into a database.

    Only tokens whose category is "digit" or "word" are recorded. The
    database maps each token string to a dictionary keyed by the base name
    of the input file, whose value is the list of positions at which the
    token occurs. The whole database content is printed before it is closed.

    :param fileDB: name of database files that will be created
    :param fileTxt: name of an input text file
    """
    # Both resources are managed by ``with`` so they are released (and the
    # shelf is synced) even if reading or tokenizing raises.
    with shelve.open(fileDB, writeback=True) as db:
        with open(fileTxt, "r", encoding="UTF-8") as myFile:
            toTok = myFile.read()
        # input file is tokenized
        data = fullTok.Tokenizer().tokenizeCat(toTok)
        # the inner key is the same for every token, so compute it once
        fileName = path.basename(fileTxt)
        for token in data:
            # only alphabetical and numerical tokens go into the database
            if token.cat in ("digit", "word"):
                # writeback=True keeps the nested dict/list mutable in place,
                # so an already known token simply gets a new position
                db.setdefault(token.tok, {}).setdefault(fileName, []).append(token.pos)
        # diagnostic dump of the database, kept from the original behaviour
        print(dict(db))
def test_multipunct_str(self):
    # A run of identical punctuation marks ("!!!", "...") is expected to be
    # returned as one punctuation token, not one token per character.
    text = "Мама, помой раму!!! А я мыла пол, глядя на кота...эх"
    tokens = fullTok.Tokenizer().tokenizeCat(text)
    self.assertIsInstance(tokens, list)
    self.assertEqual(len(tokens), 24)
    # (index in the token list, expected token string, expected position)
    expected_runs = ((6, "!!!", 16), (22, "...", 47))
    for index, tok, pos in expected_runs:
        token = tokens[index]
        self.assertEqual(token.tok, tok)
        self.assertEqual(token.pos, pos)
        self.assertEqual(token.cat, "punctuation")
def test_multicategory_string(self):
    # A string mixing words, a space and punctuation is expected to yield
    # one token per run, each with its own string, position and category.
    tokens = fullTok.Tokenizer().tokenizeCat("Мама дома!")
    self.assertIsInstance(tokens, list)
    self.assertEqual(len(tokens), 4)
    # expected (token string, position, category), in token order
    expected = (
        ("мама", 0, "word"),
        (" ", 4, "space"),
        ("дома", 5, "word"),
        ("!", 9, "punctuation"),
    )
    for index, (tok, pos, cat) in enumerate(expected):
        token = tokens[index]
        self.assertEqual(token.tok, tok)
        self.assertEqual(token.pos, pos)
        self.assertEqual(token.cat, cat)
def test_one_symbol_str(self):
    # A one-character input is expected to produce a list with one token.
    tokenizer = fullTok.Tokenizer()
    tokens = tokenizer.tokenizeCat("5")
    self.assertIsInstance(tokens, list)
    self.assertEqual(len(tokens), 1)
def test_not_str(self):
    # Passing a non-string (here a list) is expected to raise TypeError.
    not_a_string = [1, 2, 3]
    with self.assertRaises(TypeError):
        tokenizer = fullTok.Tokenizer()
        tokenizer.tokenizeCat(not_a_string)
def test_empty_str(self):
    # An empty input string is expected to produce an empty list.
    tokenizer = fullTok.Tokenizer()
    tokens = tokenizer.tokenizeCat("")
    self.assertIsInstance(tokens, list)
    self.assertEqual(len(tokens), 0)
def test_type_of_symbol_other(self):
    # '=' is expected to be categorised as "other".
    tokenizer = fullTok.Tokenizer()
    category = tokenizer.checkCategory('=')
    self.assertEqual(category, "other")
def test_type_of_symbol_punct(self):
    # ',' is expected to be categorised as "punctuation".
    tokenizer = fullTok.Tokenizer()
    category = tokenizer.checkCategory(',')
    self.assertEqual(category, "punctuation")
def test_type_of_symbol_space(self):
    # A single space character is expected to be categorised as "space".
    tokenizer = fullTok.Tokenizer()
    category = tokenizer.checkCategory(' ')
    self.assertEqual(category, "space")
def test_type_of_symbol_alpha(self):
    # A Cyrillic letter is expected to be categorised as "word".
    tokenizer = fullTok.Tokenizer()
    category = tokenizer.checkCategory('Я')
    self.assertEqual(category, "word")
def test_type_of_symbol_digit(self):
    # The character '1' is expected to be categorised as "digit".
    tokenizer = fullTok.Tokenizer()
    category = tokenizer.checkCategory('1')
    self.assertEqual(category, "digit")