def test_unknown_token(self):
    """Verify that process() passes unrecognized tokens through unchanged."""
    converter = ATFConverter(two_three=True)
    tokens = ["a2", "☉", "be3"]
    expected = ["a₂", "☉", "be₃"]
    self.assertEqual(converter.process(tokens), expected)
def test_accents(self):
    """Verify process() output when two_three is disabled (accent mode)."""
    converter = ATFConverter(two_three=False)
    tokens = ["a", "a2", "a3", "be2", "bad3", "buru14"]
    expected = ["a", "á", "à", "bé", "bàd", "buru₁₄"]
    self.assertEqual(converter.process(tokens), expected)
def test_single_sign(self):
    """Verify process() output when two_three is enabled (subscript mode)."""
    converter = ATFConverter(two_three=True)
    tokens = ["a", "a1", "a2", "a3", "be2", "be3", "bad2", "bad3"]
    expected = ["a", "a₁", "a₂", "a₃", "be₂", "be₃", "bad₂", "bad₃"]
    self.assertEqual(converter.process(tokens), expected)
def test_convert_num(self):
    """Verify the private _convert_num helper on signs with and without digits."""
    converter = ATFConverter()
    tokens = ["a2", "☉", "be3", "bad3"]
    expected = ["a₂", "☉", "be₃", "bad₃"]
    # pylint: disable=protected-access
    converted = [converter._convert_num(token) for token in tokens]
    self.assertEqual(converted, expected)
def test_get_number_from_sign(self):
    """Verify that _get_number_from_sign extracts the trailing digit value."""
    converter = ATFConverter()
    tokens = ["a", "a1", "be2", "bad3", "buru14"]
    expected = [0, 1, 2, 3, 14]
    # pylint: disable=protected-access
    numbers = [converter._get_number_from_sign(token)[1] for token in tokens]
    self.assertEqual(numbers, expected)
def test_convert_consonant(self):
    """Verify that _convert_consonant maps ASCII digraphs to special characters."""
    converter = ATFConverter()
    tokens = ['as,', 'S,ATU', 'tet,', 'T,et', 'sza', 'ASZ']
    expected = ['aṣ', 'ṢATU', 'teṭ', 'Ṭet', 'ša', 'AŠ']
    # pylint: disable=protected-access
    converted = [converter._convert_consonant(token) for token in tokens]
    self.assertEqual(converted, expected)
"""Frequency analysis of the transliteration of one CDLI text (P249253).

Reads the Akkadian corpus file, tokenizes the transliteration lines of
text P249253, filters out stopwords, normalizes each remaining word with
ATFConverter, and prints the 11 most common words.
"""
from collections import Counter

from Importer.file_importer import FileImport
from Importer.cdli_corpus import CDLICorpus
from ATFConverter.tokenizer import Tokenizer
from ATFConverter.atf_converter import ATFConverter

file_import = FileImport('texts/Akkadian.txt')
file_import.read_file()

corpus = CDLICorpus()
corpus.parse_file(file_import.file_lines)

tokenizer = Tokenizer()
converter = ATFConverter()

# Words (and noise tokens) excluded from the frequency count.
stopwords = ['a-na', 'u3', 'sza', '[...]', 'i-na', '=', 'ARM', '01,', 'lang',
             'akk', 'um-ma', 'la', 'u2-ul', 'mesz_', 'asz-szum', '0.1',
             'broken', 'isz-tu', '_lu2_', 'ki-a-am', '1(disz)', 'ki-ma', 'x',
             'sza-a-ti', 'the', '_lu2', '...]', 'lu-u2', 'sza#', 'a-na#',
             '_u4', 'beginning', 'of', '2(disz)', '[a-na', 'szum-ma', 'hi-a_',
             'ana', 'a-di']

# Normalize each non-stopword token: split on hyphens, convert each sign,
# then rejoin with hyphens.
bag_of_words = [
    '-'.join(converter.process(word[0].split('-')))
    for lines in corpus.catalog['P249253']['transliteration']
    for word in tokenizer.word_tokenizer(lines)
    if word[0] not in stopwords
]

frequency_analysis = Counter(bag_of_words).most_common(11)
print(frequency_analysis)