def test_word_equality(self):
    """
    Verify equality and inequality between independently built words.

    Two words built from identical arguments must compare equal, while
    two words that differ only in the tone of the last syllable
    (``56`` versus ``65``) must compare unequal.
    """
    ipa = 'koka-nu-po^{12.3.4.56}'
    gloss = 'PART-B-C'

    # Positive case: separate objects built from the same arguments.
    # The dictionary-key test in this file already relies on this.
    self.assertEqual(make_word(ipa, gloss, 'N'), make_word(ipa, gloss, 'N'))

    # Negative case: only the final tone differs.
    for a, b in [
        (make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N'),
         make_word('koka-nu-po^{12.3.4.65}', 'PART-B-C', 'N')),
    ]:
        self.assertNotEqual(a, b)
def test_word_dict_key(self):
    """
    Words can be dictionary keys.

    Every access below builds a brand-new word, so the test only passes
    when words built from the same arguments land on the same entry and
    a word with a different IPA string lands on its own entry.
    """
    koka = ('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N')
    koke = ('koke-nu-po^{12.3.4.56}', 'PART-B-C', 'N')

    tally = {}
    tally[make_word(*koka)] = 1
    tally[make_word(*koka)] += 1
    tally[make_word(*koke)] = 1

    self.assertEqual(tally[make_word(*koka)], 2)
    self.assertEqual(tally[make_word(*koke)], 1)
def test_word_parsing(self):
    """
    A word produced by make_word equals a Word assembled by hand.

    The hand-built side combines make_morphemes (hyphenated form plus
    glosses), make_syllables (unhyphenated form plus tones) and the
    category.
    """
    # All fixtures are built up front, before any assertion runs.
    cases = [
        (
            make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N'),
            Word(
                make_morphemes('koka-nu-po', 'PART-B-C'),
                make_syllables('kokanupo', '12.3.4.56'),
                'N'
            ),
        ),
        (
            make_word('a^{1}', 'A', 'V'),
            Word(
                make_morphemes('a', 'A'),
                make_syllables('a', '1'),
                'V'
            ),
        ),
    ]
    for from_parser, by_hand in cases:
        self.assertEqual(from_parser, by_hand)
def test_word_complete_morphemes(self):
    """
    Check the items yielded by Word.iter_complete_morphemes.

    Each expected item pairs a morpheme with a tuple of syllables; the
    last case expects no items at all.

    NOTE(review): judging by the fixtures, a morpheme presumably counts
    as "complete" when it covers whole syllables by itself -- confirm
    against iter_complete_morphemes.
    """
    # All fixtures are built up front, before any assertion runs.
    cases = [
        (
            make_word('koka-n-o^{12.3.4}', 'A-B-C', 'N'),
            [(make_morpheme('koka', 'A'),
              tuple(make_syllables('koka', '12.3')))],
        ),
        (
            make_word('bo^{1}', 'A', 'N'),
            [(make_morpheme('bo', 'A'),
              tuple(make_syllables('bo', '1')))],
        ),
        (
            make_word('b-o-kana-p-o^{1.2.2.3}', 'PART-PART-C-D-E', 'N'),
            [(make_morpheme('kana', 'C'),
              tuple(make_syllables('kana', '2.2')))],
        ),
        (
            make_word('b-o-p-o^{1.2}', 'PART-A-B-C', 'N'),
            [],
        ),
    ]
    for word, expected in cases:
        found = list(word.iter_complete_morphemes())
        self.assertEqual(found, expected)
def test_word_equality(self):
    """
    Verify equality and inequality between independently built words.

    Two words built from identical arguments must compare equal, while
    two words that differ only in the tone of the last syllable
    (``56`` versus ``65``) must compare unequal.
    """
    ipa = 'koka-nu-po^{12.3.4.56}'
    gloss = 'PART-B-C'

    # Positive case: separate objects built from the same arguments.
    # The dictionary-key test in this file already relies on this.
    self.assertEqual(make_word(ipa, gloss, 'N'), make_word(ipa, gloss, 'N'))

    # Negative case: only the final tone differs.
    for a, b in [
        (make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N'),
         make_word('koka-nu-po^{12.3.4.65}', 'PART-B-C', 'N')),
    ]:
        self.assertNotEqual(a, b)
def test_word_parsing(self):
    """
    A word produced by make_word equals a Word assembled by hand.

    The hand-built side combines make_morphemes (hyphenated form plus
    glosses), make_syllables (unhyphenated form plus tones) and the
    category.
    """
    # All fixtures are built up front, before any assertion runs.
    multi_morpheme = (
        make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N'),
        Word(
            make_morphemes('koka-nu-po', 'PART-B-C'),
            make_syllables('kokanupo', '12.3.4.56'),
            'N'
        ),
    )
    single_morpheme = (
        make_word('a^{1}', 'A', 'V'),
        Word(
            make_morphemes('a', 'A'),
            make_syllables('a', '1'),
            'V'
        ),
    )
    for from_parser, by_hand in (multi_morpheme, single_morpheme):
        self.assertEqual(from_parser, by_hand)
def test_word_complete_morphemes(self):
    """
    Check the items yielded by Word.iter_complete_morphemes.

    Each expected item pairs a morpheme with a tuple of syllables; the
    last case expects no items at all.

    NOTE(review): judging by the fixtures, a morpheme presumably counts
    as "complete" when it covers whole syllables by itself -- confirm
    against iter_complete_morphemes.
    """
    # All fixtures are built up front, before any assertion runs.
    cases = [
        (
            make_word('koka-n-o^{12.3.4}', 'A-B-C', 'N'),
            [(make_morpheme('koka', 'A'),
              tuple(make_syllables('koka', '12.3')))],
        ),
        (
            make_word('bo^{1}', 'A', 'N'),
            [(make_morpheme('bo', 'A'),
              tuple(make_syllables('bo', '1')))],
        ),
        (
            make_word('b-o-kana-p-o^{1.2.2.3}', 'PART-PART-C-D-E', 'N'),
            [(make_morpheme('kana', 'C'),
              tuple(make_syllables('kana', '2.2')))],
        ),
        (
            make_word('b-o-p-o^{1.2}', 'PART-A-B-C', 'N'),
            [],
        ),
    ]
    for word, expected in cases:
        found = list(word.iter_complete_morphemes())
        self.assertEqual(found, expected)
def test_bad_word_tone(self):
    """
    Words without well structured tone markings are rejected.

    Every malformed form below must make make_word raise BadIPATone.
    """
    malformed_forms = [
        'cat123',
        'cat^{}',
        'cat^{abc}',
        'cat^1',
        'cat^{1',
        'cat^1}',
        'cat^{a^bc}',
        'cat^{a^bc}^',
    ]
    for ipa in malformed_forms:
        # assertRaises invokes make_word with the trailing arguments.
        self.assertRaises(BadIPATone, make_word, ipa, 'gloss', 'category')
def test_bad_word_tone(self):
    """
    Words without well structured tone markings are rejected.

    Every malformed form below must make make_word raise BadIPATone.
    """
    malformed_forms = [
        'cat123',
        'cat^{}',
        'cat^{abc}',
        'cat^1',
        'cat^{1',
        'cat^1}',
        'cat^{a^bc}',
        'cat^{a^bc}^',
    ]
    for ipa in malformed_forms:
        # assertRaises invokes make_word with the trailing arguments.
        self.assertRaises(BadIPATone, make_word, ipa, 'gloss', 'category')
def load_word_counts(filename):
    """
    Reads in a file and returns a dictionary of words mapped to counts.

    Rows come from ``csv_rows(filename)`` and are read through the keys
    "count", "IPA", "Gloss", "Text" and "Category". The IPA and gloss
    cells are split on '/' and paired positionally; each pair is turned
    into a word with ``make_word`` and that word's tally grows by the
    row's count.

    Row handling:
      * rows whose IPA contains '*' are skipped silently;
      * a WordParseError, or an IndexError whose first argument is one
        of the known skip characters, is reported on stdout and loading
        continues with the next row (words already tallied from that row
        before the failure stay tallied);
      * any other exception is reported with its line number and
        re-raised.

    Returns a defaultdict mapping each word to its summed count; words
    never seen read as 0.
    """
    # Built once: an IndexError carrying one of these as its first
    # argument is reported as a bad character instead of aborting.
    skipword_characters = {'?'}

    word_counts = defaultdict(int)

    # Numbering starts at 2 -- presumably line 1 of the file is the
    # header row; confirm against csv_rows.
    for line_number, raw_row in enumerate(csv_rows(filename), 2):
        count = int(raw_row["count"])
        ipa = raw_row["IPA"]
        if '*' in ipa:
            continue

        # Fixes random badness.. hopefully doesn't hide anything?
        mod_ipa = ipa.replace('(', '').replace(')', '')

        # Work around a passage with an error in it: fall back to the
        # "Text" column when "Gloss" is empty.
        gloss = raw_row["Gloss"] or raw_row["Text"]
        category = raw_row["Category"]

        try:
            # NOTE(review): izip stops at the shorter side, so a row with
            # unequal numbers of IPA and gloss alternatives silently
            # drops the extras -- confirm that this is intended.
            for i, g in izip(mod_ipa.split('/'), gloss.split('/')):
                word_counts[make_word(i, g, category)] += count
        except WordParseError as e:
            print((u"Error on line %d: %s [%s || %s]" %
                   (line_number, repr(e), ipa, gloss)).encode('utf-8'))
        except IndexError as e:
            # An IndexError may carry no arguments at all; indexing
            # e.args blindly would raise a second IndexError here and
            # hide the original one.
            unknown_index = e.args[0] if e.args else None
            if unknown_index in skipword_characters:
                print((u"Bad char on line %d: %s [%s || %s]" %
                       (line_number, repr(e), ipa, gloss)).encode('utf-8'))
            else:
                print("FATAL ERROR ON LINE %d" % line_number)
                raise
        except:
            # Deliberately bare: nothing is swallowed, the line number is
            # reported and the exception propagates unchanged.
            print("FATAL ERROR ON LINE %d" % line_number)
            raise

    return word_counts