Example #1
0
    def test_word_equality(self):
        """
        Words that differ only in their tone markings are not equal.

        Both words are built from the same segments, gloss and category;
        only the last group inside ``^{...}`` differs (``56`` versus ``65``).
        """
        first = make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N')
        second = make_word('koka-nu-po^{12.3.4.65}', 'PART-B-C', 'N')
        self.assertNotEqual(first, second)
Example #2
0
 def test_word_dict_key(self):
     """
     Check that words work as dictionary keys.

     Every store and lookup builds a brand-new word, so the test only
     passes when separately constructed but identical words reach the
     same dictionary entry.
     """
     def word(ipa):
         # Always construct a fresh object; an instance is never reused.
         return make_word(ipa, 'PART-B-C', 'N')

     tally = {}
     tally[word('koka-nu-po^{12.3.4.56}')] = 1
     tally[word('koka-nu-po^{12.3.4.56}')] += 1
     tally[word('koke-nu-po^{12.3.4.56}')] = 1

     self.assertEqual(tally[word('koka-nu-po^{12.3.4.56}')], 2)
     self.assertEqual(tally[word('koke-nu-po^{12.3.4.56}')], 1)
 def test_word_dict_key(self):
   """
   A word can serve as a dictionary key.

   Two spellings are used ("koka..." and "koke..."); each access creates
   a new word object rather than reusing an earlier one.
   """
   koka = 'koka-nu-po^{12.3.4.56}'
   koke = 'koke-nu-po^{12.3.4.56}'
   gloss, category = 'PART-B-C', 'N'

   counts = {}
   counts[make_word(koka, gloss, category)] = 1
   counts[make_word(koka, gloss, category)] += 1
   counts[make_word(koke, gloss, category)] = 1

   self.assertEqual(counts[make_word(koka, gloss, category)], 2)
   self.assertEqual(counts[make_word(koke, gloss, category)], 1)
Example #4
0
 def test_word_parsing(self):
     """
     Check that make_word gives the same Word as building one by hand.

     Each case pairs the result of make_word with a Word assembled
     directly from make_morphemes and make_syllables.
     """
     cases = [
         (
             make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N'),
             Word(
                 make_morphemes('koka-nu-po', 'PART-B-C'),
                 make_syllables('kokanupo', '12.3.4.56'),
                 'N',
             ),
         ),
         (
             make_word('a^{1}', 'A', 'V'),
             Word(make_morphemes('a', 'A'), make_syllables('a', '1'), 'V'),
         ),
     ]
     # All cases are built first; comparisons happen afterwards.
     for actual, expected in cases:
         self.assertEqual(actual, expected)
Example #5
0
 def test_word_complete_morphemes(self):
     """
     Check what iter_complete_morphemes yields for a handful of words.

     Each expected entry is a (morpheme, tuple of syllables) pair; the
     last word is expected to yield nothing at all.
     """
     def pair(ipa, gloss, tones):
         # One expected item: the morpheme plus its syllables as a tuple.
         return (make_morpheme(ipa, gloss),
                 tuple(make_syllables(ipa, tones)))

     cases = [
         (make_word('koka-n-o^{12.3.4}', 'A-B-C', 'N'),
          [pair('koka', 'A', '12.3')]),
         (make_word('bo^{1}', 'A', 'N'),
          [pair('bo', 'A', '1')]),
         (make_word('b-o-kana-p-o^{1.2.2.3}', 'PART-PART-C-D-E', 'N'),
          [pair('kana', 'C', '2.2')]),
         (make_word('b-o-p-o^{1.2}', 'PART-A-B-C', 'N'),
          []),
     ]
     for word, expected in cases:
         self.assertEqual(list(word.iter_complete_morphemes()), expected)
  def test_word_equality(self):
    """
    Words whose tone markings differ do not compare equal.

    The two words share segments, gloss and category; the only difference
    is the last group inside ``^{...}`` (``56`` versus ``65``).
    """
    tones_56 = make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N')
    tones_65 = make_word('koka-nu-po^{12.3.4.65}', 'PART-B-C', 'N')
    self.assertNotEqual(tones_56, tones_65)
 def test_word_parsing(self):
   """
   make_word must produce a Word equal to one constructed manually.

   The manual Words are put together from make_morphemes and
   make_syllables with the same segments, tones and category.
   """
   long_parsed = make_word('koka-nu-po^{12.3.4.56}', 'PART-B-C', 'N')
   long_manual = Word(
       make_morphemes('koka-nu-po', 'PART-B-C'),
       make_syllables('kokanupo', '12.3.4.56'),
       'N',
   )

   short_parsed = make_word('a^{1}', 'A', 'V')
   short_manual = Word(make_morphemes('a', 'A'), make_syllables('a', '1'), 'V')

   # Everything is constructed above; only the comparisons happen here.
   pairs = ((long_parsed, long_manual), (short_parsed, short_manual))
   for parsed_word, manual_word in pairs:
     self.assertEqual(parsed_word, manual_word)
 def test_word_complete_morphemes(self):
   """
   Compare iter_complete_morphemes output against hand-built lists.

   Expected items are (morpheme, tuple of syllables) pairs. The final
   case expects an empty result.
   """
   cases = []
   cases.append((
       make_word('koka-n-o^{12.3.4}', 'A-B-C', 'N'),
       [(make_morpheme('koka', 'A'), tuple(make_syllables('koka', '12.3')))],
   ))
   cases.append((
       make_word('bo^{1}', 'A', 'N'),
       [(make_morpheme('bo', 'A'), tuple(make_syllables('bo', '1')))],
   ))
   cases.append((
       make_word('b-o-kana-p-o^{1.2.2.3}', 'PART-PART-C-D-E', 'N'),
       [(make_morpheme('kana', 'C'), tuple(make_syllables('kana', '2.2')))],
   ))
   cases.append((
       make_word('b-o-p-o^{1.2}', 'PART-A-B-C', 'N'),
       [],
   ))

   # All words are built before the first comparison is made.
   for word, complete in cases:
     self.assertEqual(list(word.iter_complete_morphemes()), complete)
Example #9
0
 def test_bad_word_tone(self):
     """
     make_word raises BadIPATone for every malformed tone marking below.
     """
     malformed = (
         'cat123',
         'cat^{}',
         'cat^{abc}',
         'cat^1',
         'cat^{1',
         'cat^1}',
         'cat^{a^bc}',
         'cat^{a^bc}^',
     )
     for ipa in malformed:
         # assertRaises invokes make_word with the remaining arguments.
         self.assertRaises(BadIPATone, make_word, ipa, 'gloss', 'category')
 def test_bad_word_tone(self):
   """
   Badly formed tone markings must make make_word raise BadIPATone.
   """
   bad_inputs = [
       'cat123', 'cat^{}', 'cat^{abc}', 'cat^1',
       'cat^{1', 'cat^1}', 'cat^{a^bc}', 'cat^{a^bc}^',
   ]
   gloss, category = 'gloss', 'category'
   for bad_ipa in bad_inputs:
     # The callable and its arguments are handed to assertRaises to call.
     self.assertRaises(BadIPATone, make_word, bad_ipa, gloss, category)
def load_word_counts(filename):
  """
  Reads in a file and returns a dictionary of words mapped to counts.

  Rows come from ``csv_rows(filename)`` and are read through their
  "count", "IPA", "Gloss", "Text" and "Category" fields. Rows whose IPA
  contains '*' are skipped. The IPA (with parentheses removed) and the
  gloss are each split on '/' and paired up; every pair is turned into a
  word with ``make_word`` and the row's count is added to that word.

  A row that fails with WordParseError, or with an IndexError whose first
  argument is one of the skip characters, is reported on stdout and the
  remainder of that row is dropped. Any other exception raised while
  building a row's words is reported with its line number and re-raised.

  Returns a defaultdict from word to summed count; missing words read as 0.
  """
  # First-argument values of an IndexError that mark a row as skippable
  # rather than fatal. Built once; it does not change between rows.
  skipword_characters = {'?'}
  word_counts = defaultdict(int)

  # Numbering starts at 2 -- presumably line 1 of the file is a header row.
  for line_number, raw_row in enumerate(csv_rows(filename), 2):
    count = int(raw_row["count"])
    ipa = raw_row["IPA"]
    if '*' in ipa:
      continue

    # Fixes random badness.. hopefully doesn't hide anything?
    mod_ipa = ipa.replace('(', '').replace(')', '')

    # Work around a passage with an error in it:
    gloss = raw_row["Gloss"] or raw_row["Text"]

    category = raw_row["Category"]

    try:
      for i, g in izip(mod_ipa.split('/'), gloss.split('/')):
        word_counts[make_word(i, g, category)] += count
    except WordParseError as e:
      print((u"Error on line %d: %s [%s || %s]" %
             (line_number, repr(e), ipa, gloss)).encode('utf-8'))
    except IndexError as e:
      # An IndexError raised without arguments has nothing to inspect;
      # treat it as fatal instead of failing inside this handler.
      unknown_index = e.args[0] if e.args else None
      if unknown_index not in skipword_characters:
        print("FATAL ERROR ON LINE %d" % line_number)
        raise
      print((u"Bad char on line %d: %s [%s || %s]" %
             (line_number, repr(e), ipa, gloss)).encode('utf-8'))
    except:
      # Deliberately broad: only reports the line, then re-raises unchanged.
      print("FATAL ERROR ON LINE %d" % line_number)
      raise
  return word_counts