Exemple #1
0
 def test_getUnicodeStemsWithUnicode(self):
     # Feed a headword containing macron vowels, parentheses and hyphens
     # through utils.getUnicodeStems and compare the UTF-8 encoded results
     # against the expected list, which holds both the accented forms and
     # their unaccented counterparts.
     words = "*(ande-)stād-(ī-tu-)".split()
     encoded = []
     for stem in utils.getUnicodeStems(words):
         encoded.append(stem.encode("utf-8"))
     expected = [
         'and', 'ande-stad-i-tu-', 'ande-stād-ī-tu-', 'i', 'stad', 'stād',
         'tu', 'ī']
     self.assertEqual(encoded, expected)
Exemple #2
0
 def run(self):
     """Rewrite the input rows with keyword and metaphone columns added.

     Rows are read with ``unicsv.UnicodeReader`` from ``self.inFilename`` and
     written with ``unicsv.UnicodeWriter`` to ``self.outFilename``, using the
     field list of ``collection.ProtoCelticDictionaryV1`` as the header.

     For each row this sets:
       * ``see-also`` to an empty string,
       * ``pcl-keywords`` / ``eng-keywords`` to JSON-encoded stem lists,
       * ``pcl-metaphone`` / ``eng-metaphone`` to the JSON-encoded result of
         ``utils.getMetaphones`` over the stems plus the original tokens.

     Raises:
         Exception: any error raised while computing or serializing the
             metaphones is logged together with the row and re-raised.
     """
     super(AddProtoCelticKeywords, self).run()
     reader = unicsv.UnicodeReader(self.inFilename)
     fieldnames = collection.ProtoCelticDictionaryV1().fields
     writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
     writer.writeheader()
     for row in reader:
         pclOrig = row["pcl"].split()
         engOrig = row["eng"].split()
         pcl = utils.getUnicodeStems(pclOrig)
         eng = utils.getStems(engOrig)
         row["see-also"] = ""
         # NOTE(review): the keyword columns stem the already-stemmed lists
         # a second time; presumably the helpers are idempotent -- confirm.
         row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
         row["eng-keywords"] = json.dumps(utils.getStems(eng))
         # Metaphones are derived from the stems followed by the raw tokens.
         pcl = pcl + pclOrig
         eng = eng + engOrig
         try:
             row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
             row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
         except Exception:
             # This handler used to drop into an interactive debugger, which
             # stalls or aborts unattended runs. Record which row failed and
             # let the error propagate instead of writing a partial row.
             import logging
             logging.exception(
                 "Failed to compute metaphones for row %r", row)
             raise
         writer.writerow(row)
Exemple #3
0
 def run(self):
     """Rewrite the input rows with keyword and metaphone columns added.

     Rows are read with ``unicsv.UnicodeReader`` from ``self.inFilename`` and
     written with ``unicsv.UnicodeWriter`` to ``self.outFilename``, using the
     field list of ``collection.ProtoCelticDictionaryV1`` as the header.

     For each row this sets:
       * ``see-also`` to an empty string,
       * ``pcl-keywords`` / ``eng-keywords`` to JSON-encoded stem lists,
       * ``pcl-metaphone`` / ``eng-metaphone`` to the JSON-encoded result of
         ``utils.getMetaphones`` over the stems plus the original tokens.

     Raises:
         Exception: any error raised while computing or serializing the
             metaphones is logged together with the row and re-raised.
     """
     super(AddProtoCelticKeywords, self).run()
     reader = unicsv.UnicodeReader(self.inFilename)
     fieldnames = collection.ProtoCelticDictionaryV1().fields
     writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
     writer.writeheader()
     for row in reader:
         pclOrig = row["pcl"].split()
         engOrig = row["eng"].split()
         pcl = utils.getUnicodeStems(pclOrig)
         eng = utils.getStems(engOrig)
         row["see-also"] = ""
         # NOTE(review): the keyword columns stem the already-stemmed lists
         # a second time; presumably the helpers are idempotent -- confirm.
         row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
         row["eng-keywords"] = json.dumps(utils.getStems(eng))
         # Metaphones are derived from the stems followed by the raw tokens.
         pcl = pcl + pclOrig
         eng = eng + engOrig
         try:
             row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
             row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
         except Exception:
             # This handler used to drop into an interactive debugger, which
             # stalls or aborts unattended runs. Record which row failed and
             # let the error propagate instead of writing a partial row.
             import logging
             logging.exception(
                 "Failed to compute metaphones for row %r", row)
             raise
         writer.writerow(row)
Exemple #4
0
 def test_getUnicodeStemsWithPunctuation(self):
     # A headword carrying an asterisk, parentheses, hyphens and a slash is
     # passed to utils.getUnicodeStems; the result must equal the expected
     # list, none of whose items contain the asterisk, parentheses or slash.
     tokens = "*(o)bb-nod-e/o".split()
     expected = ['eo', 'nod', 'obb', 'obb-nod-eo']
     self.assertEqual(utils.getUnicodeStems(tokens), expected)