def test_getUnicodeStemsWithUnicode(self):
    """Check getUnicodeStems on a single token containing non-ASCII letters.

    Every stem returned is encoded to UTF-8 before comparison; the
    expected list holds both the accented spellings and their unaccented
    counterparts.
    """
    tokens = "*(ande-)stād-(ī-tu-)".split()
    encoded = []
    for stem in utils.getUnicodeStems(tokens):
        encoded.append(stem.encode("utf-8"))
    expected = [
        'and',
        'ande-stad-i-tu-',
        'ande-stād-ī-tu-',
        'i',
        'stad',
        'stād',
        'tu',
        'ī',
    ]
    self.assertEqual(encoded, expected)
def run(self):
    """Copy the input CSV to the output CSV, filling the keyword columns.

    Calls the parent class's ``run`` first, then reads each row from
    ``self.inFilename`` and writes it to ``self.outFilename`` using the
    field list of ``collection.ProtoCelticDictionaryV1``.  For every row
    the ``pcl`` and ``eng`` cells are split on whitespace and used to
    fill ``pcl-keywords``, ``eng-keywords``, ``pcl-metaphone`` and
    ``eng-metaphone`` (each JSON-encoded); ``see-also`` is reset to an
    empty string.

    Raises:
        Exception: any error raised while building the metaphone
            columns for a row.  The failure is logged together with the
            row and then re-raised, so a row lacking its metaphone
            fields is never written.
    """
    # Function-scope import, so this block does not rely on a
    # module-level ``import logging`` being present.
    import logging

    super(AddProtoCelticKeywords, self).run()
    reader = unicsv.UnicodeReader(self.inFilename)
    fieldnames = collection.ProtoCelticDictionaryV1().fields
    writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
    writer.writeheader()
    for row in reader:
        pclOrig = row["pcl"].split()
        engOrig = row["eng"].split()
        pcl = utils.getUnicodeStems(pclOrig)
        eng = utils.getStems(engOrig)
        row["see-also"] = ""
        # NOTE(review): the keyword columns run the stemmers a second
        # time over the already-stemmed lists; kept exactly as before --
        # confirm whether the double pass is intentional.
        row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
        row["eng-keywords"] = json.dumps(utils.getStems(eng))
        # Metaphones are computed over the stems plus the original tokens.
        pcl = pcl + pclOrig
        eng = eng + engOrig
        try:
            row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
            row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
        except Exception:
            # Previously this dropped into pdb, which stalls or aborts
            # unattended runs; log the offending row and propagate.
            logging.getLogger(__name__).exception(
                "Failed to compute metaphones for row %r", row)
            raise
        writer.writerow(row)
def run(self):
    """Copy the input CSV to the output CSV, filling the keyword columns.

    Calls the parent class's ``run`` first, then reads each row from
    ``self.inFilename`` and writes it to ``self.outFilename`` using the
    field list of ``collection.ProtoCelticDictionaryV1``.  For every row
    the ``pcl`` and ``eng`` cells are split on whitespace and used to
    fill ``pcl-keywords``, ``eng-keywords``, ``pcl-metaphone`` and
    ``eng-metaphone`` (each JSON-encoded); ``see-also`` is reset to an
    empty string.

    Raises:
        Exception: any error raised while building the metaphone
            columns for a row.  The failure is logged together with the
            row and then re-raised, so a row lacking its metaphone
            fields is never written.
    """
    # Function-scope import, so this block does not rely on a
    # module-level ``import logging`` being present.
    import logging

    super(AddProtoCelticKeywords, self).run()
    reader = unicsv.UnicodeReader(self.inFilename)
    fieldnames = collection.ProtoCelticDictionaryV1().fields
    writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
    writer.writeheader()
    for row in reader:
        pclOrig = row["pcl"].split()
        engOrig = row["eng"].split()
        pcl = utils.getUnicodeStems(pclOrig)
        eng = utils.getStems(engOrig)
        row["see-also"] = ""
        # NOTE(review): the keyword columns run the stemmers a second
        # time over the already-stemmed lists; kept exactly as before --
        # confirm whether the double pass is intentional.
        row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
        row["eng-keywords"] = json.dumps(utils.getStems(eng))
        # Metaphones are computed over the stems plus the original tokens.
        pcl = pcl + pclOrig
        eng = eng + engOrig
        try:
            row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
            row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
        except Exception:
            # Previously this dropped into pdb, which stalls or aborts
            # unattended runs; log the offending row and propagate.
            logging.getLogger(__name__).exception(
                "Failed to compute metaphones for row %r", row)
            raise
        writer.writerow(row)
def test_getUnicodeStemsWithPunctuation(self):
    """Check getUnicodeStems on a token containing punctuation.

    The input token carries an asterisk, parentheses, hyphens and a
    slash; the expected stems contain only letters and hyphens.
    """
    tokens = "*(o)bb-nod-e/o".split()
    expected = ['eo', 'nod', 'obb', 'obb-nod-eo']
    self.assertEqual(utils.getUnicodeStems(tokens), expected)