class TestSplitter(unittest.TestCase):
    """Unit tests for Splitter's Danish compound-word splitting."""

    def setUp(self):
        # Small fixed vocabulary of known words used by every test.
        self.t = Splitter()
        self.t.load([u"test", u"hest", u"parkering", u"billet",
                     u"automat", u"universitet", u"forlag"])

    def test_we_can_split_to_known_words(self):
        # A plain concatenation of two vocabulary words splits cleanly.
        self.assertEqual([u"test", u"hest"], self.t.split(u"testhest"))

    def test_we_ignore_the_obvious(self):
        # Trailing junk that is not itself a word must not produce a split.
        self.assertEqual(None, self.t.split(u"testr"))
        self.assertEqual(None, self.t.split(u"tests"))

    def test_we_can_handle_some_conjugations(self):
        # Inflected forms (-er, -en) and the linking -s- ("parkerings-")
        # are still recognized as compounds of vocabulary words.
        self.assertEqual([u"test", u"hester"], self.t.split(u"testhester"))
        self.assertEqual([u"test", u"hesten"], self.t.split(u"testhesten"))
        self.assertEqual([u"testen", u"hesten"], self.t.split(u"testenhesten"))
        self.assertEqual([u"parkerings", u"billet", u"automat"],
                         self.t.split(u"parkeringsbilletautomat"))
# NOTE(review): this class is a byte-for-byte duplicate of the TestSplitter
# definition earlier in the file and shadows it at import time, so only one
# copy of the tests ever runs -- one of the two definitions should be deleted.
class TestSplitter(unittest.TestCase):
    """Unit tests for Splitter's Danish compound-word splitting."""

    def setUp(self):
        # Small fixed vocabulary of known words used by every test.
        self.t = Splitter()
        self.t.load([u"test", u"hest", u"parkering", u"billet",
                     u"automat", u"universitet", u"forlag"])

    def test_we_can_split_to_known_words(self):
        # A plain concatenation of two vocabulary words splits cleanly.
        self.assertEqual([u"test", u"hest"], self.t.split(u"testhest"))

    def test_we_ignore_the_obvious(self):
        # Trailing junk that is not itself a word must not produce a split.
        self.assertEqual(None, self.t.split(u"testr"))
        self.assertEqual(None, self.t.split(u"tests"))

    def test_we_can_handle_some_conjugations(self):
        # Inflected forms (-er, -en) and the linking -s- ("parkerings-")
        # are still recognized as compounds of vocabulary words.
        self.assertEqual([u"test", u"hester"], self.t.split(u"testhester"))
        self.assertEqual([u"test", u"hesten"], self.t.split(u"testhesten"))
        self.assertEqual([u"testen", u"hesten"], self.t.split(u"testenhesten"))
        self.assertEqual([u"parkerings", u"billet", u"automat"],
                         self.t.split(u"parkeringsbilletautomat"))
def _extract_word(self, l):
    """Extract (word, tail) from one semicolon-separated dictionary line.

    If the line matches self._is_number_prefix (presumably a homograph
    number prefix -- TODO confirm against the input format), the first
    three characters are skipped before splitting on ';'.  Returns a
    tuple of the first field and the second field with its last
    character dropped and surrounding whitespace stripped.
    """
    if self._is_number_prefix.match(l):
        fields = l[3:].split(';')
    else:
        fields = l.split(';')
    return (fields[0], fields[1][:-1].strip())


def strip_accents(self, string):
    """Return *string* with acute, grave, and tilde accents removed.

    Decomposes to NFD so each accent becomes a separate combining
    character, drops the three targeted combining marks, and recomposes
    to NFC.  Other diacritics (e.g. the ring in 'å') are left intact.
    """
    accents = ('COMBINING ACUTE ACCENT',
               'COMBINING GRAVE ACCENT',
               'COMBINING TILDE')
    accents = set(map(unicodedata.lookup, accents))
    chars = [c for c in unicodedata.normalize('NFD', string)
             if c not in accents]
    return unicodedata.normalize('NFC', ''.join(chars))


def main():
    """Load a word list, then report compound splits found in a corpus.

    argv[1] is the dictionary file, argv[2] the corpus; each split
    compound is printed as "word ([parts])".
    """
    if len(sys.argv) < 3:
        print("Usage: run.py RO2012.opslagsord.med.homnr.og.ordklasse.txt corpus.txt")
        sys.exit(-1)
    p = Parser()
    s = Splitter()
    # 'with' ensures both input files are closed (the originals leaked them).
    with io.open(sys.argv[1], 'r') as word_file:
        s.load(filter(None, map(p.sb_word, word_file)))
    with io.open(sys.argv[2], 'r') as corpus:
        for l in map(unicode.strip, corpus):
            if l:
                for w in l.split(' '):
                    w = p.strip_accents(w.lower())
                    splits = s.split(w)
                    if splits:
                        print("%s (%s)" % (w, splits))


if __name__ == "__main__":
    main()