Ejemplo n.º 1
0
class TestSplitter(unittest.TestCase):
    """Unit tests for ``Splitter.split`` run against a small fixed word list."""

    def setUp(self):
        """Create a fresh ``Splitter`` and load the word list for each test."""
        self.t = Splitter()
        # All entries are unicode literals so the list has one consistent type.
        self.t.load([
            u"test", u"hest", u"parkering", u"billet", u"automat",
            u"universitet", u"forlag",
        ])

    def test_we_can_split_to_known_words(self):
        """Two loaded words written together split back into those words."""
        self.assertEqual([u"test", u"hest"], self.t.split(u"testhest"))

    def test_we_ignore_the_obvious(self):
        """A loaded word followed by one extra letter gives no split (None)."""
        # assertIsNone is stricter than comparing with ``==`` against None.
        self.assertIsNone(self.t.split(u"testr"))
        self.assertIsNone(self.t.split(u"tests"))

    def test_we_can_handle_some_conjugations(self):
        """Endings such as -er/-en and a linking "s" stay attached to their part."""
        self.assertEqual([u"test", u"hester"], self.t.split(u"testhester"))
        self.assertEqual([u"test", u"hesten"], self.t.split(u"testhesten"))
        self.assertEqual([u"testen", u"hesten"],
                         self.t.split(u"testenhesten"))
        self.assertEqual([u"parkerings", u"billet", u"automat"],
                         self.t.split(u"parkeringsbilletautomat"))
Ejemplo n.º 2
0
class TestSplitter(unittest.TestCase):
    """Unit tests for ``Splitter.split`` run against a small fixed word list."""

    def setUp(self):
        """Create a fresh ``Splitter`` and load the word list for each test."""
        self.t = Splitter()
        # All entries are unicode literals so the list has one consistent type.
        self.t.load([
            u"test", u"hest", u"parkering", u"billet", u"automat",
            u"universitet", u"forlag",
        ])

    def test_we_can_split_to_known_words(self):
        """Two loaded words written together split back into those words."""
        self.assertEqual([u"test", u"hest"], self.t.split(u"testhest"))

    def test_we_ignore_the_obvious(self):
        """A loaded word followed by one extra letter gives no split (None)."""
        # assertIsNone is stricter than comparing with ``==`` against None.
        self.assertIsNone(self.t.split(u"testr"))
        self.assertIsNone(self.t.split(u"tests"))

    def test_we_can_handle_some_conjugations(self):
        """Endings such as -er/-en and a linking "s" stay attached to their part."""
        self.assertEqual([u"test", u"hester"], self.t.split(u"testhester"))
        self.assertEqual([u"test", u"hesten"], self.t.split(u"testhesten"))
        self.assertEqual([u"testen", u"hesten"],
                         self.t.split(u"testenhesten"))
        self.assertEqual([u"parkerings", u"billet", u"automat"],
                         self.t.split(u"parkeringsbilletautomat"))
Ejemplo n.º 3
0
    def _extract_word(self, l):
        """Return the first two ';'-separated fields of ``l`` as a tuple.

        When ``self._is_number_prefix`` matches the line, its first three
        characters are skipped before splitting. The second field loses its
        final character and is then stripped of surrounding whitespace.

        Raises IndexError when the line contains no ';'.
        """
        # A matching prefix is assumed to occupy exactly three characters.
        remainder = l[3:] if self._is_number_prefix.match(l) else l
        fields = remainder.split(';')
        head, tail = fields[0], fields[1]
        # The dropped final character is presumably the line's trailing
        # newline -- TODO confirm against the input file format.
        return (head, tail[:-1].strip())

    def strip_accents(self, string):
        """Return ``string`` without acute, grave and tilde combining marks.

        The text is decomposed (NFD), the three combining marks are dropped,
        and the remainder is recomposed (NFC). Other marks are left in place.
        """
        removable = set()
        for mark_name in ('COMBINING ACUTE ACCENT',
                          'COMBINING GRAVE ACCENT',
                          'COMBINING TILDE'):
            removable.add(unicodedata.lookup(mark_name))
        decomposed = unicodedata.normalize('NFD', string)
        kept = ''.join(ch for ch in decomposed if ch not in removable)
        return unicodedata.normalize('NFC', kept)

# Script body: load the word list file (first argument) into a Splitter,
# then read the corpus file (second argument) and print every word for which
# the splitter returns a truthy result, together with that result.
if len(sys.argv) < 3:
    print("Usage: run.py RO2012.opslagsord.med.homnr.og.ordklasse.txt corpus.txt")
    # sys.exit instead of the site-provided ``exit`` helper, which is not
    # guaranteed to exist (e.g. when running with ``python -S``).
    sys.exit(-1)

p = Parser()
s = Splitter()

# Build the word list inside ``with`` so the file handle is closed promptly.
with io.open(sys.argv[1], 'r') as dictionary:
    # Falsy results from sb_word are dropped before loading.
    s.load([word for word in map(p.sb_word, dictionary) if word])

with io.open(sys.argv[2], 'r') as corpus:
    for line in corpus:
        line = line.strip()
        if not line:
            continue
        for w in line.split(' '):
            # Words are lower-cased and accent-stripped before splitting.
            w = p.strip_accents(w.lower())
            splits = s.split(w)
            if splits:
                print("%s (%s)" % (w, splits))
Ejemplo n.º 4
0
 def setUp(self):
     """Create the splitter under test and load its fixed word list."""
     self.t = Splitter()
     known_words = [
         u"test",
         u"hest",
         u"parkering",
         "billet",
         "automat",
         u"universitet",
         u'forlag',
     ]
     self.t.load(known_words)
Ejemplo n.º 5
0
 def setUp(self):
     """Create the splitter under test and load its fixed word list."""
     self.t = Splitter()
     vocabulary = [u"test", u"hest", u"parkering"]
     vocabulary += ["billet", "automat"]
     vocabulary += [u"universitet", u'forlag']
     self.t.load(vocabulary)