Example #1
0
class TestRetagger(unittest.TestCase):
    """Exercise ``Retagger.retag`` on a sample of (surface form, POS tag) pairs."""

    def setUp(self):
        """Build a fresh ``Retagger`` before each test."""
        self.r = Retagger()

    def tearDown(self):
        """Drop the reference to the ``Retagger`` created in ``setUp``."""
        self.r = None

    def test(self):
        """Check the tags returned by ``retag`` for each sample word.

        Where the full result is pinned, the returned list is compared
        exactly; otherwise only the presence of specific tags is checked.
        ``assertIn`` is used for the membership checks so that a failure
        reports both the expected tag and the list actually returned.
        """
        ag = self.r.retag("ag", "Sa")
        self.assertIn("ASP", ag)
        self.assertEqual(self.r.retag("a'", "Sa"), ["ASP"])
        agus = self.r.retag("agus", "Cc")
        self.assertIn("CONJ", agus)
        air = self.r.retag("air", "Sp")
        self.assertIn("PP", air)
        self.assertIn("P", air)
        self.assertEqual(self.r.retag("droch", "Ar"), ["DET"]) # not really
        comma = self.r.retag(",", "Fi")
        self.assertIn("PUNC", comma)
        fullstop = self.r.retag(".", "Fe")
        self.assertIn("PUNC", fullstop)
        self.assertEqual(self.r.retag("le", "Sp"), ["P"])
        self.assertEqual(self.r.retag("gun", "Qa"), ["GU"])
        self.assertEqual(self.r.retag("dìreach", "Rg"), ["ADV"])
        self.assertEqual(self.r.retag("Comhairle", "Ncsdf"), ["N"])
        self.assertEqual(self.r.retag("galain", "Ncsfn"), ["N"])
        self.assertEqual(self.r.retag("an", "Tdsf"), ["DET"])
        self.assertEqual(self.r.retag("na", "Tdsfg"), ["DETNMOD"])
        self.assertEqual(self.r.retag("[1]", "Xsc"), ["ADVPRE"])
        radh = self.r.retag("ràdh", "Nv")
        self.assertIn("VPROP", radh)
        rinn = self.r.retag("rinn", "V-s")
        self.assertIn("TRANS", rinn)
        tha = self.r.retag("tha", "V-p")
        self.assertIn("BIPP", tha)
        self.assertIn("BIPROG", tha)
Example #2
0
 def setUp(self):
     """Create a new ``Retagger`` and store it on ``self.r`` for the test to use."""
     fresh_retagger = Retagger()
     self.r = fresh_retagger
Example #3
0
    else:
        return s.replace("&", "&")

# Load the pickled corpus named by the first command-line argument.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this script on corpus files from a trusted source.
# The with-block closes the file even if unpickling raises.
with open(sys.argv[1], 'rb') as brownfile:
    corpus = pickle.load(brownfile)
# The output file (second command-line argument) is not closed in this
# section. NOTE(review): presumably it is written to and closed further
# down the script -- confirm.
output = open(sys.argv[2], 'w')
# features
with open("resources/features.txt") as features_file:
    output.writelines(features_file)
# type-changing and type-raising rules
with open("resources/rules.txt") as rules_file:
    output.writelines(rules_file)
retagger = Retagger()
typer = Typer()
# Sets are used so that repeated family/word declarations collapse into one.
families = set()
words = set()
# assumes a single list rather than a list of lists (need to think about this)
for surface, pos in corpus:
    # Entries with an empty POS tag are skipped.
    if pos == "":
        continue
    tags = retagger.retag(surface, pos)
    for tag in tags:
        newtagtype = typer.type(surface, pos, tag)
        newtag = newtagtype[0]
        # Named entry_type rather than ``type`` so the builtin is not shadowed
        # for the rest of the module.
        entry_type = newtagtype[1]
        families.add("family %s { entry: %s; }" % (newtag, entry_type))
        words.add('word "%s_%s":%s; # %s' % (tidyword(surface), newtag, newtag, pos))

for family in sorted(families):