Esempio n. 1
0
# Omorfi MOOD values that have a direct UD Mood counterpart.
_MOOD_DIRECT = {
    'Ind': 'Ind',    # indicative
    'Imprt': 'Imp',  # imperative
    'Cond': 'Cnd',   # conditional
    'Pot': 'Pot',    # potential
}

# Omorfi defines the archaic optative (e.g. "kävellös") and eventive
# (e.g. "kävelleisin") moods, which are extremely rare (e.g. no
# occurrences in TDT). Should they ever appear, they are approximated
# as imperative and potential respectively, with a warning.
_MOOD_APPROXIMATED = {
    'Opt': ('Imp', 'mapping optative mood to imperative'),
    'Eve': ('Pot', 'mapping eventive mood to potential'),
}


def rewrite_MOOD(word):
    """Map the MOOD feature of `word` to a list of UD (name, value) pairs.

    The MOOD feature is indexed directly in `word.feat_map()`, so it
    must be present. Returns a one-element list [('Mood', X)] for
    known values and an empty list for unknown ones.
    """
    mood = word.feat_map()['MOOD']

    if mood in _MOOD_DIRECT:
        return [('Mood', _MOOD_DIRECT[mood])]

    if mood in _MOOD_APPROXIMATED:
        ud_mood, message = _MOOD_APPROXIMATED[mood]
        warn(message)
        return [('Mood', ud_mood)]

    # Unknown MOOD values produce no feature.
    return []
Esempio n. 2
0
 def setUp(self):
     """Start the server under test in a subprocess on a free port.

     Ports 8800-8899 are tried in order. For each port the server
     script is launched and polled for up to 100 * 0.1 seconds until
     it accepts connections. Does nothing when `self.skip` is set.

     Raises AssertionError if the server neither accepts connections
     nor exits within the polling window, or if no port in the range
     works.
     """
     if self.skip: return
     # Find a free port
     for port in range(8800, 8900):
         self.port = port
         # Start servertest.py in a subprocess
         cmd = [sys.executable, serverscript, self.server, str(port)]
         cmd += sys.argv[1:]  # pass cmdline arguments to subprocesses
         self.p = Popen(cmd, stdout=PIPE, stderr=PIPE)
         # Wait for the socket to accept connections
         for _ in range(100):
             time.sleep(0.1)
             # Accepts connections?
             if ping('127.0.0.1', port): return
             # Server died for some reason...
             if self.p.poll() is not None: break
         rv = self.p.poll()
         if rv is None:
             raise AssertionError("Server took to long to start up.")
         # Exit codes are integers and must be compared by value;
         # identity comparison only works by accident of int caching.
         if rv == 128:  # Import error
             tools.warn("Skipping %r test (ImportError)." % self.server)
             self.skip = True
             return
         if rv == 3:  # Port in use
             continue
     raise AssertionError("Could not find a free port to test server.")
Esempio n. 3
0
def add_VerbForm(word):
    """Return the UD VerbForm feature for `word` as a list of pairs.

    Non-verbs (cpostag not in VERB_TAGS) get no feature. Verbs are
    classified as infinitive (INF), participle (PCP) or finite (MOOD
    or PRS present); see
    https://github.com/TurkuNLP/UniversalFinnish/issues/28.
    A warning is issued and an empty list returned when no form can
    be assigned.
    """
    feats = word.feat_map()

    if word.cpostag not in VERB_TAGS:
        return []

    if 'INF' in feats:
        # infinitive: must not co-occur with the other form markers
        for other in ('PCP', 'PRS', 'MOOD'):
            assert other not in feats, 'INF and ' + other
        inf = feats['INF']
        if inf not in ('Inf1', 'Inf2', 'Inf3'):
            warn(u'unexpected INF value ' + inf)
            return []
        return [('VerbForm', 'Inf')]

    if 'PCP' in feats:
        # participle: must not co-occur with the other form markers
        for other in ('INF', 'PRS', 'MOOD'):
            assert other not in feats, 'PCP and ' + other
        return [('VerbForm', 'Part')]

    # Should be finite, check for some marker. Any non-infinitive,
    # non-participle verb is considered finite if it has either MOOD
    # or PRS.
    if 'MOOD' in feats or 'PRS' in feats:
        return [('VerbForm', 'Fin')]

    warn(u'failed to assign VerbForm to ' + unicode(word))
    return []
Esempio n. 4
0
 def setUp(self):
     """Launch the server under test in a subprocess on a free port.

     Tries ports 8800-8899 in order; for each, starts the server
     script and polls (100 times, 0.1 s apart) until it accepts
     connections. Returns immediately when `self.skip` is set.

     Raises AssertionError when the server is still running but not
     reachable after polling, or when every port in the range failed.
     """
     if self.skip: return
     # Find a free port
     for port in range(8800, 8900):
         self.port = port
         # Start servertest.py in a subprocess
         cmd = [sys.executable, serverscript, self.server, str(port)]
         cmd += sys.argv[1:] # pass cmdline arguments to subprocesses
         self.p = Popen(cmd, stdout=PIPE, stderr=PIPE)
         # Wait for the socket to accept connections
         for _ in range(100):
             time.sleep(0.1)
             # Accepts connections?
             if ping('127.0.0.1', port): return
             # Server died for some reason...
             if self.p.poll() is not None: break
         rv = self.p.poll()
         if rv is None:
             raise AssertionError("Server took to long to start up.")
         # Compare exit codes by value, not identity: `is` on integer
         # literals depends on interpreter caching.
         if rv == 128: # Import error
             tools.warn("Skipping %r test (ImportError)." % self.server)
             self.skip = True
             return
         if rv == 3: # Port in use
             continue
     raise AssertionError("Could not find a free port to test server.")
Esempio n. 5
0
# Pronoun SUBCAT values and the UD feature each one maps to.
# NOTE: UD defines Reflexive as a separate feature from PronType
# (http://universaldependencies.github.io/docs/u/feat/Reflex.html)
# TODO: consider adding PronType also?
_PRON_SUBCAT_FEATURE = {
    'Dem': ('PronType', 'Dem'),     # demonstrative
    'Pers': ('PronType', 'Prs'),    # personal
    'Rel': ('PronType', 'Rel'),     # relative
    'Indef': ('PronType', 'Ind'),   # indefinite
    'Interr': ('PronType', 'Int'),  # interrogative
    'Recipr': ('PronType', 'Rcp'),  # reciprocal
    'Refl': ('Reflex', 'Yes'),      # reflexive
}


def rewrite_PRON_SUBCAT(word):
    """Map a pronoun's SUBCAT feature to a list of UD (name, value) pairs.

    SUBCAT is indexed directly in `word.feat_map()`, so it must be
    present. Unknown values yield an empty list.
    """
    subcat = word.feat_map()['SUBCAT']

    if subcat in _PRON_SUBCAT_FEATURE:
        return [_PRON_SUBCAT_FEATURE[subcat]]

    if subcat == 'Qnt':
        # NOTE: UD does not define "quantifier" as a pronoun type, so
        # these are (tentatively) mapped to the closest corresponding
        # subcategory, indefinite pronouns.
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/37
        warn('mapping PRON SUBCAT ' + subcat + ' to Ind')
        return [('PronType', 'Ind')]

    return []
Esempio n. 6
0
 def test_json(self):
     # A dict returned from a route should be served as its JSON
     # serialization with the JSON content type.
     register = self.app.route('/')
     register(lambda: {'a': 1})
     try:
         expected_body = bottle.json_dumps({'a': 1})
         self.assertBody(expected_body)
         self.assertHeader('Content-Type','application/json')
     except ImportError:
         # An ImportError here is reported as a warning instead of
         # failing the test.
         warn("Skipping JSON tests.")
Esempio n. 7
0
 def test_json(self):
     # Route '/' returns a dict; the response body must equal the
     # JSON dump of that dict and carry the JSON content type.
     register = self.app.route('/')
     register(lambda: {'a': 1})
     try:
         expected_body = bottle.json_dumps({'a': 1})
         self.assertBody(expected_body)
         self.assertHeader('Content-Type','application/json')
     except ImportError:
         # ImportError is downgraded to a warning rather than a failure.
         warn("Skipping JSON tests.")
def rewrite_PRON_SUBCAT(word):
    """Translate a pronoun's SUBCAT value into UD features.

    Returns a list holding one (feature, value) pair, or an empty
    list when the SUBCAT value is not recognized. SUBCAT is read by
    direct indexing of `word.feat_map()` and so must be present.
    """
    subcat = word.feat_map()["SUBCAT"]

    # (Omorfi SUBCAT, UD PronType) pairs, checked in order.
    pron_types = (
        ("Dem", "Dem"),  # demonstrative
        ("Pers", "Prs"),  # personal
        ("Rel", "Rel"),  # relative
        ("Indef", "Ind"),  # indefinite
        ("Interr", "Int"),  # interrogative
        ("Recipr", "Rcp"),  # reciprocal
    )
    for omorfi, ud in pron_types:
        if subcat == omorfi:
            return [("PronType", ud)]

    if subcat == "Refl":  # reflexive
        # NOTE: UD defines Reflexive as a separate feature from PronType
        # (http://universaldependencies.github.io/docs/u/feat/Reflex.html)
        # TODO: consider adding PronType also?
        return [("Reflex", "Yes")]

    if subcat == "Qnt":
        # NOTE: UD does not define "quantifier" as a pronoun type, so
        # these are (tentatively) mapped to the closest corresponding
        # subcategory, indefinite pronouns.
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/37
        warn("mapping PRON SUBCAT " + subcat + " to Ind")
        return [("PronType", "Ind")]

    return []
def rewrite_SUBCAT(word):
    """Dispatch SUBCAT rewriting to the handler for the word's cpostag.

    `subcat_rewrite_func` is iterated as (tagset, handler) pairs; the
    first handler whose tagset contains `word.cpostag` is called and
    its result returned. If none matches, a warning is issued and an
    empty list is returned.
    """
    for tags, handler in subcat_rewrite_func:
        if word.cpostag in tags:
            break
    else:
        # No tagset covers this cpostag.
        warn(word.cpostag + " with SUBCAT")
        return []

    return handler(word)
Esempio n. 10
0
def add_VerbForm(word):
    """Derive the UD VerbForm feature for a verb.

    Returns [('VerbForm', 'Inf' | 'Part' | 'Fin')], or an empty list
    for non-verbs and for verbs whose form cannot be determined (a
    warning is issued in the latter case).
    (see https://github.com/TurkuNLP/UniversalFinnish/issues/28)
    """
    features = word.feat_map()

    if word.cpostag not in VERB_TAGS:
        return []

    if 'INF' in features:
        # infinitive
        assert 'PCP' not in features, 'INF and PCP'
        assert 'PRS' not in features, 'INF and PRS'
        assert 'MOOD' not in features, 'INF and MOOD'
        infinitive = features['INF']
        recognized = infinitive in ('Inf1', 'Inf2', 'Inf3')
        if not recognized:
            warn(u'unexpected INF value ' + infinitive)
        return [('VerbForm', 'Inf')] if recognized else []

    if 'PCP' in features:
        # participle
        assert 'INF' not in features, 'PCP and INF'
        assert 'PRS' not in features, 'PCP and PRS'
        assert 'MOOD' not in features, 'PCP and MOOD'
        return [('VerbForm', 'Part')]

    # Should be finite, check for some marker: any non-infinitive,
    # non-participle verb counts as finite if it has MOOD or PRS.
    if not ('MOOD' in features or 'PRS' in features):
        warn(u'failed to assign VerbForm to ' + unicode(word))
        return []

    return [('VerbForm', 'Fin')]
Esempio n. 11
0
def rewrite_SUBCAT(word):
    """Rewrite SUBCAT using the handler registered for the word's cpostag.

    Walks the (tagset, handler) pairs of `subcat_rewrite_func` and
    returns the result of the first handler whose tagset contains
    `word.cpostag`. Warns and returns an empty list if there is none.
    """
    for tags, handler in subcat_rewrite_func:
        if word.cpostag not in tags:
            continue
        return handler(word)

    # Fell through: no handler applies to this cpostag.
    warn(word.cpostag + ' with SUBCAT')
    return []
# Omorfi MOOD value -> (UD Mood value, warning text or None).
# Omorfi defines the archaic optative (e.g. "kävellös") and eventive
# (e.g. "kävelleisin") moods, which are extremely rare (e.g. no
# occurrences in TDT). Should they ever appear, they are approximated
# as imperative and potential respectively.
_MOOD_TABLE = {
    "Ind": ("Ind", None),  # indicative
    "Imprt": ("Imp", None),  # imperative
    "Cond": ("Cnd", None),  # conditional
    "Pot": ("Pot", None),  # potential
    "Opt": ("Imp", "mapping optative mood to imperative"),  # optative
    "Eve": ("Pot", "mapping eventive mood to potential"),  # eventive
}


def rewrite_MOOD(word):
    """Convert the word's MOOD feature into a UD Mood feature list.

    MOOD is read by direct indexing of `word.feat_map()`. Returns
    [("Mood", X)] for known values (warning first when the mapping is
    an approximation) and [] for unknown values.
    """
    entry = _MOOD_TABLE.get(word.feat_map()["MOOD"])
    if entry is None:
        return []

    ud_mood, note = entry
    if note is not None:
        warn(note)
    return [("Mood", ud_mood)]
Esempio n. 13
0
 def test_json_HTTPResponse(self):
     # A dict wrapped in an HTTPResponse (constructed with 500 as its
     # second argument) should still be served as JSON.
     register = self.app.route('/')
     register(lambda: py3web.HTTPResponse({'a': 1}, 500))
     try:
         expected_body = py3web.json_dumps({'a': 1})
         self.assertBody(expected_body)
         self.assertHeader('Content-Type','application/json')
     except ImportError:
         # ImportError is reported as a warning instead of a failure.
         warn("Skipping JSON tests.")
def rewrite_A(word):
    """Return the rewritten tag(s) for a word analysed as "A".

    Returns 'ADJ', except for the lemmas 'muu' and 'sama', which are
    reassigned to pronouns: a warning is issued and the pair
    ('PRON', 'Pron') is returned instead.
    (see https://github.com/TurkuNLP/UniversalFinnish/issues/66)
    """
    if word.lemma not in ('muu', 'sama'):
        return 'ADJ'

    warn('assigning %s to Pron/PRON' % word.form)
    return ('PRON', 'Pron')
Esempio n. 15
0
def rewrite_A(word):
    """Rewrite the tag of a word given an "A" analysis.

    Words whose lemma is 'muu' or 'sama' are reassigned by lemma to
    ('PRON', 'Pron') with a warning; everything else becomes 'ADJ'.
    (see https://github.com/TurkuNLP/UniversalFinnish/issues/66)
    """
    pronoun_like = word.lemma in ('muu', 'sama')
    if pronoun_like:
        warn('assigning %s to Pron/PRON' % word.form)
    return ('PRON', 'Pron') if pronoun_like else 'ADJ'
Esempio n. 16
0
# Omorfi TENSE value -> UD Tense value.
_TENSE_TO_UD = {
    'Prs': 'Pres',
    'Prt': 'Past',
}


def rewrite_TENSE(word):
    """Map the word's TENSE feature to a UD Tense feature list.

    Warns when the word's cpostag is not in VERB_TAGS (the mapping is
    still applied). Returns [('Tense', X)] for known values and []
    for unknown ones.
    """
    if word.cpostag not in VERB_TAGS:
        warn(word.cpostag + ' with TENSE')

    ud_tense = _TENSE_TO_UD.get(word.feat_map()['TENSE'])
    if ud_tense is None:
        return []
    return [('Tense', ud_tense)]
Esempio n. 17
0
# Omorfi VOICE value -> UD Voice value.
_VOICE_TO_UD = {
    'Act': 'Act',
    'Pass': 'Pass',
}


def rewrite_VOICE(word):
    """Map the word's VOICE feature to a UD Voice feature list.

    Warns when the word's cpostag is not in VERB_TAGS (the mapping is
    still applied). Returns [('Voice', X)] for known values and []
    for unknown ones.
    """
    if word.cpostag not in VERB_TAGS:
        warn(word.cpostag + ' with VOICE')

    ud_voice = _VOICE_TO_UD.get(word.feat_map()['VOICE'])
    if ud_voice is None:
        return []
    return [('Voice', ud_voice)]
def rewrite_VOICE(word):
    """Translate VOICE into a UD Voice feature list.

    A warning is issued for words whose cpostag is not in VERB_TAGS,
    but the value is mapped regardless. Unknown values give [].
    """
    if word.cpostag not in VERB_TAGS:
        warn(word.cpostag + " with VOICE")

    voice = word.feat_map()["VOICE"]
    for known in ("Act", "Pass"):
        if voice == known:
            return [("Voice", known)]
    return []
def rewrite_TENSE(word):
    """Translate TENSE into a UD Tense feature list.

    A warning is issued for words whose cpostag is not in VERB_TAGS,
    but the value is mapped regardless. Unknown values give [].
    """
    if word.cpostag not in VERB_TAGS:
        warn(word.cpostag + " with TENSE")

    tense = word.feat_map()["TENSE"]
    for omorfi, ud in (("Prs", "Pres"), ("Prt", "Past")):
        if tense == omorfi:
            return [("Tense", ud)]
    return []
Esempio n. 20
0
 def test_json_serialization_error(self):
     """
     Verify that 500 errors serializing dictionaries don't return
     content-type application/json
     """
     # The route returns a dict holding a set.
     register = self.app.route('/')
     register(lambda: {'a': set()})
     try:
         self.assertStatus(500)
         self.assertHeader('Content-Type','text/html; charset=UTF-8')
     except ImportError:
         # ImportError is reported as a warning instead of a failure.
         warn("Skipping JSON tests.")
Esempio n. 21
0
 def test_json_serialization_error(self):
     """
     Verify that 500 errors serializing dictionaries don't return
     content-type application/json
     """
     # The handler's dict contains a set.
     register = self.app.route('/')
     register(lambda: {'a': set()})
     try:
         self.assertStatus(500)
         self.assertHeader('Content-Type','text/html; charset=UTF-8')
     except ImportError:
         # ImportError is downgraded to a warning rather than a failure.
         warn("Skipping JSON tests.")
Esempio n. 22
0
def rewrite_Punct(word):
    """Return 'SYM' or 'PUNCT' for a word tagged as punctuation.

    The decision is made on the surface form via `is_symbol` and
    `is_punctuation`; forms recognized by neither are assigned 'SYM'
    with a warning.

    NOTE: likely not a final mapping, see
    https://github.com/TurkuNLP/UniversalFinnish/issues/1
    """
    form = word.form

    if is_symbol(form):
        # The two classifiers are expected to be mutually exclusive.
        assert not is_punctuation(form), 'internal error'
        return 'SYM'

    if not is_punctuation(form):
        warn(u'assigning SYM to unrecognized word ' + form)
        return 'SYM'

    assert not is_symbol(form), 'internal error'
    return 'PUNCT'
def rewrite_Punct(word):
    """Classify a punctuation-tagged word as 'SYM' or 'PUNCT'.

    Uses `is_symbol` and `is_punctuation` on the surface form. A form
    matching neither is warned about and treated as 'SYM'.

    NOTE: likely not a final mapping, see
    https://github.com/TurkuNLP/UniversalFinnish/issues/1
    """
    surface = word.form

    if is_symbol(surface):
        # Sanity check: a symbol must not also count as punctuation.
        assert not is_punctuation(surface), 'internal error'
        tag = 'SYM'
    elif is_punctuation(surface):
        # Sanity check: punctuation must not also count as a symbol.
        assert not is_symbol(surface), 'internal error'
        tag = 'PUNCT'
    else:
        warn(u'assigning SYM to unrecognized word ' + surface)
        tag = 'SYM'

    return tag
def rewrite_Pron(word):
    """Return the rewritten tag(s) for a word tagged Pron.

    Pro-adjectives such as "millainen" (lemma in
    `pro_adjective_lemmas`) are reassigned to ('ADJ', 'A') with a
    warning, see https://github.com/TurkuNLP/UniversalFinnish/issues/67.
    All other words become 'PRON'.
    """
    if word.lemma not in pro_adjective_lemmas:
        # NOTE: this is not a full mapping: some words tagged Pron
        # should map into DET instead. See
        # https://github.com/TurkuNLP/UniversalFinnish/issues/1,
        # https://github.com/TurkuNLP/UniversalFinnish/issues/27,
        # https://github.com/UniversalDependencies/docs/issues/97.
        # However, we're currently postponing this exception.
        return 'PRON'

    warn('assigning %s to A/ADJ' % word.form)
    return ('ADJ', 'A')
def rewrite_pos(sentence):
    """Rewrite the POS tags of every word in `sentence` in place.

    For each word from `sentence.words()`, the rewriter registered in
    `rewrite_func` under the word's cpostag is called with the word.
    If it returns a tuple, both cpostag and postag are assigned from
    it; otherwise only cpostag is assigned.

    Raises AssertionError (after a warning) if no rewriter is
    registered for a word's cpostag.
    """
    for w in sentence.words():
        # Only the table lookup is guarded, so that a KeyError raised
        # inside a rewriter is not misreported as an unknown cpostag.
        try:
            rewriter = rewrite_func[w.cpostag]
        except KeyError:
            warn(u'unexpected cpostag ' + w.cpostag)
            # Raised explicitly: `assert False` is stripped under -O,
            # which would let execution continue without a result.
            raise AssertionError('unexpected cpostag ' + w.cpostag)

        rewritten = rewriter(w)

        # if the rewriter returns a tuple, assign both cpostag and
        # postag; otherwise assign only cpostag
        if isinstance(rewritten, tuple):
            w.cpostag, w.postag = rewritten
        else:
            w.cpostag = rewritten
Esempio n. 26
0
def rewrite_pos(sentence):
    """Rewrite cpostag (and possibly postag) of each word in `sentence`.

    The rewriter for a word is looked up in `rewrite_func` by the
    word's current cpostag and called with the word. A tuple result
    is unpacked into (cpostag, postag); any other result replaces
    cpostag only.

    Raises AssertionError (after a warning) when the cpostag has no
    registered rewriter.
    """
    for w in sentence.words():
        # Guard the lookup only: a KeyError from within the rewriter
        # itself must propagate instead of being reported as an
        # unexpected cpostag.
        try:
            rewriter = rewrite_func[w.cpostag]
        except KeyError:
            warn(u'unexpected cpostag ' + w.cpostag)
            # Explicit raise, because `assert False` disappears under
            # -O and would leave `rewritten` unbound or stale.
            raise AssertionError('unexpected cpostag ' + w.cpostag)

        rewritten = rewriter(w)

        # tuple -> assign both cpostag and postag; otherwise cpostag only
        if isinstance(rewritten, tuple):
            w.cpostag, w.postag = rewritten
        else:
            w.cpostag = rewritten
Esempio n. 27
0
# Omorfi CMP value -> UD Degree value.
_CMP_TO_DEGREE = {
    'Comp': 'Cmp',
    'Pos': 'Pos',
    'Superl': 'Sup',
}


def rewrite_CMP(word):
    """Map the word's CMP feature to a UD Degree feature list.

    CMP is read by direct indexing of `word.feat_map()`. A warning is
    issued when the cpostag is not an adjective, verb or adverb tag,
    but the value is mapped regardless. Unknown values give [].
    """
    comparison = word.feat_map()['CMP']

    if word.cpostag not in (ADJ_TAGS | VERB_TAGS | ADV_TAGS):
        warn(word.cpostag + ' with CMP')

    degree = _CMP_TO_DEGREE.get(comparison)
    if degree is None:
        return []
    return [('Degree', degree)]
Esempio n. 28
0
def rewrite_Pron(word):
    """Rewrite the tag of a word tagged Pron.

    Returns ('ADJ', 'A') with a warning for pro-adjectives such as
    "millainen", identified by lemma membership in
    `pro_adjective_lemmas`
    (see https://github.com/TurkuNLP/UniversalFinnish/issues/67),
    and 'PRON' otherwise.
    """
    is_pro_adjective = word.lemma in pro_adjective_lemmas
    if is_pro_adjective:
        warn('assigning %s to A/ADJ' % word.form)

    # NOTE: this is not a full mapping: some words tagged Pron should
    # map into DET instead. See
    # https://github.com/TurkuNLP/UniversalFinnish/issues/1,
    # https://github.com/TurkuNLP/UniversalFinnish/issues/27,
    # https://github.com/UniversalDependencies/docs/issues/97.
    # However, we're currently postponing this exception.
    return ('ADJ', 'A') if is_pro_adjective else 'PRON'
def rewrite_CMP(word):
    """Translate CMP into a UD Degree feature list.

    Warns (without aborting) when the word's cpostag is outside the
    adjective, verb and adverb tag sets. Returns [("Degree", X)] for
    known CMP values and [] otherwise.
    """
    comparison = word.feat_map()["CMP"]

    expected_tags = ADJ_TAGS | VERB_TAGS | ADV_TAGS
    if word.cpostag not in expected_tags:
        warn(word.cpostag + " with CMP")

    for omorfi, ud in (("Comp", "Cmp"), ("Pos", "Pos"), ("Superl", "Sup")):
        if comparison == omorfi:
            return [("Degree", ud)]
    return []
Esempio n. 30
0
def add_SUBCAT_to_Pron(word):
    """Supply a SUBCAT feature for pronouns that lack one.

    Applies only to words whose postag is 'Pron' and whose feature
    map has no SUBCAT. The value is looked up by lemma in
    `Pron_SUBCAT_by_lemma`; a missing lemma produces a warning.

    Returns [('SUBCAT', value)] on success and [] otherwise.
    """
    feats = word.feat_map()

    needs_subcat = 'SUBCAT' not in feats and word.postag == 'Pron'
    if not needs_subcat:
        return []

    try:
        subcat = Pron_SUBCAT_by_lemma[word.lemma]
    except KeyError:
        warn(u'failed to assign SUBCAT to Pron')
        return []
    else:
        return [('SUBCAT', subcat)]
Esempio n. 31
0
def add_SUBCAT_to_Pron(word):
    """Assign SUBCAT to a pronoun that has none, based on its lemma.

    Words that already carry SUBCAT, or whose postag is not 'Pron',
    are left alone ([] is returned). Otherwise the lemma is looked up
    in `Pron_SUBCAT_by_lemma`; [('SUBCAT', value)] is returned on a
    hit, and a warning plus [] on a miss.
    """
    if 'SUBCAT' in word.feat_map():
        return []
    if word.postag != 'Pron':
        return []

    try:
        return [('SUBCAT', Pron_SUBCAT_by_lemma[word.lemma])]
    except KeyError:
        warn(u'failed to assign SUBCAT to Pron')
        return []
Esempio n. 32
0
    def tearDown(self):
        """Stop the server subprocess and check what it printed.

        A still-running process is sent SIGINT once, then SIGTERM
        repeatedly until it exits. Its stdout and stderr are then
        scanned line by line: lines containing 'warning'
        (case-insensitive) are forwarded to `tools.warn`, and the
        first line containing 'error' raises AssertionError. Both
        pipes are closed afterwards. Does nothing when `self.skip`
        is set.
        """
        if self.skip: return

        if self.p.poll() is None:
            os.kill(self.p.pid, signal.SIGINT)
            time.sleep(0.5)
        while self.p.poll() is None:
            os.kill(self.p.pid, signal.SIGTERM)
            time.sleep(1)

        streams = (self.p.stdout, self.p.stderr)
        try:
            for stream in streams:
                for line in stream:
                    if tob('warning') in line.lower():
                        tools.warn(line.strip().decode('utf8'))
                    elif tob('error') in line.lower():
                        raise AssertionError(line.strip().decode('utf8'))
        finally:
            # Release the pipe file objects even when an error line
            # aborts the scan.
            for stream in streams:
                stream.close()
Esempio n. 33
0
    def tearDown(self):
        """Shut down the server subprocess and inspect its output.

        If the process is still running it gets one SIGINT, followed
        by SIGTERM once per second until it has exited. Every line
        of its stdout and stderr is then checked: 'warning'
        (case-insensitive) lines go to `tools.warn`, and an 'error'
        line raises AssertionError. The pipes are closed in all
        cases. Returns immediately when `self.skip` is set.
        """
        if self.skip: return

        if self.p.poll() is None:
            os.kill(self.p.pid, signal.SIGINT)
            time.sleep(0.5)
        while self.p.poll() is None:
            os.kill(self.p.pid, signal.SIGTERM)
            time.sleep(1)

        streams = (self.p.stdout, self.p.stderr)
        try:
            for stream in streams:
                for line in stream:
                    if tob('warning') in line.lower():
                        tools.warn(line.strip().decode('utf8'))
                    elif tob('error') in line.lower():
                        raise AssertionError(line.strip().decode('utf8'))
        finally:
            # Close the pipes so their file objects are not leaked.
            for stream in streams:
                stream.close()
Esempio n. 34
0
 def tearDown(self):
     """Stop the server subprocess and check what it printed.

     SIGINT is sent up to 10 times with short, growing pauses
     (0.1*i seconds) and then up to 10 more times with longer ones
     (i seconds), stopping as soon as the process has exited. The
     process's stdout and stderr are then scanned: 'warning'
     (case-insensitive) lines go to `tools.warn`, and an 'error'
     line raises AssertionError. Both pipes are closed afterwards.
     Does nothing when `self.skip` is set.
     """
     if self.skip: return
     for i in range(10):
         if self.p.poll() is not None: break
         os.kill(self.p.pid, signal.SIGINT)
         time.sleep(0.1*i)
     # NOTE(review): this second round re-sends SIGINT rather than
     # escalating to a stronger signal -- confirm this is intended.
     for i in range(10):
         if self.p.poll() is not None: break
         os.kill(self.p.pid, signal.SIGINT)
         time.sleep(i)
     streams = (self.p.stdout, self.p.stderr)
     try:
         for stream in streams:
             for line in stream:
                 if tob('warning') in line.lower():
                     tools.warn(line.strip().decode('utf8'))
                 elif tob('error') in line.lower():
                     raise AssertionError(line.strip().decode('utf8'))
     finally:
         # Release the pipe file objects even when an error line
         # aborts the scan.
         for stream in streams:
             stream.close()
Esempio n. 35
0
 def tearDown(self):
     """Shut down the server subprocess and inspect its output.

     Two rounds of up to 10 SIGINTs each are sent, the first with
     pauses of 0.1*i seconds and the second with pauses of i
     seconds; each round stops once the process has exited. The
     process's stdout and stderr are then read line by line:
     'warning' (case-insensitive) lines are passed to `tools.warn`
     and an 'error' line raises AssertionError. The pipes are closed
     in all cases. Returns immediately when `self.skip` is set.
     """
     if self.skip: return
     for i in range(10):
         if self.p.poll() is not None: break
         os.kill(self.p.pid, signal.SIGINT)
         time.sleep(0.1 * i)
     # NOTE(review): the second round repeats SIGINT instead of
     # escalating to a stronger signal -- confirm this is intended.
     for i in range(10):
         if self.p.poll() is not None: break
         os.kill(self.p.pid, signal.SIGINT)
         time.sleep(i)
     streams = (self.p.stdout, self.p.stderr)
     try:
         for stream in streams:
             for line in stream:
                 if tob('warning') in line.lower():
                     tools.warn(line.strip().decode('utf8'))
                 elif tob('error') in line.lower():
                     raise AssertionError(line.strip().decode('utf8'))
     finally:
         # Close the pipes so their file objects are not leaked.
         for stream in streams:
             stream.close()
Esempio n. 36
0
def add_Person(word):
    """Return a Person feature for personal pronouns.

    Personal pronouns lack Person in Omorfi analyses, so it is looked
    up by lemma in `person_by_lemma`. Only words with cpostag 'PRON'
    and SUBCAT 'Pers' are considered.

    Returns [('Person', p)] on success; [] for other words, and []
    with a warning when the lemma has no known person.
    """
    feats = word.feat_map()

    if word.cpostag != 'PRON' or feats.get('SUBCAT') != 'Pers':
        return []

    person = person_by_lemma.get(word.lemma, None)
    if person is None:
        warn(u'missing person for pronoun lemma ' + word.lemma)
        return []

    return [('Person', person)]
def rewrite_NUM(word):
    """Map the word's NUM feature to a UD Number feature list.

    A warning is issued when the word's cpostag is outside the verb,
    noun, adjective and pronoun tag sets; the value is mapped
    regardless. NUM is read by direct indexing of the feature map and
    so must be present.

    Returns [("Number", "Sing")] for "Sg", [("Number", "Plur")] for
    "Pl" and [] for any other value.

    Raises AssertionError if the word also carries PRS.
    """
    fmap = word.feat_map()

    if word.cpostag not in (VERB_TAGS | NOUN_TAGS | ADJ_TAGS | PRON_TAGS):
        warn(word.cpostag + " with NUM")

    # Both PRS and NUM would generate redundant Number features
    assert "PRS" not in fmap, "PRS and NUM"

    # Reuse the feature map fetched above instead of building it again.
    value = fmap["NUM"]
    if value == "Sg":
        return [("Number", "Sing")]
    elif value == "Pl":
        return [("Number", "Plur")]
    else:
        # Unknown NUM values produce no feature.
        return []
Esempio n. 38
0
def rewrite_NUM(word):
    """Translate NUM into a UD Number feature list.

    Warns (without aborting) when the word's cpostag is not a verb,
    noun, adjective or pronoun tag. NUM is indexed directly in the
    feature map, so it must be present.

    Returns [('Number', 'Sing')] for 'Sg', [('Number', 'Plur')] for
    'Pl' and [] for anything else.

    Raises AssertionError if the word also carries PRS.
    """
    fmap = word.feat_map()

    if word.cpostag not in (VERB_TAGS | NOUN_TAGS | ADJ_TAGS | PRON_TAGS):
        warn(word.cpostag + ' with NUM')

    # Both PRS and NUM would generate redundant Number features
    assert 'PRS' not in fmap, 'PRS and NUM'

    # Reuse the feature map fetched above instead of building it again.
    value = fmap['NUM']
    if value == 'Sg':
        return [('Number', 'Sing')]
    elif value == 'Pl':
        return [('Number', 'Plur')]
    else:
        # Unknown NUM values produce no feature.
        return []
Esempio n. 39
0
def add_Person(word):
    """Assign Person to personal pronouns, which lack it in Omorfi analyses.

    Applies to words with cpostag 'PRON' whose SUBCAT is 'Pers'; the
    person is taken from `person_by_lemma`. Returns [('Person', p)],
    or [] when the word does not qualify or (with a warning) when
    its lemma is not in the table.
    """
    feats = word.feat_map()

    is_personal_pronoun = (
        word.cpostag == 'PRON' and feats.get('SUBCAT') == 'Pers'
    )
    if not is_personal_pronoun:
        return []

    person = person_by_lemma.get(word.lemma, None)
    if person is not None:
        return [('Person', person)]

    warn(u'missing person for pronoun lemma ' + word.lemma)
    return []
Esempio n. 40
0
# Omorfi CASE values that map onto a UD Case value.
_CASE_TO_UD = {
    'Abe': 'Abe',  # abessive
    'Abl': 'Abl',  # ablative
    'Acc': 'Acc',  # accusative
    'Ade': 'Ade',  # adessive
    'All': 'All',  # allative
    'Com': 'Com',  # comitative
    'Ela': 'Ela',  # elative
    'Ess': 'Ess',  # essive
    'Gen': 'Gen',  # genitive
    'Ill': 'Ill',  # illative
    'Ine': 'Ine',  # inessive
    'Ins': 'Ins',  # instructive
    'Nom': 'Nom',  # nominative
    'Par': 'Par',  # partitive
    'Tra': 'Tra',  # translative
}

# Omorfi CASE values for which no Case feature is generated, with the
# warning issued for each.
# Dis (distributive): see
# https://github.com/TurkuNLP/UniversalFinnish/issues/55
# Lat (lative): see
# https://code.google.com/p/omorfi/wiki/AnalysisPossibilities,
# http://scripta.kotus.fi/visk/sisallys.php?p=120 Huom 1,
# https://github.com/TurkuNLP/UniversalFinnish/issues/44
_CASE_DROPPED = {
    'Dis': 'not generating Case Dis',
    'Lat': 'not generating Case Lat',
}


def rewrite_CASE(word):
    """Map the word's CASE feature to a UD Case feature list.

    CASE is read by direct indexing of `word.feat_map()`. A warning
    is issued when the word is of a kind not expected to take case;
    the value is mapped regardless.

    Returns [('Case', X)] for mapped values and [] for dropped or
    unknown ones.
    """
    feats = word.feat_map()
    case = feats['CASE']

    # any nouns, pronouns, adjectives and numbers can take case, as
    # can non-finite verbs (infinitives and participles), others
    # can't.
    if word.cpostag not in (NOUN_TAGS | PRON_TAGS | ADJ_TAGS | NUM_TAGS):
        if word.cpostag not in VERB_TAGS:
            warn(word.cpostag + ' with CASE')
        elif 'INF' not in feats and 'PCP' not in feats:
            warn('non-INF/PCP ' + word.cpostag + ' with CASE')

    if case in _CASE_TO_UD:
        return [('Case', _CASE_TO_UD[case])]

    if case in _CASE_DROPPED:
        warn(_CASE_DROPPED[case])

    return []
def rewrite_CASE(word):
    """Translate CASE into a UD Case feature list.

    Warns (without aborting) when the word's cpostag is not expected
    to carry case. Returns [("Case", X)] for the cases kept as-is,
    [] with a warning for Dis and Lat, and [] for unknown values.
    """
    feats = word.feat_map()
    case = feats["CASE"]

    # any nouns, pronouns, adjectives and numbers can take case, as
    # can non-finite verbs (infinitives and participles), others
    # can't.
    case_bearing_tags = NOUN_TAGS | PRON_TAGS | ADJ_TAGS | NUM_TAGS
    if word.cpostag not in case_bearing_tags:
        if word.cpostag not in VERB_TAGS:
            warn(word.cpostag + " with CASE")
        elif "INF" not in feats and "PCP" not in feats:
            warn("non-INF/PCP " + word.cpostag + " with CASE")

    kept = (
        "Abe",  # abessive
        "Abl",  # ablative
        "Acc",  # accusative
        "Ade",  # adessive
        "All",  # allative
        "Com",  # comitative
        "Ela",  # elative
        "Ess",  # essive
        "Gen",  # genitive
        "Ill",  # illative
        "Ine",  # inessive
        "Ins",  # instructive
        "Nom",  # nominative
        "Par",  # partitive
        "Tra",  # translative
    )
    for ud_case in kept:
        if case == ud_case:
            return [("Case", ud_case)]

    dropped = (
        # distributive, see
        # https://github.com/TurkuNLP/UniversalFinnish/issues/55
        ("Dis", "not generating Case Dis"),
        # lative, see
        # https://code.google.com/p/omorfi/wiki/AnalysisPossibilities,
        # http://scripta.kotus.fi/visk/sisallys.php?p=120 Huom 1,
        # https://github.com/TurkuNLP/UniversalFinnish/issues/44
        ("Lat", "not generating Case Lat"),
    )
    for omorfi_case, message in dropped:
        if case == omorfi_case:
            warn(message)
            break

    return []
def remove_Inf5(word):
    """Strip the INF=Inf5 feature from `word`, if present.

    Omorfi generates Inf5 *very* rarely (once in TDT) and
    inconsistently, and the "maisillaan" form termed the "5th
    infinitive" is not considered one by ISK
    (http://scripta.kotus.fi/visk/sisallys.php?p=120).

    Words without INF, or with another INF value, are left untouched.
    A warning is issued if the word's cpostag is not in VERB_TAGS;
    the feature is removed either way.
    """
    if word.feat_map().get('INF') != 'Inf5':
        return

    if word.cpostag not in VERB_TAGS:
        warn('unexpected CPOSTAG with INF=Inf5: ' + word.cpostag)

    word.remove_feat('INF', 'Inf5')
def remove_Inf5(word):
    """Remove the Inf5 value of the INF feature from a word.

    Omorfi generates Inf5 *very* rarely (once in TDT) and
    inconsistently, and the "maisillaan" form termed the "5th
    infinitive" is not considered as such by ISK
    (http://scripta.kotus.fi/visk/sisallys.php?p=120).

    Nothing happens unless the word's INF feature equals 'Inf5'. If
    the word's cpostag is not in VERB_TAGS a warning is issued before
    the feature is removed.
    """
    feats = word.feat_map()

    has_inf5 = 'INF' in feats and feats['INF'] == 'Inf5'
    if has_inf5:
        if word.cpostag not in VERB_TAGS:
            warn('unexpected CPOSTAG with INF=Inf5: ' + word.cpostag)
        word.remove_feat('INF', 'Inf5')
Esempio n. 44
0
# Adjective SUBCAT values that map onto UD NumType.
# NOTE: UD NumType applies also to adjectives, see
# http://universaldependencies.github.io/docs/u/feat/NumType.html
_ADJ_SUBCAT_NUMTYPE = {
    'Card': 'Card',
    'Ord': 'Ord',
}


def rewrite_ADJ_SUBCAT(word):
    """Map an adjective's SUBCAT feature to a UD feature list.

    Returns [('NumType', X)] for Card and Ord. Interr, Rel and Pfx
    are deliberately not mapped (a warning is issued); they and any
    unknown value yield [].
    """
    subcat = word.feat_map()['SUBCAT']

    if subcat in _ADJ_SUBCAT_NUMTYPE:
        return [('NumType', _ADJ_SUBCAT_NUMTYPE[subcat])]

    if subcat == 'Interr' or subcat == 'Rel':
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/61
        warn('not mapping ADJ SUBCAT ' + subcat)
    elif subcat == 'Pfx':
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/60
        warn('not mapping ADJ SUBCAT Pfx')

    # Unmapped and unknown values alike produce no feature.
    return []
def rewrite_ADJ_SUBCAT(word):
    """Translate an adjective's SUBCAT into UD features.

    Card and Ord become a NumType feature; Interr, Rel and Pfx are
    warned about and not mapped; unknown values are ignored. Every
    case other than Card/Ord returns [].
    """
    subcat = word.feat_map()["SUBCAT"]

    # NOTE: UD NumType applies also to adjectives, see
    # http://universaldependencies.github.io/docs/u/feat/NumType.html
    for num_type in ("Card", "Ord"):
        if subcat == num_type:
            return [("NumType", num_type)]

    if subcat == "Pfx":
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/60
        warn("not mapping ADJ SUBCAT Pfx")
    elif subcat == "Interr" or subcat == "Rel":
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/61
        warn("not mapping ADJ SUBCAT " + subcat)

    return []
Esempio n. 46
0
    def tearDown(self):
        """Stop the server subprocess and check what it printed.

        A still-running process is sent SIGINT, then SIGTERM, then
        SIGKILL repeatedly (with a warning each time) until it has
        exited. All lines of its stdout and stderr are then read and
        the pipes closed. Lines containing 'warning'
        (case-insensitive) are forwarded to `tools.warn`; the first
        line containing 'error' raises AssertionError. Does nothing
        when `self.skip` is set.
        """
        if self.skip: return

        if self.p.poll() is None:
            os.kill(self.p.pid, signal.SIGINT)
            time.sleep(0.5)
        if self.p.poll() is None:
            os.kill(self.p.pid, signal.SIGTERM)
            time.sleep(0.5)
        while self.p.poll() is None:
            tools.warn("Trying to kill server %r with pid %d." %
                       (self.server, self.p.pid))
            os.kill(self.p.pid, signal.SIGKILL)
            time.sleep(1)

        streams = (self.p.stdout, self.p.stderr)
        try:
            lines = [line for stream in streams for line in stream]
        finally:
            # Everything needed has been read (or reading failed):
            # release the pipe file objects.
            for stream in streams:
                stream.close()

        for line in lines:
            if tob('warning') in line.lower():
                tools.warn(line.strip().decode('utf8'))
            elif tob('error') in line.lower():
                raise AssertionError(line.strip().decode('utf8'))
def remove_Adv_CASE(word):
    """Remove the CASE=Dis feature from adverbs.

    Omorfi is only expected to assign the CASE value Dis
    (distributive) to adverbs, and only inconsistently. Distributive
    is not recognized as a Finnish case by ISK
    (http://scripta.kotus.fi/visk/sisallys.php?p=81), so it is
    removed altogether, giving a consistent treatment where no adverb
    has case.
    https://github.com/TurkuNLP/UniversalFinnish/issues/17

    Only words with cpostag 'ADV' that carry CASE are affected. Any
    CASE value other than Dis is left in place and warned about.
    """
    if word.cpostag != 'ADV':
        return

    feats = word.feat_map()
    if 'CASE' not in feats:
        return

    case = feats['CASE']
    if case != 'Dis':
        warn('unexpected CASE value for ADV: ' + case)
        return

    word.remove_feat('CASE', 'Dis')
def remove_Adv_CASE(word):
    """Drop the distributive CASE feature from an adverb.

    Omorfi is only expected to assign CASE=Dis (distributive) to
    adverbs, and only inconsistently; distributive is not recognized
    as a Finnish case by ISK
    (http://scripta.kotus.fi/visk/sisallys.php?p=81). Removing it
    results in a consistent treatment where no adverb has case.
    https://github.com/TurkuNLP/UniversalFinnish/issues/17

    Words that are not 'ADV' or have no CASE are untouched. An
    adverb with any other CASE value triggers a warning and keeps
    the feature.
    """
    if word.cpostag == 'ADV':
        feats = word.feat_map()
        if 'CASE' in feats:
            case = feats['CASE']
            if case == 'Dis':
                word.remove_feat('CASE', 'Dis')
            else:
                warn('unexpected CASE value for ADV: ' + case)
Esempio n. 49
0
def rewrite_NOUN_SUBCAT(word):
    """Return the (name, value) feature pairs for a noun's SUBCAT value.

    In the initial CoNLL-U conversion implementation the only common
    noun SUBCAT value was Prop (proper noun), which has already been
    mapped in rewrite-pos.py. Acro and Abbr have been encountered since
    and are mapped the same way as ACRO=Yes and ABBR=Yes.

    Returns [('Abbr', 'Yes')] for Acro/Abbr and an empty list for
    everything else. Prop is only sanity-checked against the word's
    cpostag; Pfx is reported through warn() and left unmapped.
    """
    subcat = word.feat_map()['SUBCAT']

    if subcat in ('Acro', 'Abbr'):
        return [('Abbr', 'Yes')]

    if subcat == 'Prop':
        assert word.cpostag == 'PROPN', 'internal error'
    elif subcat == 'Pfx':
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/60
        warn('not mapping NOUN SUBCAT Pfx')

    # Prop, Pfx and any unrecognised value contribute no features.
    return []
def rewrite_NOUN_SUBCAT(word):
    """Return the (name, value) feature pairs for a noun's SUBCAT value.

    In the initial CoNLL-U conversion implementation the only common
    noun SUBCAT value was Prop (proper noun), which has already been
    mapped in rewrite-pos.py. Acro and Abbr have been encountered since
    and are mapped the same way as ACRO=Yes and ABBR=Yes.

    Returns [("Abbr", "Yes")] for Acro/Abbr and an empty list for
    everything else. Prop is only sanity-checked against the word's
    cpostag; Pfx is reported through warn() and left unmapped.
    """
    subcat = word.feat_map()["SUBCAT"]

    if subcat in ("Acro", "Abbr"):
        return [("Abbr", "Yes")]

    if subcat == "Prop":
        assert word.cpostag == "PROPN", "internal error"
    elif subcat == "Pfx":
        # see https://github.com/TurkuNLP/UniversalFinnish/issues/60
        warn("not mapping NOUN SUBCAT Pfx")

    # Prop, Pfx and any unrecognised value contribute no features.
    return []
def rewrite_Num(word):
    """Choose the POS tag ('NUM' or 'ADJ') for a numeral word.

    An explicit SUBCAT decides directly: Card gives 'NUM', Ord gives
    'ADJ'. Without one of those, the surface form is classified with
    numtype() and the result is used instead; an unclassifiable form
    defaults to 'NUM' after a warning.
    """
    subcat = word.feat_map().get('SUBCAT')
    if subcat == 'Card':
        return 'NUM'
    if subcat == 'Ord':
        return 'ADJ'

    # surface form-based heuristics
    kind = numtype(word.form)
    if kind == CARDINAL:
        return 'NUM'
    if kind == ORDINAL:
        # not quite sure about this, gives e.g. 1./ADJ
        warn('assigning ADJ to "ordinal": ' + word.form)
        return 'ADJ'
    if kind is None:
        warn(word.cpostag + u' without SUBCAT Card or Ord:' + word.form)
        # default to NUM (TODO: avoid guessing)
        return 'NUM'

    # numtype() produced something other than CARDINAL, ORDINAL or None.
    assert False, 'internal error'
def remove_Inf1_CASE_Lat(word):
    """Drop the CASE=Lat (lative) feature from a word, in place.

    Omorfi follows a dated analysis where the base form of the
    A-infinitive (Infinitive 1) is termed lative. Lative is not
    recognized by ISK (http://scripta.kotus.fi/visk/sisallys.php?p=81,
    see also http://scripta.kotus.fi/visk/sisallys.php?p=120 Huom 1),
    so the case is removed. No information is lost, as the Lat value
    fully coincides with Inf1 and no other case in Omorfi.
    https://github.com/TurkuNLP/UniversalFinnish/issues/44

    Words without CASE, or with any other CASE value, are left as they
    are. A non-verb carrying CASE=Lat triggers a warning but the
    feature is removed all the same.
    """
    if word.feat_map().get('CASE') != 'Lat':
        return

    tag = word.cpostag
    if tag not in VERB_TAGS:
        warn('unexpected CPOSTAG with CASE=Lat: ' + tag)

    word.remove_feat('CASE', 'Lat')
def rewrite_PRS(word):
    """Translate the PRS feature into Person and Number feature pairs.

    PRS values Sg1..Sg3 and Pl1..Pl3 become a two-element list such as
    [("Person", "1"), ("Number", "Sing")]. Any other value is reported
    through warn() and yields an empty list. A warning is also issued
    when the word's cpostag is not one of VERB_TAGS.
    """
    if word.cpostag not in VERB_TAGS:
        warn(word.cpostag + " with PRS")

    # Both PRS and NUM would generate redundant Number features
    assert "NUM" not in word.feat_map()

    # PRS value -> (Person, Number)
    person_number = {
        "Sg1": ("1", "Sing"),
        "Sg2": ("2", "Sing"),
        "Sg3": ("3", "Sing"),
        "Pl1": ("1", "Plur"),
        "Pl2": ("2", "Plur"),
        "Pl3": ("3", "Plur"),
    }

    value = word.feat_map()["PRS"]
    if value not in person_number:
        warn("unmapped PRS " + word.feat_map()["PRS"])
        return []

    person, number = person_number[value]
    # Build a new list on every call so callers may mutate the result.
    return [("Person", person), ("Number", number)]
def remove_Inf1_CASE_Lat(word):
    """Drop the CASE=Lat (lative) feature from a word, in place.

    Omorfi follows a dated analysis where the base form of the
    A-infinitive (Infinitive 1) is termed lative. Lative is not
    recognized by ISK (http://scripta.kotus.fi/visk/sisallys.php?p=81,
    see also http://scripta.kotus.fi/visk/sisallys.php?p=120 Huom 1),
    so the case is removed. No information is lost, as the Lat value
    fully coincides with Inf1 and no other case in Omorfi.
    https://github.com/TurkuNLP/UniversalFinnish/issues/44

    Words without CASE, or with any other CASE value, are left as they
    are. A non-verb carrying CASE=Lat triggers a warning but the
    feature is removed all the same.
    """
    if word.feat_map().get('CASE') != 'Lat':
        return

    tag = word.cpostag
    if tag not in VERB_TAGS:
        warn('unexpected CPOSTAG with CASE=Lat: ' + tag)

    word.remove_feat('CASE', 'Lat')
Esempio n. 55
0
def rewrite_PRS(word):
    """Translate the PRS feature into Person and Number feature pairs.

    PRS values Sg1..Sg3 and Pl1..Pl3 become a two-element list such as
    [('Person', '1'), ('Number', 'Sing')]. Any other value is reported
    through warn() and yields an empty list. A warning is also issued
    when the word's cpostag is not one of VERB_TAGS.
    """
    if word.cpostag not in VERB_TAGS:
        warn(word.cpostag + ' with PRS')

    # Both PRS and NUM would generate redundant Number features
    assert 'NUM' not in word.feat_map()

    # PRS value -> (Person, Number)
    person_number = {
        'Sg1': ('1', 'Sing'),
        'Sg2': ('2', 'Sing'),
        'Sg3': ('3', 'Sing'),
        'Pl1': ('1', 'Plur'),
        'Pl2': ('2', 'Plur'),
        'Pl3': ('3', 'Plur'),
    }

    value = word.feat_map()['PRS']
    if value not in person_number:
        warn('unmapped PRS '+word.feat_map()['PRS'])
        return []

    person, number = person_number[value]
    # Build a new list on every call so callers may mutate the result.
    return [('Person', person), ('Number', number)]
Esempio n. 56
0
def rewrite_Num(word):
    """Choose the POS tag ('NUM' or 'ADJ') for a numeral word.

    An explicit SUBCAT decides directly: Card gives 'NUM', Ord gives
    'ADJ'. Without one of those, the surface form is classified with
    numtype() and the result is used instead; an unclassifiable form
    defaults to 'NUM' after a warning.
    """
    subcat = word.feat_map().get('SUBCAT')
    if subcat == 'Card':
        return 'NUM'
    if subcat == 'Ord':
        return 'ADJ'

    # surface form-based heuristics
    kind = numtype(word.form)
    if kind == CARDINAL:
        return 'NUM'
    if kind == ORDINAL:
        # not quite sure about this, gives e.g. 1./ADJ
        warn('assigning ADJ to "ordinal": ' + word.form)
        return 'ADJ'
    if kind is None:
        warn(word.cpostag + u' without SUBCAT Card or Ord:' + word.form)
        # default to NUM (TODO: avoid guessing)
        return 'NUM'

    # numtype() produced something other than CARDINAL, ORDINAL or None.
    assert False, 'internal error'
Esempio n. 57
0
    def tearDown(self):
        """Stop the server subprocess and check what it printed.

        Does nothing when ``self.skip`` is set. Otherwise the process
        ``self.p`` is sent SIGINT, then SIGTERM (half a second apart),
        and finally SIGKILL once per second until ``poll()`` reports an
        exit status. Afterwards every line from its stdout and stderr
        is scanned case-insensitively: a line containing "warning" is
        forwarded to ``tools.warn``; any other line containing "error"
        fails the test.

        Raises:
            AssertionError: if the server wrote a line that contains
                "error" but not "warning".
        """
        if self.skip:
            return

        # Escalate gently: only signal while the process is still
        # running (poll() returns None until it has exited).
        for sig in (signal.SIGINT, signal.SIGTERM):
            if self.p.poll() is None:
                os.kill(self.p.pid, sig)
                time.sleep(0.5)
        while self.p.poll() is None:
            tools.warn("Trying to kill server %r with pid %d." %
                       (self.server, self.p.pid))
            os.kill(self.p.pid, signal.SIGKILL)
            time.sleep(1)

        # Collect all output first (stdout before stderr), then scan it.
        lines = [
            line for stream in (self.p.stdout, self.p.stderr)
            for line in stream
        ]
        for line in lines:
            lowered = line.lower()
            if tob('warning') in lowered:
                tools.warn(line.strip().decode('utf8'))
            elif tob('error') in lowered:
                raise AssertionError(line.strip().decode('utf8'))
Esempio n. 58
0
        t = MakoTemplate(name='mako_inherit',
                         lookup=['./views/']).render(var='v')
        self.assertEqual('o\ncvc\no\n', t)
        t = MakoTemplate('<%inherit file="mako_base.tpl"/>\nc${var}c\n',
                         lookup=['./views/']).render(var='v')
        self.assertEqual('o\ncvc\no\n', t)
        t = MakoTemplate('<%inherit file="views/mako_base.tpl"/>\nc${var}c\n',
                         lookup=['./']).render(var='v')
        self.assertEqual('o\ncvc\no\n', t)

    def test_template_shortcut(self):
        # mako_template() renders an inline template source in one call.
        rendered = mako_template('start ${var} end', var='middle')
        expected = touni('start middle end')
        self.assertEqual(expected, rendered)

    def test_view_decorator(self):
        # The decorated callable returns the template variables; calling
        # it is expected to yield the rendered text.
        @mako_view('start ${var} end')
        def test():
            return {'var': 'middle'}

        expected = touni('start middle end')
        self.assertEqual(expected, test())


# Probe for the optional mako package. If it cannot be imported, emit a
# warning and delete TestMakoTemplate from this module's namespace
# (presumably so the test runner does not collect it -- confirm).
try:
    import mako
except ImportError:
    warn("WARNING: No Mako template support. Skipping tests.")
    del TestMakoTemplate

# Run the tests when this file is executed directly as a script.
if __name__ == '__main__':  #pragma: no cover
    unittest.main()
Esempio n. 59
0
    def test_inherit(self):
        """ Templates: Mako lookup and inheritance """
        expected = 'o\ncvc\no\n'

        # Template located by name through the lookup path.
        by_name = MakoTemplate(name='mako_inherit', lookup=['./views/'])
        self.assertEqual(expected, by_name.render(var='v'))

        # Inline source inheriting from a base found via the lookup path.
        inline = MakoTemplate(
            '<%inherit file="mako_base.tpl"/>\nc${var}c\n',
            lookup=['./views/'])
        self.assertEqual(expected, inline.render(var='v'))

        # Same, with the base addressed relative to the parent directory.
        nested = MakoTemplate(
            '<%inherit file="views/mako_base.tpl"/>\nc${var}c\n',
            lookup=['./'])
        self.assertEqual(expected, nested.render(var='v'))

    def test_template_shortcut(self):
        # mako_template() renders an inline template source in one call.
        rendered = mako_template('start ${var} end', var='middle')
        expected = touni('start middle end')
        self.assertEqual(expected, rendered)

    def test_view_decorator(self):
        # The decorated callable returns the template variables; calling
        # it is expected to yield the rendered text.
        @mako_view('start ${var} end')
        def test():
            return {'var': 'middle'}

        expected = touni('start middle end')
        self.assertEqual(expected, test())


# Probe for the optional mako package. If it cannot be imported, emit a
# warning and delete TestMakoTemplate from this module's namespace
# (presumably so the test runner does not collect it -- confirm).
try:
  import mako
except ImportError:
  warn("No Mako template support. Skipping tests.")
  del TestMakoTemplate

# Run the tests when this file is executed directly as a script.
if __name__ == '__main__': #pragma: no cover
    unittest.main()

Esempio n. 60
0
def rewrite_DRV(word):
    """Map the DRV (derivation) feature to a ('Derivation', X) pair.

    For a recognised value 'Der_<suffix>' the result is a one-element
    list [('Derivation', '<Suffix>')], with the suffix capitalised
    (e.g. 'Der_minen' -> 'Minen'). When the word's cpostag is not among
    the tags expected for that suffix, a warning is issued but the
    feature is still returned. An unrecognised value is reported
    through warn() and yields an empty list.

    References for the individual suffixes:
      -minen, -u: http://scripta.kotus.fi/visk/sisallys.php?p=221,
                  https://github.com/TurkuNLP/UniversalFinnish/issues/21
      -sti:       http://scripta.kotus.fi/visk/sisallys.php?p=371
      -inen:      http://scripta.kotus.fi/visk/sisallys.php?p=261
      -lainen:    http://scripta.kotus.fi/visk/sisallys.php?p=190
      -ja:        http://scripta.kotus.fi/visk/sisallys.php?p=252
      -ton:       http://scripta.kotus.fi/visk/sisallys.php?p=292
      -llinen:    http://scripta.kotus.fi/visk/sisallys.php?p=276
    """
    value = word.feat_map()['DRV']

    # Pick the set of coarse POS tags the derivation is expected on.
    if value in ('Der_minen', 'Der_ja', 'Der_vs', 'Der_u', 'Der_tar'):
        # Noun-forming suffixes, e.g. "valmistaminen", "oppija",
        # "toimivuus", "lopettelu", "suojelijatar".
        expected_tags = NOUN_TAGS
    elif value in ('Der_sti', 'Der_ttain'):
        # Adverb-forming suffixes, e.g. "pysyvästi", "lajeittain".
        expected_tags = ADV_TAGS
    elif value in ('Der_inen', 'Der_ton', 'Der_llinen'):
        # Adjective-forming suffixes, e.g. "omenainen", "voimaton",
        # "vaunullinen". "-inen" can also produce nouns, but for Omorfi
        # apparently only adjectives.
        expected_tags = ADJ_TAGS
    elif value == 'Der_lainen':
        # e.g. "Turkulainen"; accepted on both adjectives and nouns.
        expected_tags = ADJ_TAGS | NOUN_TAGS
    elif value == 'Der_ttaa':
        # Verb-forming suffix, e.g. "vaivaannuttaa".
        expected_tags = VERB_TAGS
    else:
        warn('unknown DRV value ' + value)
        return []

    if word.cpostag not in expected_tags:
        warn(word.cpostag + ' with ' + value)

    # 'Der_minen' -> 'Minen', 'Der_u' -> 'U', ...
    suffix = value[len('Der_'):]
    return [('Derivation', suffix.capitalize())]