Esempio n. 1
0
def format_lexc_segments(wordmap):
    '''
    Format lexc entry lines that pair a stub-marked analysis with the stub.

    Stores ``lexc_escape(wordmap['stub']) + '{STUB}'`` in
    ``wordmap['analysis']`` and returns one
    ``analysis:stub<TAB>paradigm<TAB>;`` line per item of
    ``wordmap['new_paras']``, joined with newlines (empty string when there
    are no paradigms).
    '''
    wordmap['analysis'] = lexc_escape(wordmap['stub']) + '{STUB}'
    escaped_stub = lexc_escape(wordmap['stub'])
    entry_lines = [
        "%s:%s\t%s\t;" % (wordmap['analysis'], escaped_stub, paradigm)
        for paradigm in wordmap['new_paras']
    ]
    return "\n".join(entry_lines)
Esempio n. 2
0
def format_continuation_lexc_omor(anals, surf, cont, format):
    '''
    Format one lexc continuation line with omor-style analysis tags.

    Parameters:
        anals: pipe-separated analysis tag string.
        surf: surface string; it is lexc-escaped for the output.
        cont: name of the continuation lexicon.
        format: passed through unchanged to ``format_tag_omor``.

    Returns:
        ``"<analysis>:<escaped surface>\\t<cont> ;\\n"``.
    '''
    # Derivation tags that are collapsed into the corresponding participle
    # analysis so that they get full inflection.
    collapsed_derivations = {
        'Dnut': 'Vact|Cnut',    # DRV=NUT -> PCP=NUT (active)
        'Dtu': 'Vpss|Cnut',     # DRV=TU -> PCP=NUT (passive)
        'Dva': 'Vact|Cva',      # DRV=VA -> PCP=VA (active)
        'Dtava': 'Vpss|Cva',    # DRV=TAVA -> PCP=VA (passive)
        'Dma': 'Cma',           # DRV=MA -> PCP=AGENT
        'Dmaton': 'Cmaton',     # DRV=MATON -> PCP=NEG
    }
    participle_tags = ('Cnut', 'Cva', 'Cma', 'Cmaton')

    pieces = []
    # Digit continuations without a BACK/FRONT marker carry the surface
    # string on the analysis side as well.
    if 'DIGITS_' in cont and 'BACK' not in cont and 'FRONT' not in cont:
        pieces.append(lexc_escape(surf))
        if anals and anals != 'LEMMA-START':
            pieces.append(']')

    if anals in collapsed_derivations:
        anals = collapsed_derivations[anals]
    elif any(ptag in anals for ptag in participle_tags) and \
            anals.endswith(('Npl', 'Nsg')):
        # participle analyses that end in a number tag also get Xnom
        anals = anals + '|Xnom'

    for tag in anals.split('|'):
        pieces.append(format_tag_omor(tag, format))
    return "%s:%s\t%s ;\n" % ("".join(pieces), lexc_escape(surf), cont)
Esempio n. 3
0
def format_continuation_lexc_ftb3(anals, surf, cont):
    '''
    Format one lexc continuation line with FTB3-style analysis.

    The analysis side starts as ``format_analysis_lexc_ftb3(anals)`` and is
    then adjusted according to markers found in the continuation name
    ``cont``; the surface side is the lexc-escaped ``surf``.

    Returns:
        ``"<analysis>:<escaped surface>\\t<cont> ;\\n"``.
    '''
    analysis = format_analysis_lexc_ftb3(anals)
    if 'COMPOUND' in cont:
        # XXX: there was += before
        # The analysis is replaced by the surface with boundaries removed.
        analysis = surf.replace(morph_boundary, '').replace(deriv_boundary, '')
    # NOTE(review): `and` binds tighter than `or`, so the C**T/POSS exclusion
    # only applies to the FRONT case; the parentheses below spell out that
    # existing precedence. Confirm whether BACK should be excluded too.
    elif 'NUM_' in cont and (
            'BACK' in cont
            or ('FRONT' in cont
                and 'C**T' not in cont and 'POSS' not in cont)):
        # The boundary-free surface is appended after the tags.
        analysis += surf.replace(morph_boundary, '').replace(deriv_boundary, '')
    elif 'DIGITS_' in cont and 'BACK' not in cont and 'FRONT' not in cont:
        # The escaped surface is placed in front of the tags.
        analysis = lexc_escape(surf) + analysis
    return "%s:%s\t%s ;\n" % (analysis, lexc_escape(surf), cont)
Esempio n. 4
0
def format_tag_apertium(stuff):
    '''
    Map an internal tag name to its lexc-escaped apertium representation.

    Returns:
        - ``""`` for an empty ``stuff``;
        - the ``stuff2monodix`` value unchanged when it is one of
          ``+ - # 0`` or empty;
        - the escaped value followed by ``%>`` when it starts with ``+``;
        - otherwise the escaped value wrapped in ``%<`` ... ``%>``.
        A tag missing from ``stuff2monodix`` is reported through
        ``fail_formatting_missing_for`` and yields ``""``.
    '''
    if len(stuff) == 0:
        return ""
    if stuff not in stuff2monodix:
        fail_formatting_missing_for(stuff, "apertium")
        return ""
    monodix_tag = stuff2monodix[stuff]
    if monodix_tag in ['+', '-', '#', '0', '']:
        return monodix_tag
    # Values that start with '+' get only the closing bracket.
    opening = '' if monodix_tag.startswith('+') else '%<'
    return opening + lexc_escape(monodix_tag) + '%>'
Esempio n. 5
0
def format_lexc_ftb3(wordmap, format):
    '''
    Format lexc entry lines in the canonical FTB3 analysis format.

    Side effects on ``wordmap``: ``stub`` is replaced by its lexc-escaped
    form, ``analysis`` is built here, ``subcat`` is collapsed to
    ``'PERSONAL'`` when it contains that value, and the lemma ``eikä`` of a
    ``CONJUNCTIONVERB`` entry is rewritten to ``ei``.

    Parameters:
        wordmap: mapping providing the keys ``stub``, ``bracketstub``,
            ``pos``, ``lemma``, ``particle``, ``subcat``, ``is_proper``,
            ``symbol`` and ``new_paras``.
        format: not used by this function.

    Returns:
        Newline-joined ``analysis:stub<TAB>paradigm<TAB>;`` lines, one per
        item of ``wordmap['new_paras']``. For the lemmas ``-``, ``–``,
        ``—`` and ``(`` one extra line is added for the last paradigm.
        A stub that is a single space yields ``""``.

    Raises:
        SystemExit: with status 1 when the entry has neither a known FTB3
            part of speech nor a particle class.
    '''
    if wordmap['stub'] == ' ':
        # do not include normal white space for now
        return ""
    wordmap['stub'] = lexc_escape(wordmap['stub'])
    wordmap['analysis'] = lexc_escape(
        wordmap['bracketstub'].replace(word_boundary, '#') + '←<Del>')
    # NOTE(review): the stub tested below has already been lexc-escaped;
    # confirm that the length check is meant for the escaped form.
    if (wordmap['pos'] == 'ACRONYM'
            and (len(wordmap['stub']) == 1
                 and not wordmap['stub'].isalpha())) \
            or wordmap['stub'] == '§§':
        wordmap['analysis'] += format_tag_ftb3('PUNCTUATION')
    elif wordmap['pos'] in ['NOUN', 'VERB', 'ADJECTIVE', 'PRONOUN',
                            'NUMERAL', 'ACRONYM', 'PUNCTUATION']:
        wordmap['analysis'] += format_tag_ftb3(wordmap['pos'])
    elif wordmap['pos'] == 'CONJUNCTIONVERB':
        # In this branch the analysis built so far is replaced, not extended.
        if wordmap['lemma'] == 'eikä':
            wordmap['lemma'] = 'ei'
            wordmap['analysis'] = format_tag_ftb3('COORDINATING') + \
                    format_tag_ftb3('Nneg')
        else:
            wordmap['analysis'] = format_tag_ftb3('ADVERBIAL') + \
                    format_tag_ftb3('Nneg')
    elif wordmap['particle']:
        for pclass in wordmap['particle'].split('|'):
            wordmap['analysis'] += format_tag_ftb3(pclass)
    else:
        print("not in FTB3 known poses or particle!\n", wordmap)
        # Same effect as exit(1) without relying on the site-provided helper.
        raise SystemExit(1)
    if wordmap['subcat']:
        if 'PERSONAL' in wordmap['subcat']:
            wordmap['subcat'] = 'PERSONAL'
        for subcat in wordmap['subcat'].split('|'):
            wordmap['analysis'] += format_tag_ftb3(subcat)
    if wordmap['is_proper']:
        wordmap['analysis'] += format_tag_ftb3('PROPER')
    if wordmap['symbol']:
        for subcat in wordmap['symbol'].split('|'):
            wordmap['analysis'] += format_tag_ftb3(subcat)
        # str.replace returns a new string, so the result has to be stored
        # for the dash tag to be specialised.
        if wordmap['lemma'] == '–':
            wordmap['analysis'] = wordmap['analysis'].replace('Dash', 'EnDash')
        if wordmap['lemma'] == '—':
            wordmap['analysis'] = wordmap['analysis'].replace('Dash', 'EmDash')
    lex_stub = wordmap['stub']
    new_paras = list(wordmap['new_paras'])
    retvals = ["%s:%s\t%s\t;" % (wordmap['analysis'], lex_stub, new_para)
               for new_para in new_paras]
    # The extra entry reuses the last paradigm; without any paradigm there
    # is nothing to attach it to, so it is skipped.
    if new_paras and wordmap['lemma'] in ['-', '–', '—', '(']:
        retvals.append("%s%% %%>%%>%%>:%s\t%s\t;" % (wordmap['analysis'],
                                                    lex_stub, new_paras[-1]))

    return "\n".join(retvals)
Esempio n. 6
0
def format_lexc_apertium(wordmap):
    '''
    Format lexc entry lines in apertium analysis format.

    The analysis starts from the lexc-escaped lemma, with word boundaries
    turned into ``+`` and weak boundaries removed, followed by tags chosen
    from ``pos``, ``particle``, ``subcat`` and ``symbol``.

    Side effects on ``wordmap``: ``analysis`` and ``stub`` are overwritten,
    and the lemma ``eikä`` of a ``CONJUNCTIONVERB`` entry becomes ``ei``.

    Returns:
        One ``analysis:stub<TAB>paradigm<TAB>;`` line, each terminated by a
        newline, per item of ``wordmap['new_paras']``.
    '''
    analysis = lexc_escape(wordmap['lemma'])
    analysis = analysis.replace(word_boundary, '+').replace(weak_boundary, '')
    if wordmap['is_suffix']:
        analysis = "+" + analysis
    elif wordmap['is_prefix']:
        analysis += "+"

    pos = wordmap['pos']
    if pos == 'NOUN':
        if wordmap['is_proper']:
            analysis += '%<np%>'
            analysis += "".join(
                [format_tag_apertium(proper_class)
                 for proper_class in wordmap['proper_noun_class'].split(',')])
        else:
            analysis += '%<n%>'
    elif pos == 'VERB':
        # Verbs with argument information use the argument-specific tag.
        if wordmap['argument']:
            verb_key = wordmap['argument'] + '_arg'
        else:
            verb_key = pos
        analysis += format_tag_apertium(verb_key)
    elif pos == 'CONJUNCTIONVERB':
        # The analysis is rebuilt from scratch as "<first part>+<second part>".
        if wordmap['lemma'] == 'eikä':
            wordmap['lemma'] = 'ei'
            analysis = 'ja' + \
                    format_tag_apertium('COORDINATING') + \
                    '+ei' + \
                    format_tag_apertium('Nneg')
        else:
            lemma = wordmap['lemma']
            analysis = lemma[:-2] + \
                    format_tag_apertium('ADVERBIAL') + \
                    '+' + lemma[-2:] + \
                    format_tag_apertium('Nneg')
    elif wordmap['particle']:
        analysis += "".join(
            [format_tag_apertium(particle_class)
             for particle_class in wordmap['particle'].split('|')])
    else:
        analysis += format_tag_apertium(pos)

    for extra_key in ('subcat', 'symbol'):
        if wordmap[extra_key]:
            analysis += "".join(
                [format_tag_apertium(extra_tag)
                 for extra_tag in wordmap[extra_key].split('|')])

    wordmap['analysis'] = analysis
    wordmap['stub'] = lexc_escape(
        wordmap['stub'].replace(word_boundary, optional_hyphen))
    return "".join(
        ["%s:%s\t%s\t;\n" % (wordmap['analysis'], wordmap['stub'], paradigm)
         for paradigm in wordmap['new_paras']])
Esempio n. 7
0
def format_continuation_lexc_google(anals, surf, cont):
    '''
    Format one lexc continuation line with google universal POS analysis.

    The analysis side is ``format_analysis_lexc_google(anals)``, or the
    surface with morph and derivation boundaries removed when ``cont``
    contains ``COMPOUND``. The surface side is lexc-escaped unless it is
    exactly ``'0'``.

    Returns:
        ``"<analysis>:<surface>\\t<cont> ;\\n"``.
    '''
    analysis = format_analysis_lexc_google(anals)
    if 'COMPOUND' in cont:
        without_morph = surf.replace(morph_boundary, '')
        analysis = without_morph.replace(deriv_boundary, '')
    # A bare '0' is left unescaped.
    surface = surf if surf == '0' else lexc_escape(surf)
    return "%s:%s\t%s ;\n" % (analysis, surface, cont)
Esempio n. 8
0
def format_lexc_google(wordmap):
    '''
    Format lexc entry lines in the google universal POS analysis format.

    Side effects on ``wordmap``: ``stub`` is replaced by its lexc-escaped
    form and ``analysis`` is overwritten with the analysis built here.

    Returns:
        Newline-joined ``analysis:stub<TAB>paradigm<TAB>;`` lines, one per
        item of ``wordmap['new_paras']``; ``""`` when the stub is a single
        space.
    '''
    if wordmap['stub'] == ' ':
        # do not include normal white space for now
        return ""
    wordmap['stub'] = lexc_escape(wordmap['stub'])

    bracketed = wordmap['bracketstub'].replace(word_boundary, '#')
    parts = ["%s" % (lexc_escape(bracketed + '←<Del>'))]
    parts.append(format_tag_google(wordmap['pos']))
    for tag_key in ('particle', 'subcat'):
        if wordmap[tag_key]:
            parts.extend([format_tag_google(tag)
                          for tag in wordmap[tag_key].split('|')])
    if wordmap['is_proper']:
        parts.append(format_tag_google('PROPER'))
    wordmap['analysis'] = "".join(parts)

    stub = wordmap['stub']
    return "\n".join(
        ["%s:%s\t%s\t;" % (wordmap['analysis'], stub, paradigm)
         for paradigm in wordmap['new_paras']])
Esempio n. 9
0
def format_multichars_lexc_apertium():
    '''
    Build the lexc multichar symbol listing for apertium tags.

    Returns a comment header line followed by one ``%<tag%>`` line for each
    entry of ``apertium_multichars``, skipping entries that contain ``><``
    and the bare symbols ``''``, ``+``, ``-``, ``#`` and ``0``.
    '''
    bare_symbols = ['', '+', '-', '#', '0']
    tag_lines = [
        '%<' + lexc_escape(symbol) + "%>\n"
        for symbol in apertium_multichars
        if '><' not in symbol and symbol not in bare_symbols
    ]
    return "!! Apertium standard tags:\n" + "".join(tag_lines)
Esempio n. 10
0
def format_continuation_lexc_apertium(anals, surf, cont):
    '''
    Format one lexc continuation line with apertium-style analysis.

    The analysis side is ``format_analysis_lexc_apertium(anals)``; for
    ``DIGITS_`` continuations without a ``BACK``/``FRONT`` marker the
    escaped surface is placed in front of it.

    Returns:
        ``"<analysis>:<escaped surface>\\t<cont> ;\\n"``.
    '''
    analysis = format_analysis_lexc_apertium(anals)
    if 'DIGITS_' in cont and 'BACK' not in cont and 'FRONT' not in cont:
        analysis = lexc_escape(surf) + analysis
    return "%s:%s\t%s ;\n" % (analysis, lexc_escape(surf), cont)
Esempio n. 11
0
def format_lexc_omor(wordmap, format):
    '''
    Format lexc entry lines in the canonical omor analysis format.

    Side effects on ``wordmap``: ``stub`` is replaced by its lexc-escaped
    form, ``QUALIFIER`` is rewritten to ``ADJECTIVE`` in ``particle``, and
    ``analysis`` is overwritten with the analysis built here.

    Parameters:
        wordmap: mapping describing the lexical entry.
        format: format name; the substrings ``+propers``, ``+semantics``,
            ``+ktnkav`` and ``+newparas`` enable optional tags, and the
            value is passed on to ``format_tag_omor``.

    Returns:
        Newline-joined ``analysis:0stub<TAB>paradigm<TAB>;`` lines, one per
        item of ``wordmap['new_paras']``; ``""`` when the stub is a single
        space.
    '''
    if wordmap['stub'] == ' ':
        # do not include normal white space for now
        return ""
    wordmap['stub'] = lexc_escape(wordmap['stub'])
    parts = ["[WORD_ID=%s]" % (lexc_escape(wordmap['lemma']))]
    wordmap['particle'] = wordmap['particle'].replace('QUALIFIER', 'ADJECTIVE')

    # The POS tag is left out only for PARTICLE entries whose particle
    # class starts with 'AD'.
    if not (wordmap['pos'] == 'PARTICLE'
            and wordmap['particle'].startswith('AD')):
        parts.append(format_tag_omor(wordmap['pos'], format))
    if wordmap['is_suffix']:
        parts.append(format_tag_omor('SUFFIX', format))
    if wordmap['is_prefix']:
        parts.append(format_tag_omor('PREFIX', format))
        if wordmap['pos'] == 'ADJECTIVE':
            parts.append(format_tag_omor('Cpos', format))

    # Pipe-separated tag fields, emitted in this fixed order.
    for tag_key in ('particle', 'symbol', 'subcat'):
        if wordmap[tag_key]:
            parts.extend([format_tag_omor(tag, format)
                          for tag in wordmap[tag_key].split('|')])

    if wordmap['is_proper']:
        if '+propers' in format and wordmap['proper_noun_class']:
            parts.extend(
                [format_tag_omor(proper_class, format)
                 for proper_class in wordmap['proper_noun_class'].split(',')])
        else:
            parts.append(format_tag_omor('PROPER', format))

    if '+semantics' in format and wordmap['sem']:
        parts.extend([format_tag_omor(sem_class, format)
                      for sem_class in wordmap['sem'].split(',')])

    if wordmap['style']:
        parts.append(format_tag_omor(wordmap['style'], format))

    if '+ktnkav' in format and wordmap['pos'] != 'ACRONYM':
        ktn_tag = "[KTN=%s]" % (lexc_escape(wordmap['kotus_tn']))
        # KTN (and KAV) are only emitted when the KTN tag is a known multichar
        if ktn_tag in ktnkav_multichars:
            parts.append(ktn_tag)
            if wordmap['kotus_av']:
                parts.append("[KAV=%(kotus_av)s]" % (wordmap))
    elif '+newparas' in format:
        parts.extend(["[NEWPARA=%s]" % (paradigm)
                      for paradigm in wordmap['new_paras']])

    wordmap['analysis'] = "".join(parts)

    # match WORD_ID= with epsilon, then stub and lemma might match
    epsilon_stub = '0' + wordmap['stub']
    return "\n".join(
        ["%s:%s\t%s\t;" % (wordmap['analysis'], epsilon_stub, paradigm)
         for paradigm in wordmap['new_paras']])
Esempio n. 12
0
def format_continuation_lexc_segments(anals, surf, cont):
    '''
    Format one lexc continuation line for the segmenting automaton.

    Both sides are the lexc-escaped surface; on the left side every
    ``optional_hyphen`` is replaced by ``word_boundary``. ``anals`` is not
    used.

    Returns:
        ``"<left>:<escaped surface>\\t<cont> ; \\n"``.
    '''
    escaped_surface = lexc_escape(surf)
    left_side = escaped_surface.replace(optional_hyphen, word_boundary)
    return "%s:%s\t%s ; \n" % (left_side, escaped_surface, cont)