コード例 #1
0
ファイル: sukija.py プロジェクト: m5w/corevoikko
def write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform):
    """Write an extra copy of ``entry`` that has been passed through ``deaccent``.

    Nothing is written unless the module-level pattern ``rx_accents`` is
    found somewhere in ``wordform``; the word form "šakki" is always skipped.

    ``deaccent`` receives ``entry`` together with the range that starts at 0
    and ends at the index of " luokka: " inside ``entry``.  If that marker is
    missing a diagnostic is printed, but processing still continues with -1
    as the end index.
    """
    if rx_accents.search(wordform) is None:
        return
    if wordform == u"šakki":
        return

    class_field_pos = entry.find(u" luokka: ")
    if class_field_pos < 0:
        print("write_word_without_accents: Virhe Malaga-koodissa: " + entry + u"\n")
    plain_entry = deaccent(entry, 0, class_field_pos)
    generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, plain_entry)
コード例 #2
0
def write_word_without_accents(main_vocabulary, vocabulary_files, word, entry,
                               wordform):
    """Emit a second lexicon entry produced by ``deaccent`` for accented words.

    The entry is only written when ``rx_accents`` matches somewhere in
    ``wordform`` and ``wordform`` is not "šakki".  The text of ``entry`` is
    handed to ``deaccent`` with the bounds 0 and the position of the
    " luokka: " marker; a missing marker is reported with ``print`` and the
    call is still made with -1 as the upper bound.
    """
    has_accents = rx_accents.search(wordform) is not None
    if not has_accents or wordform == u"šakki":
        return

    marker_index = entry.find(u" luokka: ")
    if marker_index == -1:
        message = ("write_word_without_accents: Virhe Malaga-koodissa: " +
                   entry + u"\n")
        print(message)

    stripped_entry = deaccent(entry, 0, marker_index)
    generate_lex_common.write_entry(main_vocabulary, vocabulary_files,
                                    word, stripped_entry)
コード例 #3
0
def handle_word(word):
    """Write the VFST lexicon lines for one vocabulary entry.

    ``word`` is a DOM element: it is queried with ``getElementsByTagName``
    for ``infclass``, ``classes``, ``forms``, ``inflection`` and
    ``baseform`` children and with ``getAttribute("id")``.

    The function returns without writing anything when the word

    * is flagged ``not_voikko`` and ``OPTIONS["sukija"]`` is false,
    * is rejected by ``check_style`` or ``check_usage``,
    * has ``frequency(word)`` of at least ``OPTIONS["frequency"] + 1``, or
      exactly ``OPTIONS["frequency"]`` while flagged ``confusing``,
    * has the inflection class ``poikkeava``,
    * has no inflection class and its first word class is not one of
      interjection, prefix, abbreviation, conjunction or adverb, or
    * has no VFST word class according to ``get_vfst_word_class``.

    Otherwise one line per alternative form is written to the file that the
    module-level ``vocabularyFiles`` mapping holds for the word class.

    The process is terminated with ``sys.exit(1)``, after a message on
    stderr, when no inflection class mapping is found for a form or when
    the alternative spellings fail the final sanity check.
    """
    global OPTIONS
    global CLASSMAP
    # Drop words that are not needed in the Voikko lexicon
    # but only if not generating Sukija lexicon.
    if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
        return
    if not check_style(word):
        return
    if not check_usage(word):
        return
    if frequency(word) >= OPTIONS["frequency"] + 1:
        return
    if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    # In Sukija mode a "historical" class is tried first, but it is only
    # accepted when it is (or maps to) one of the classes listed below.
    voikko_infclass = None
    if OPTIONS["sukija"]:
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") == "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                if voikko_infclass == "banaali":  # "banaali" inflects like "paperi".
                    voikko_infclass = "paperi"
                elif voikko_infclass == "pasuuna":
                    voikko_infclass = "peruna"
                if voikko_infclass not in [
                        "aavistaa-av1", "arvelu", "arvelu-av1",
                        "haravoida-av2", "karahka", "matala", "paperi",
                        "paperi-av1", "peruna"
                ]:
                    voikko_infclass = None
                break
    if voikko_infclass is None:
        # Fall back to the first non-historical class.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break
    if voikko_infclass == "poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] not in [
            "interjection", "prefix", "abbreviation", "conjunction", "adverb"
    ] and voikko_infclass is None:
        return
    vfst_word_class = get_vfst_word_class(wordclasses)
    if vfst_word_class is None:
        return

    # Get diacritics
    altforms = generate_lex_common.tValues(
        word.getElementsByTagName("forms")[0], "form")
    diacritics = "".join(get_diacritics(word, altforms, vfst_word_class))

    # Get forced vowel type
    if voikko_infclass is None and vfst_word_class != "[La]":
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        inflectionElement = word.getElementsByTagName("inflection")
        if len(inflectionElement) > 0:
            forced_inflection_vtype = generate_lex_common.vowel_type(
                inflectionElement[0])
        else:
            forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

    # Construct debug information
    debug_info = ""
    if OPTIONS["sourceid"]:
        debug_info = '[Xs]%s[X]' % word.getAttribute("id")[1:].replace(
            "0", "%0")

    infoFlags = get_info_flags(word)

    # Process all alternative forms
    singlePartForms = []
    multiPartForms = []
    for altform in altforms:
        outputBaseform = altform.replace('|', '')
        wordform = outputBaseform.replace('=', '')
        # A form counts as single-part when it contains no '|', '=' or '-'.
        if len(altform) == len(wordform.replace('-', '')):
            singlePartForms.append(altform)
        else:
            multiPartForms.append(altform)
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, CLASSMAP)
        if alku is None:
            # Give up before alku/jatko are used any further. The message is
            # written as text: sys.stderr is a text stream and rejects bytes
            # on Python 3.
            errorstr = "ERROR: VFST class not found for (%s, %s)\n" % (
                wordform, voikko_infclass)
            sys.stderr.write(errorstr)
            sys.exit(1)
        if vfst_word_class == "[La]":
            jatko = get_abbreviation_jatko(word, altform)
        elif vfst_word_class == "[Ls]":
            jatko = get_adverb_jatko(word, altform)
        else:
            jatko = jatko.title()
        if vfst_word_class in ["[Ls]", "[Lc]", "[Lh]"]:
            # For these classes the <baseform> element, when present,
            # replaces the form-derived base form (the last element wins).
            for element in word.getElementsByTagName("baseform"):
                wordform = generate_lex_common.tValue(element)
                outputBaseform = wordform.replace('|', '')
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): no branch assigns vfst_vtype for any other vtype
        # value; confirm that the three constants below are exhaustive.
        if vtype == voikkoutils.VOWEL_FRONT:
            vfst_vtype = 'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            vfst_vtype = 'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            vfst_vtype = 'aä'
        # "[Lx]" -> "x" is the key of the output file for this word class.
        vocabularyFile = vocabularyFiles[
            vfst_word_class.replace("[L", "").replace("]", "")]
        alku = alku.lower()
        (rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

        if OPTIONS["no-baseform"]:
            outputBaseform = ""

        if vfst_word_class == "[Lh]":
            entry = '%s%s%s%s:%s # ;' % (
                vfst_word_class, debug_info, rakenne,
                injectBaseformToStructure(outputBaseform, alkuWithTags), alku)
            vocabularyFile.write(entry + "\n")
            continue
        vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

        # Vowel type in derived verbs
        # NOTE(review): diacritics is extended in place and never reset, so
        # what is appended for one alternative form is still present for the
        # following forms; confirm that this carry-over is intended.
        if jatko in [
                "Heittää", "Muistaa", "Juontaa", "Hohtaa", "Murtaa", "Nousta",
                "Loistaa", "Jättää", "Kihistä"
        ]:
            diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)
            if (jatko == "Kihistä" and vtype == voikkoutils.VOWEL_FRONT
                    and "y" not in alku and "ä" not in alku
                    and "ö" not in alku and "e" in alku):
                jatko = "Helistä"

        if jatko == "Nainen" and vfst_class_prefix in [
                "Laatusana", "NimiLaatusana"
        ] and altform.endswith("inen"):
            jatko = "NainenInen"

        if vfst_word_class == "[Lp]":
            entry = '[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' % (
                debug_info, rakenne, alkuWithTags, diacritics, infoFlags,
                alku, diacritics, get_prefix_jatko(word, altform))
        else:
            entry = '%s%s%s%s%s%s:%s%s %s%s_%s ;' % (
                vfst_word_class, debug_info, rakenne, infoFlags,
                injectBaseformToStructure(outputBaseform, alkuWithTags),
                diacritics, alku, diacritics, vfst_class_prefix, jatko,
                vfst_vtype)
        vocabularyFile.write(entry + "\n")

    # Sanity check for alternative forms: if there are both multi part forms and single part forms
    # then all multi part forms must end with a part contained in the single part set.
    if singlePartForms:
        for multiPartForm in multiPartForms:
            lastPart = multiPartForm[max(multiPartForm.rfind(
                "="), multiPartForm.rfind("|"), multiPartForm.rfind("-")) + 1:]
            if lastPart not in singlePartForms:
                sys.stderr.write(
                    "ERROR: suspicious alternative spelling: %s\n" %
                    multiPartForm)
                sys.exit(1)
コード例 #4
0
ファイル: sukija.py プロジェクト: m5w/corevoikko
def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Sukija lexicon entries for one vocabulary entry.

    ``word`` is a DOM element that is queried with ``getElementsByTagName``
    for ``infclass``, ``classes``, ``inflection`` and ``forms`` children.
    ``main_vocabulary`` and ``vocabulary_files`` are not inspected here;
    they are passed on to ``generate_lex_common.write_entry`` and
    ``write_word_without_accents``.

    Nothing is written when the word is flagged ``not_sukija``, has the
    inflection class ``poikkeava``, has no inflection class while its first
    word class is not ``interjection``, or has no Malaga word class.

    For every alternative form one entry is written, followed by whatever
    ``write_word_without_accents`` adds for it.  Forms for which no
    inflection class mapping is found produce a ``#Malaga class not found``
    line instead, and a number of forms are skipped on purpose (see the
    checks inside the loop).
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    # A "historical" class is preferred when the word has one.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    # These historical classes are replaced by their "-av1" variants.
    if voikko_infclass in [
        u"antautua",
        u"kaihtaa",
        u"laittaa",
        u"paahtaa",
        u"taittaa",
        u"veranta",
        u"vihanta",
        u"virkkaa",
    ]:
        voikko_infclass = voikko_infclass + u"-av1"

    if voikko_infclass is None:
        # No historical class: use the first non-historical one.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break

    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type
    if voikko_infclass is None:
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        forced_inflection_vtype = generate_lex_common.vowel_type(word.getElementsByTagName("inflection")[0])

    # Process all alternative forms
    for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u"|", u"").replace(u"=", u"")
        # The replacement below sticks for the remaining alternative forms
        # as well, because voikko_infclass is not restored afterwards.
        if voikko_infclass == u"nuolaista-av2" and wordform in [u"häväistä", u"vavista"]:
            voikko_infclass = u"nuolaista"
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap
        )
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): no branch assigns malaga_vtype for any other vtype
        # value; confirm that the three constants below are exhaustive.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u"ä"
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u"a"
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u"aä"
        malaga_vtype = new_vtype(malaga_vtype, wordform)
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
        if alku is None:
            generate_lex_common.write_entry(
                main_vocabulary,
                vocabulary_files,
                word,
                u"#Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass),
            )
            continue

        # Forms that are not needed in the Sukija lexicon.
        if wordform in words:
            continue
        if rx_begin.match(wordform) is not None or rx_end.match(wordform) is not None:
            continue
        # Some words have two inflection patterns in the vocabulary; in
        # Sukija the patterns have been merged, so the other one is dropped.
        if wordform in [u"ori", u"ripsi", u"sini", u"täti", u"äiti"] and jatko == u"risti":
            continue
        if wordform == u"kampi" and jatko == u"sampi":
            continue

        # Adjust the value of the alku field.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' % (
            wordform,
            alku,
            malaga_word_class,
            jatko,
            malaga_vtype,
            malaga_flags,
            rakenne,
        )
        generate_lex_common.write_entry(main_vocabulary, vocabulary_files, word, entry)

        write_word_without_accents(main_vocabulary, vocabulary_files, word, entry, wordform)
コード例 #5
0
ファイル: generate_lex.py プロジェクト: diegolinan/corevoikko
def handle_word(word):
	"""Write the Malaga lexicon entries for one vocabulary entry.

	``word`` is a DOM element that is queried with ``getElementsByTagName``
	for ``infclass``, ``classes``, ``baseform``, ``inflection`` and ``forms``
	children and with ``getAttribute("id")``.  Entries are written through
	``generate_lex_common.write_entry`` to the module-level
	``main_vocabulary``.

	Nothing is written when the word

	* is flagged ``not_voikko`` and "sukija" is not in
	  ``OPTIONS["extra-usage"]``,
	* is rejected by ``check_style`` or ``check_usage``,
	* has ``frequency(word)`` of at least ``OPTIONS["frequency"] + 1``, or
	  exactly ``OPTIONS["frequency"]`` while flagged ``confusing``,
	* has the inflection class ``poikkeava``,
	* has no inflection class and its first word class is not one of
	  interjection, prefix, abbreviation, conjunction or adverb, or
	* has no Malaga word class.

	The process is terminated with ``sys.exit(1)``, after a message on
	stderr, when no inflection class mapping is found for a form or when the
	alternative spellings fail the final sanity check.
	"""
	global OPTIONS
	global CLASSMAP
	# Drop words that are not needed in the Voikko lexicon
	if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
		return
	if not check_style(word):
		return
	if not check_usage(word):
		return
	if frequency(word) >= OPTIONS["frequency"] + 1:
		return
	if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
		return

	# Get the inflection class. Exactly one inflection class is needed;
	# the first non-historical one is used.
	voikko_infclass = None
	for infclass in word.getElementsByTagName("infclass"):
		if infclass.getAttribute("type") != "historical":
			voikko_infclass = generate_lex_common.tValue(infclass)
			break
	if voikko_infclass == "poikkeava":
		return

	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	if wordclasses[0] not in ["interjection", "prefix", "abbreviation", "conjunction", "adverb"] and voikko_infclass is None:
		return
	malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
	if malaga_word_class is None:
		return

	# An explicit <baseform> element, when present, overrides the
	# alternative form as the base form written to the entry.
	baseformTags = word.getElementsByTagName("baseform")
	if len(baseformTags) > 0:
		baseform = generate_lex_common.tValue(baseformTags[0])
	else:
		baseform = None

	# Get malaga flags
	malaga_flags = generate_lex_common.get_malaga_flags(word)

	# Get forced vowel type
	if voikko_infclass is None and malaga_word_class != "lyhenne":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

	# Construct debug information and additional attributes
	additional_attributes = get_additional_attributes(word)
	if OPTIONS["sourceid"]:
		additional_attributes = additional_attributes + ', sourceid: "%s"' % word.getAttribute("id")

	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form"):
		wordform = altform.replace('|', '').replace('=', '')
		# A form counts as single-part when it contains no '|', '=' or '-'.
		if len(altform) == len(wordform.replace('-', '')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		if alku is None:
			errorstr = "ERROR: Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass)
			generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
			# Written as text: sys.stderr is a text stream and rejects
			# bytes on Python 3.
			sys.stderr.write(errorstr)
			sys.exit(1)
		if malaga_word_class == "lyhenne":
			jatko = get_abbreviation_jatko(word, altform)
		elif malaga_word_class == "seikkasana":
			jatko = get_adverb_jatko(word)
		if malaga_word_class == "etuliite":
			vtype = voikkoutils.VOWEL_BOTH
			malaga_jatko = get_prefix_jatko(word)
		else:
			if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
				vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
			else:
				vtype = forced_inflection_vtype
			malaga_jatko = "<" + jatko + ">"
		# NOTE(review): no branch assigns malaga_vtype for any other vtype
		# value; confirm that the three constants below are exhaustive.
		if vtype == voikkoutils.VOWEL_FRONT:
			malaga_vtype = 'ä'
		elif vtype == voikkoutils.VOWEL_BACK:
			malaga_vtype = 'a'
		elif vtype == voikkoutils.VOWEL_BOTH:
			malaga_vtype = 'aä'
		rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
		if baseform is None:
			altBaseform = altform
		else:
			altBaseform = baseform
		# Entries of class "lyhenne" carry no perusmuoto field.
		if malaga_word_class == "lyhenne":
			perusmuotoEntry = ""
		else:
			perusmuotoEntry = 'perusmuoto: "%s", ' % altBaseform
		entry = '[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' % (
			perusmuotoEntry, alku, malaga_word_class, malaga_jatko,
			malaga_vtype, malaga_flags, rakenne, additional_attributes)
		generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastPart = multiPartForm[max(multiPartForm.rfind("="), multiPartForm.rfind("|"), multiPartForm.rfind("-")) + 1:]
			if lastPart not in singlePartForms:
				sys.stderr.write("ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
				sys.exit(1)
コード例 #6
0
ファイル: generate_lex.py プロジェクト: komu/corevoikko
def handle_word(word):
	"""Write the VFST lexicon lines for one vocabulary entry.

	``word`` is a DOM element that is queried with ``getElementsByTagName``
	for ``infclass``, ``classes``, ``forms`` and ``inflection`` children.
	One line per alternative form is written to the file that the
	module-level ``vocabularyFiles`` mapping holds for the word class.

	Nothing is written when the word

	* is flagged ``not_voikko`` and ``OPTIONS["sukija"]`` is false,
	* is rejected by ``check_style`` or ``check_usage``,
	* has ``frequency(word)`` of at least ``OPTIONS["frequency"] + 1``, or
	  exactly ``OPTIONS["frequency"]`` while flagged ``confusing``,
	* has the inflection class ``poikkeava``,
	* has no inflection class and its first word class is not one of
	  interjection, prefix, abbreviation or adverb, or
	* has no VFST word class according to ``get_vfst_word_class``.

	The process is terminated with ``sys.exit(1)``, after a message on
	stderr, when no inflection class mapping is found for a form or when the
	alternative spellings fail the final sanity check.
	"""
	global OPTIONS
	global CLASSMAP
	# Drop words that are not needed in the Voikko lexicon
	# but only if not generating Sukija lexicon.
	if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
		return
	if not check_style(word):
		return
	if not check_usage(word):
		return
	if frequency(word) >= OPTIONS["frequency"] + 1:
		return
	if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
		return

	# Get the inflection class. Exactly one inflection class is needed;
	# the first non-historical one is used.
	voikko_infclass = None
	for infclass in word.getElementsByTagName("infclass"):
		if infclass.getAttribute("type") != "historical":
			voikko_infclass = generate_lex_common.tValue(infclass)
			break
	if voikko_infclass == u"poikkeava":
		return

	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"adverb"] and voikko_infclass is None:
		return
	vfst_word_class = get_vfst_word_class(wordclasses)
	if vfst_word_class is None:
		return

	# Get diacritics
	altforms = generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form")
	diacritics = u"".join(get_diacritics(word, altforms, vfst_word_class))

	# Get forced vowel type
	if voikko_infclass is None and vfst_word_class != u"[La]":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

	infoFlags = get_info_flags(word)

	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in altforms:
		wordform = altform.replace(u'|', u'').replace(u'=', u'')
		# A form counts as single-part when it contains no '|', '=' or '-'.
		if len(altform) == len(wordform.replace(u'-', u'')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		# u"[Lx]" -> u"x" is the key of the output file for this word class.
		vocabularyFile = vocabularyFiles[vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]
		if alku is None:
			# Report the failed lookup before jatko is touched: calling
			# jatko.title() first would presumably fail on a missing jatko
			# and hide this diagnostic (TODO confirm that jatko is None
			# whenever alku is).
			errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
				% (wordform, voikko_infclass)
			generate_lex_common.write_entry(vocabularyFile, {}, word, errorstr)
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)
		if vfst_word_class == u"[La]":
			jatko = u"Lyhenne"
		elif vfst_word_class == u"[Ls]":
			jatko = get_adverb_jatko(word)
		else:
			jatko = jatko.title()
		if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
			vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
		else:
			vtype = forced_inflection_vtype
		# NOTE(review): no branch assigns vfst_vtype for any other vtype
		# value; confirm that the three constants below are exhaustive.
		if vtype == voikkoutils.VOWEL_FRONT:
			vfst_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK:
			vfst_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH:
			vfst_vtype = u'aä'
		rakenne = get_structure(altform, vfst_word_class)
		if vfst_word_class == u"[Lh]":
			entry = u'%s[Xp]%s[X]%s%s:%s # ;' % (vfst_word_class, wordform, rakenne, alku, alku)
			vocabularyFile.write(entry + u"\n")
			continue
		vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)
		alku = alku.lower()

		# Vowel type in derived verbs
		# NOTE(review): diacritics is extended in place and never reset, so
		# what is appended for one alternative form is still present for the
		# following forms; confirm that this carry-over is intended.
		if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa", u"Nousta", u"Loistaa", u"Jättää"]:
			diacritics = diacritics + vowel_type_for_derived_verb(alku)

		if vfst_word_class == u"[Lp]":
			entry = u'[Lp]%s:%s EtuliitteenJatko_%s;' \
			        % (wordform, wordform, get_prefix_jatko(word))
		else:
			entry = u'%s[Xp]%s[X]%s%s%s%s:%s%s %s%s_%s ;' \
			        % (vfst_word_class, wordform, rakenne, infoFlags,
			        alku, diacritics, alku, diacritics, vfst_class_prefix, jatko, vfst_vtype)
		vocabularyFile.write(entry + u"\n")

	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastPart = multiPartForm[max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-")) + 1:]
			if lastPart not in singlePartForms:
				sys.stderr.write(u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
				sys.exit(1)
コード例 #7
0
ファイル: generate_lex.py プロジェクト: santipazos/corevoikko
def handle_word(word):
	"""Write the Malaga lexicon entries for one vocabulary entry.

	``word`` is a DOM element that is queried with ``getElementsByTagName``
	for ``infclass``, ``classes``, ``baseform``, ``inflection`` and ``forms``
	children and with ``getAttribute("id")``.  Every alternative form yields
	one entry, written with ``generate_lex_common.write_entry`` to the
	module-level ``main_vocabulary``.

	The word is skipped when it is flagged ``not_voikko`` (unless "sukija"
	is in ``OPTIONS["extra-usage"]``), is rejected by ``check_style`` or
	``check_usage``, is filtered out by the ``OPTIONS["frequency"]``
	comparison, has the inflection class ``poikkeava``, lacks an inflection
	class that its first word class requires, or has no Malaga word class.

	The process exits with status 1, after a message on stderr, when no
	inflection class mapping is found for a form or when the alternative
	spellings fail the final sanity check.
	"""
	global OPTIONS
	global CLASSMAP
	# Drop words that are not needed in the Voikko lexicon
	if generate_lex_common.has_flag(word, "not_voikko") and "sukija" not in OPTIONS["extra-usage"]:
		return
	if not (check_style(word) and check_usage(word)):
		return
	if frequency(word) >= OPTIONS["frequency"] + 1:
		return
	if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
		return

	# Inflection class: the first <infclass> that is not "historical".
	voikko_infclass = next(
		(generate_lex_common.tValue(node)
		 for node in word.getElementsByTagName("infclass")
		 if node.getAttribute("type") != "historical"),
		None)
	if voikko_infclass == u"poikkeava":
		return

	# Word classes; only the ones listed here may lack an inflection class.
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	needs_infclass = wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"]
	if needs_infclass and voikko_infclass is None:
		return
	malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
	if malaga_word_class is None:
		return

	# An explicit <baseform> element overrides the alternative form below.
	baseform_nodes = word.getElementsByTagName("baseform")
	baseform = generate_lex_common.tValue(baseform_nodes[0]) if len(baseform_nodes) > 0 else None

	malaga_flags = generate_lex_common.get_malaga_flags(word)

	# Forced vowel type: looked up from <inflection> only for words that
	# have an inflection class or are of class "lyhenne".
	forced_vtype = voikkoutils.VOWEL_DEFAULT
	if voikko_infclass is not None or malaga_word_class == u"lyhenne":
		inflection_nodes = word.getElementsByTagName("inflection")
		if len(inflection_nodes) > 0:
			forced_vtype = generate_lex_common.vowel_type(inflection_nodes[0])

	# Debug information and additional attributes
	extra_attributes = get_additional_attributes(word)
	if OPTIONS["sourceid"]:
		extra_attributes = extra_attributes + u', sourceid: "%s"' % word.getAttribute("id")

	# Process all alternative forms
	single_part_forms = []
	multi_part_forms = []
	forms_element = word.getElementsByTagName("forms")[0]
	for altform in generate_lex_common.tValues(forms_element, "form"):
		wordform = altform.replace(u'|', u'').replace(u'=', u'')
		# A form is single-part when it contains no '|', '=' or '-'.
		if len(altform) == len(wordform.replace(u'-', u'')):
			bucket = single_part_forms
		else:
			bucket = multi_part_forms
		bucket.append(altform)

		alku, jatko = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		if alku is None:
			errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" % (wordform, voikko_infclass)
			generate_lex_common.write_entry(main_vocabulary, {}, word, errorstr)
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)

		if malaga_word_class == u"lyhenne":
			jatko = get_abbreviation_jatko(word, altform)
		elif malaga_word_class == u"seikkasana":
			jatko = get_adverb_jatko(word)

		if malaga_word_class == u"etuliite":
			vtype = voikkoutils.VOWEL_BOTH
			malaga_jatko = get_prefix_jatko(word)
		else:
			vtype = forced_vtype
			if vtype == voikkoutils.VOWEL_DEFAULT:
				vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
			malaga_jatko = u"<" + jatko + u">"

		if vtype == voikkoutils.VOWEL_FRONT:
			malaga_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK:
			malaga_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH:
			malaga_vtype = u'aä'

		# The result of this call is not used; the structure is computed
		# again while the entry fields are assembled below.
		rakenne = generate_lex_common.get_structure(altform, malaga_word_class)

		shown_baseform = altform if baseform is None else baseform
		# Entries of class "lyhenne" carry no perusmuoto field.
		if malaga_word_class == u"lyhenne":
			perusmuoto_field = u""
		else:
			perusmuoto_field = u'perusmuoto: "%s", ' % shown_baseform

		fields = (
			perusmuoto_field,
			alku,
			malaga_word_class,
			malaga_jatko,
			malaga_vtype,
			malaga_flags,
			generate_lex_common.get_structure(altform, malaga_word_class),
			extra_attributes,
		)
		entry = u'[%salku: "%s", luokka: %s, jatko: %s, äs: %s%s%s%s];' % fields
		generate_lex_common.write_entry(main_vocabulary, {}, word, entry)

	# Sanity check for alternative forms: if there are both multi part forms
	# and single part forms then every multi part form must end with a part
	# contained in the single part set.
	if not single_part_forms:
		return
	for multi_part_form in multi_part_forms:
		cut = max(rfind(multi_part_form, separator) for separator in (u"=", u"|", u"-")) + 1
		if multi_part_form[cut:] not in single_part_forms:
			sys.stderr.write(u"ERROR: suspicious alternative spelling: %s\n" % multi_part_form)
			sys.exit(1)
コード例 #8
0
ファイル: generate_lex.py プロジェクト: voikko/corevoikko
def handle_word(word):
	"""Write the VFST lexicon entries for one vocabulary word.

	One line is written per alternative form of the word, to the
	vocabulary file selected by the word's VFST word class.

	Nothing is written when the word is flagged "not_voikko" (and the
	"sukija" option is off), is rejected by check_style / check_usage,
	is above the configured frequency limit, has the inflection class
	"poikkeava", or has no usable inflection or word class.

	word: DOM element of the word; it is read with getElementsByTagName /
	      getAttribute and handed to the generate_lex_common helpers.

	Exits the process with status 1 when no class mapping is found for a
	form, or when the alternative spelling sanity check fails.
	"""
	# Drop words that are not needed in the Voikko lexicon
	# but only if not generating Sukija lexicon.
	if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
		return
	if not check_style(word):
		return
	if not check_usage(word):
		return
	if frequency(word) >= OPTIONS["frequency"] + 1:
		return
	if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
		return

	# Get the inflection class. Exactly one inflection class is needed
	voikko_infclass = None
	if OPTIONS["sukija"]:
		# For Sukija the first "historical" class is considered, but it is
		# only accepted when it is one of the classes listed below.
		for infclass in word.getElementsByTagName("infclass"):
			if infclass.getAttribute("type") == "historical":
				voikko_infclass = generate_lex_common.tValue(infclass)
				if voikko_infclass == "banaali":   # "banaali" inflects like "paperi".
					voikko_infclass = "paperi"
				elif voikko_infclass == "pasuuna":
					voikko_infclass = "peruna"
				if voikko_infclass not in ["aavistaa-av1", "arvelu", "arvelu-av1", "asema-av1", "haravoida-av2", "karahka", "kiiski", "matala", "paperi", "paperi-av1", "peruna"]:
					voikko_infclass = None
				break
	if voikko_infclass is None:
		# Fall back to the first non-historical inflection class.
		for infclass in word.getElementsByTagName("infclass"):
			if infclass.getAttribute("type") != "historical":
				voikko_infclass = generate_lex_common.tValue(infclass)
				break
	if voikko_infclass == "poikkeava":
		return

	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	if wordclasses[0] not in ["interjection", "prefix", "abbreviation", "conjunction", "adverb"] and voikko_infclass is None:
		return
	vfst_word_class = get_vfst_word_class(wordclasses)
	if vfst_word_class is None:
		return

	# Get diacritics
	altforms = generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form")
	diacritics = "".join(get_diacritics(word, altforms, vfst_word_class))

	# Get forced vowel type
	if voikko_infclass is None and vfst_word_class != "[La]":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

	# Construct debug information
	debug_info = ""
	if OPTIONS["sourceid"]:
		# The first character of the id is dropped and every "0" is written as "%0".
		debug_info = '[Xs]%s[X]' % word.getAttribute("id")[1:].replace("0", "%0")

	infoFlags = get_info_flags(word)

	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in altforms:
		outputBaseform = altform.replace('|', '')
		wordform = outputBaseform.replace('=', '')
		# A form counts as single part when it contains none of '|', '=' or '-'.
		if len(altform) == len(wordform.replace('-', '')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		# This is the only place where a missing class has to be handled:
		# alku is not reassigned to None anywhere below.
		if alku is None:
			errorstr = "ERROR: VFST class not found for (%s, %s)\n" % (wordform, voikko_infclass)
			sys.stderr.write(errorstr)
			sys.exit(1)
		if vfst_word_class == "[La]":
			jatko = get_abbreviation_jatko(word, altform)
		elif vfst_word_class == "[Ls]":
			jatko = get_adverb_jatko(word, altform)
		else:
			jatko = jatko.title()
		if vfst_word_class in ["[Ls]", "[Lc]", "[Lh]"]:
			# An explicit <baseform> overrides the base form derived from the
			# alternative form; if there are several, the last one wins.
			for element in word.getElementsByTagName("baseform"):
				wordform = generate_lex_common.tValue(element)
				outputBaseform = wordform.replace('|', '')
		if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
			vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
		else:
			vtype = forced_inflection_vtype
		# NOTE(review): vfst_vtype is left unset (or stale from the previous
		# form) if vtype is none of the three values tested here.
		if vtype == voikkoutils.VOWEL_FRONT:
			vfst_vtype = 'ä'
		elif vtype == voikkoutils.VOWEL_BACK:
			vfst_vtype = 'a'
		elif vtype == voikkoutils.VOWEL_BOTH:
			vfst_vtype = 'aä'
		# The file key is the word class without its "[L" prefix and "]" suffix.
		vocabularyFile = vocabularyFiles[vfst_word_class.replace("[L", "").replace("]", "")]
		alku = alku.lower()
		(rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

		if OPTIONS["no-baseform"]:
			outputBaseform = ""

		if vfst_word_class == "[Lh]":
			entry = '%s%s%s%s:%s # ;' % (vfst_word_class, debug_info, rakenne, injectBaseformToStructure(outputBaseform, alkuWithTags), alku)
			vocabularyFile.write(entry + "\n")
			continue
		vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

		# Vowel type in derived verbs
		# NOTE(review): diacritics is defined outside the loop, so what is
		# appended here also carries over to the following alternative forms.
		if jatko in ["Heittää", "Muistaa", "Juontaa", "Hohtaa", "Murtaa", "Nousta", "Loistaa", "Jättää", "Kihistä", "Kyntää2"]:
			diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)
			if jatko == "Kihistä" and vtype == voikkoutils.VOWEL_FRONT and "y" not in alku and "ä" not in alku and "ö" not in alku and "e" in alku:
				jatko = "Helistä"

		if jatko == "Nainen" and vfst_class_prefix in ["Laatusana", "NimiLaatusana"] and altform.endswith("inen"):
			jatko = "NainenInen"

		if vfst_word_class == "[Lp]":
			entry = '[Lp]%s%s%s%s%s:%s%s EtuliitteenJatko_%s;' % (
				debug_info, rakenne, alkuWithTags, diacritics, infoFlags,
				alku, diacritics, get_prefix_jatko(word, altform))
		else:
			entry = '%s%s%s%s%s%s:%s%s %s%s_%s ;' % (
				vfst_word_class, debug_info, rakenne, infoFlags,
				injectBaseformToStructure(outputBaseform, alkuWithTags),
				diacritics, alku, diacritics, vfst_class_prefix, jatko, vfst_vtype)
		vocabularyFile.write(entry + "\n")

	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastPart = multiPartForm[max(multiPartForm.rfind("="), multiPartForm.rfind("|"), multiPartForm.rfind("-")) + 1:]
			if lastPart not in singlePartForms:
				sys.stderr.write("ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
				sys.exit(1)
コード例 #9
0
def handle_word(main_vocabulary, vocabulary_files, word):
    """Write the Sukija Malaga lexicon entries for one vocabulary word.

    For every alternative form that is not filtered out, one entry is
    written with generate_lex_common.write_entry, followed by a call to
    write_word_without_accents for the same entry.

    Nothing is written when the word is flagged "not_sukija", has the
    inflection class "poikkeava", has no inflection class (unless its first
    word class is "interjection"), or has no Malaga word class.

    main_vocabulary, vocabulary_files: passed through unchanged to
        write_entry and write_word_without_accents.
    word: DOM element of the word; it is read with getElementsByTagName /
        getAttribute and handed to the generate_lex_common helpers.
    """
    if generate_lex_common.has_flag(word, "not_sukija"):
        return

    # Get the inflection class. Exactly one inflection class is needed.
    # The first "historical" class is preferred.
    voikko_infclass = None
    for infclass in word.getElementsByTagName("infclass"):
        if infclass.getAttribute("type") == "historical":
            voikko_infclass = generate_lex_common.tValue(infclass)
            break
    # These historical classes are mapped to their "-av1" variants.
    if voikko_infclass in [
            u"antautua", u"kaihtaa", u"laittaa", u"paahtaa", u"taittaa",
            u"veranta", u"vihanta", u"virkkaa"
    ]:
        voikko_infclass = voikko_infclass + u"-av1"

    if voikko_infclass is None:
        # Fall back to the first non-historical inflection class.
        for infclass in word.getElementsByTagName("infclass"):
            if infclass.getAttribute("type") != "historical":
                voikko_infclass = generate_lex_common.tValue(infclass)
                break

    if voikko_infclass == u"poikkeava":
        return

    # Get the word classes
    wordclasses = generate_lex_common.tValues(
        word.getElementsByTagName("classes")[0], "wclass")
    if wordclasses[0] != u"interjection" and voikko_infclass is None:
        return
    malaga_word_class = generate_lex_common.get_malaga_word_class(wordclasses)
    if malaga_word_class is None:
        return

    # Get malaga flags
    malaga_flags = generate_lex_common.get_malaga_flags(word)

    # Get forced vowel type
    if voikko_infclass is None:
        forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
    else:
        forced_inflection_vtype = generate_lex_common.vowel_type(
            word.getElementsByTagName("inflection")[0])

    # Process all alternative forms
    for altform in generate_lex_common.tValues(
            word.getElementsByTagName("forms")[0], "form"):
        wordform = altform.replace(u'|', u'').replace(u'=', u'')
        # NOTE(review): this reassignment is not undone, so it also applies
        # to the alternative forms processed after this one.
        if voikko_infclass == u"nuolaista-av2" and wordform in [
                u"häväistä", u"vavista"
        ]:
            voikko_infclass = u"nuolaista"
        (alku, jatko) = generate_lex_common.get_malaga_inflection_class(
            wordform, voikko_infclass, wordclasses, classmap)
        if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
            vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
        else:
            vtype = forced_inflection_vtype
        # NOTE(review): malaga_vtype is left unset (or stale from the previous
        # form) if vtype is none of the three values tested here.
        if vtype == voikkoutils.VOWEL_FRONT:
            malaga_vtype = u'ä'
        elif vtype == voikkoutils.VOWEL_BACK:
            malaga_vtype = u'a'
        elif vtype == voikkoutils.VOWEL_BOTH:
            malaga_vtype = u'aä'
        malaga_vtype = new_vtype(malaga_vtype, wordform)
        rakenne = generate_lex_common.get_structure(altform, malaga_word_class)
        if alku is None:
            # No class mapping: record that in the output and go on.
            generate_lex_common.write_entry(
                main_vocabulary, vocabulary_files, word,
                u"#Malaga class not found for (%s, %s)\n"
                % (wordform, voikko_infclass))
            continue
        # Forms listed in `words` or matched by rx_begin / rx_end are not needed.
        if (wordform in words
                or rx_begin.match(wordform) is not None
                or rx_end.match(wordform) is not None):
            continue
        # Some words have two inflection paradigms in the vocabulary. In
        # Sukija the paradigms have been merged, so the other one is dropped.
        if (wordform in [u'ori', u'ripsi', u'sini', u'täti', u'äiti']
                and jatko == u'risti'):
            continue
        if wordform == u'kampi' and jatko == u'sampi':
            continue

        # Adjust the stem for this class: the word form without its last
        # four characters.
        if jatko == u"rakentaa":
            alku = wordform[:-4]

        # Write the entry, then hand it to write_word_without_accents.
        entry = u'[perusmuoto: "%s", alku: "%s", luokka: %s, jatko: <%s>, äs: %s%s%s];' \
            % (wordform, alku, malaga_word_class, jatko, malaga_vtype,
               malaga_flags, rakenne)
        generate_lex_common.write_entry(main_vocabulary, vocabulary_files,
                                        word, entry)

        write_word_without_accents(main_vocabulary, vocabulary_files, word,
                                   entry, wordform)
コード例 #10
0
def handle_word(word):
	"""Write the VFST lexicon entries for one vocabulary word.

	One line is written per alternative form of the word, to the
	vocabulary file selected by the word's VFST word class.

	Nothing is written when the word is flagged "not_voikko" (and the
	"sukija" option is off), is rejected by check_style / check_usage,
	is above the configured frequency limit, has the inflection class
	"poikkeava", or has no usable inflection or word class.

	word: DOM element of the word; it is read with getElementsByTagName /
	      getAttribute and handed to the generate_lex_common helpers.

	Exits the process with status 1 when no class mapping is found for a
	form, or when the alternative spelling sanity check fails.
	"""
	# Drop words that are not needed in the Voikko lexicon
	# but only if not generating Sukija lexicon.
	if generate_lex_common.has_flag(word, "not_voikko") and not OPTIONS["sukija"]:
		return
	if not check_style(word):
		return
	if not check_usage(word):
		return
	if frequency(word) >= OPTIONS["frequency"] + 1:
		return
	if frequency(word) == OPTIONS["frequency"] and generate_lex_common.has_flag(word, "confusing"):
		return

	# Get the inflection class. Exactly one inflection class is needed
	# (the first non-historical one is used).
	voikko_infclass = None
	for infclass in word.getElementsByTagName("infclass"):
		if infclass.getAttribute("type") != "historical":
			voikko_infclass = generate_lex_common.tValue(infclass)
			break
	if voikko_infclass == u"poikkeava":
		return

	# Get the word classes
	wordclasses = generate_lex_common.tValues(word.getElementsByTagName("classes")[0], "wclass")
	if wordclasses[0] not in [u"interjection", u"prefix", u"abbreviation", u"conjunction", u"adverb"] and voikko_infclass is None:
		return
	vfst_word_class = get_vfst_word_class(wordclasses)
	if vfst_word_class is None:
		return

	# Get diacritics
	altforms = generate_lex_common.tValues(word.getElementsByTagName("forms")[0], "form")
	diacritics = u"".join(get_diacritics(word, altforms, vfst_word_class))

	# Get forced vowel type
	if voikko_infclass is None and vfst_word_class != u"[La]":
		forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT
	else:
		inflectionElement = word.getElementsByTagName("inflection")
		if len(inflectionElement) > 0:
			forced_inflection_vtype = generate_lex_common.vowel_type(inflectionElement[0])
		else:
			forced_inflection_vtype = voikkoutils.VOWEL_DEFAULT

	# Construct debug information
	# NOTE(review): debug_info is built here but is not included in any of
	# the entries written below - confirm whether that is intended.
	debug_info = u""
	if OPTIONS["sourceid"]:
		debug_info = u', sourceid: "%s"' % word.getAttribute("id")

	infoFlags = get_info_flags(word)

	# Process all alternative forms
	singlePartForms = []
	multiPartForms = []
	for altform in altforms:
		wordform = altform.replace(u'|', u'').replace(u'=', u'')
		# A form counts as single part when it contains none of '|', '=' or '-'.
		if len(altform) == len(wordform.replace(u'-', u'')):
			singlePartForms.append(altform)
		else:
			multiPartForms.append(altform)
		(alku, jatko) = generate_lex_common.get_malaga_inflection_class(wordform, voikko_infclass, wordclasses, CLASSMAP)
		# The file key is the word class without its "[L" prefix and "]" suffix.
		vocabularyFile = vocabularyFiles[vfst_word_class.replace(u"[L", u"").replace(u"]", u"")]
		# Report a missing class before jatko is touched: jatko.title() below
		# would otherwise fail first (presumably jatko is None as well when
		# alku is None) and this message would never be written.
		if alku is None:
			errorstr = u"ERROR: Malaga class not found for (%s, %s)\n" \
				% (wordform, voikko_infclass)
			generate_lex_common.write_entry(vocabularyFile, {}, word, errorstr)
			sys.stderr.write(errorstr.encode(u"UTF-8"))
			sys.exit(1)
		if vfst_word_class == u"[La]":
			jatko = get_abbreviation_jatko(word, altform)
		elif vfst_word_class == u"[Ls]":
			jatko = get_adverb_jatko(word, altform)
		else:
			jatko = jatko.title()
		if forced_inflection_vtype == voikkoutils.VOWEL_DEFAULT:
			vtype = voikkoutils.get_wordform_infl_vowel_type(altform)
		else:
			vtype = forced_inflection_vtype
		# NOTE(review): vfst_vtype is left unset (or stale from the previous
		# form) if vtype is none of the three values tested here.
		if vtype == voikkoutils.VOWEL_FRONT:
			vfst_vtype = u'ä'
		elif vtype == voikkoutils.VOWEL_BACK:
			vfst_vtype = u'a'
		elif vtype == voikkoutils.VOWEL_BOTH:
			vfst_vtype = u'aä'
		alku = alku.lower()
		(rakenne, alkuWithTags) = get_structure(altform, vfst_word_class, alku)

		if vfst_word_class == u"[Lh]":
			entry = u'%s[Xp]%s[X]%s%s:%s # ;' % (vfst_word_class, wordform, rakenne, alkuWithTags, alku)
			vocabularyFile.write(entry + u"\n")
			continue
		vfst_class_prefix = get_vfst_class_prefix(vfst_word_class)

		# Vowel type in derived verbs
		# NOTE(review): diacritics is defined outside the loop, so what is
		# appended here also carries over to the following alternative forms.
		if jatko in [u"Heittää", u"Muistaa", u"Juontaa", u"Hohtaa", u"Murtaa", u"Nousta", u"Loistaa", u"Jättää", u"Kihistä"]:
			diacritics = diacritics + vowel_type_for_derived_verb(alkuWithTags)

		if jatko == u"Nainen" and vfst_class_prefix in [u"Laatusana", u"NimiLaatusana"] and altform.endswith(u"inen"):
			jatko = u"NainenInen"

		if vfst_word_class == u"[Lp]":
			entry = u'[Lp]%s%s%s:%s%s EtuliitteenJatko_%s;' % (
				wordform, diacritics, infoFlags, wordform, diacritics,
				get_prefix_jatko(word, altform))
		else:
			entry = u'%s[Xp]%s[X]%s%s%s%s:%s%s %s%s_%s ;' % (
				vfst_word_class, wordform, rakenne, infoFlags,
				alkuWithTags, diacritics, alku, diacritics,
				vfst_class_prefix, jatko, vfst_vtype)
		vocabularyFile.write(entry + u"\n")

	# Sanity check for alternative forms: if there are both multi part forms and single part forms
	# then all multi part forms must end with a part contained in the single part set.
	if singlePartForms:
		for multiPartForm in multiPartForms:
			lastPart = multiPartForm[max(multiPartForm.rfind(u"="), multiPartForm.rfind(u"|"), multiPartForm.rfind(u"-")) + 1:]
			if lastPart not in singlePartForms:
				sys.stderr.write(u"ERROR: suspicious alternative spelling: %s\n" % multiPartForm)
				sys.exit(1)