Ejemplo n.º 1
0
def check_attr_type(val, typ, msg):
    """! @brief Check that attribute value is of specified type.
    @param val The attribute value to check.
    @param typ The allowed Python type(s): simple, or Python set or list.
    @param msg The message to display if value is not of correct type.
    """
    # Python set or list of allowed types
    if isinstance(typ, (set, list)):
        # Exact type membership: subclasses are deliberately not accepted
        if type(val) not in typ:
            print(Warning(msg))
    # Simple allowed type: exact type match, consistent with the case above
    elif type(val) is not typ:
        print(Warning(msg))
Ejemplo n.º 2
0
def format_part_of_speech(lexical_entry,
                          font,
                          mapping=partOfSpeech_tex,
                          language=None):
    """! @brief Display part of speech in LaTeX format.
    @param lexical_entry The current Lexical Entry LMF instance.
    @param font A Python dictionary giving the vernacular, national, regional fonts to apply to a text in LaTeX format.
    @param mapping A Python dictionary giving the mapping between LMF part of speech LexicalEntry attribute value and LaTeX layout.
    @param language Language to consider to display part of speech.
    @return A string representing part of speech in LaTeX format.
    """
    result = ""
    if lexical_entry.get_partOfSpeech() is not None:
        try:
            if language is None:
                result += "\\textit{" + mapping[
                    lexical_entry.get_partOfSpeech()] + "}. "
            else:
                # Language-specific layout is keyed by (language, part of speech)
                result += "\\textit{" + mapping[
                    (language, lexical_entry.get_partOfSpeech())] + "}. "
        except KeyError:
            # print() call (not py2 print statement) for Python 3 compatibility,
            # consistent with the other warnings in this file
            print(Warning(
                "Part of speech value '%s' encountered for lexeme '%s' is not defined in configuration"
                % (lexical_entry.get_partOfSpeech().encode(ENCODING),
                   lexical_entry.get_lexeme().encode(ENCODING))))
    return result
Ejemplo n.º 3
0
def check_lx(lexical_entry, lx_tmp):
    """! @brief Check that the generated lexeme matches the lexical entry's lexeme.
    If not, display a Warning message.
    @param lexical_entry The current Lexical Entry LMF instance.
    @param lx_tmp The generated lexeme to compare with.
    """
    if lexical_entry.get_lexeme() != lx_tmp:
        print(Warning(
            "Lexeme '%s' generated for lexical entry '%s' is not consistent." %
            (lx_tmp.encode(ENCODING),
             lexical_entry.get_lexeme().encode(ENCODING))))
Ejemplo n.º 4
0
def check_duration_format(duration):
    """! @brief Verify that duration format is composed as follows: PTxxHxxMxxS (ISO 8601: 'P' for Period).
    If not, display a Warning message.
    @param duration Duration to check.
    """
    import re
    # Raw string avoids invalid-escape-sequence warnings on '\d'
    if not re.match(r"^PT[0-2]\dH[0-5]\dM[0-5]\dS$", duration):
        print(Warning(
            "Duration must be formatted as follows: PTxxHxxMxxS (given duration is %s)"
            % duration.encode(ENCODING)))
Ejemplo n.º 5
0
def check_time_format(time):
    """! @brief Verify that time format is composed as follows: THH:MM:SS,MSMS (ISO 8601: 'T' for Time).
    If not, display a Warning message.
    @param time Time to check.
    """
    import re
    # Raw string avoids invalid-escape-sequence warnings; milliseconds are optional
    if not re.match(r"^T[0-2]\d:[0-5]\d:[0-5]\d(\,\d+|)$", time):
        print(Warning(
            "Time must be formatted as follows: THH:MM:SS,MSMS (given time is %s)"
            % time.encode(ENCODING)))
Ejemplo n.º 6
0
def check_date_format(date):
    """! @brief Verify that date format is composed as follows: YYYY-MM-DD (ISO 8601).
    If not, display a Warning message.
    @param date Date to check.
    """
    import re
    # Raw string avoids invalid-escape-sequence warnings on '\d'
    if not re.match(r"^\d{4}-[01]\d-[0-3]\d$", date):
        print(Warning(
            "Date must be formatted as follows: YYYY-MM-DD (given date is %s)"
            % date.encode(ENCODING)))
Ejemplo n.º 7
0
def check_se(lexical_entry, se_tmp):
    """! @brief Check that the generated subentry belongs to the lexical entry's related forms.
    If not, display a Warning message.
    @param lexical_entry The current Lexical Entry LMF instance.
    @param se_tmp The generated subentry to compare with.
    """
    ok = False
    for form in lexical_entry.find_related_forms(mdf_semanticRelation["se"]):
        if form == se_tmp:
            ok = True
            # One match is enough
            break
    if not ok:
        print(Warning(
            "Subentry '%s' generated for lexical entry '%s' is not consistent."
            % (se_tmp.encode(ENCODING),
               lexical_entry.get_lexeme().encode(ENCODING))))
Ejemplo n.º 8
0
def check_nep(lexical_entry, nep):
    """! @brief Check that the given citation form matches one of the lexical entry's devanagari citation forms.
    If not, display a Warning message.
    @param lexical_entry The current Lexical Entry LMF instance.
    @param nep The generated citation form to compare with.
    """
    ok = False
    for form in lexical_entry.get_citation_forms(script_name="devanagari"):
        if form == nep:
            ok = True
            # One match is enough
            break
    if not ok:
        print(Warning(
            "Citation form '%s' of lexical entry '%s' is not consistent with generated one."
            % (nep.encode(ENCODING),
               lexical_entry.get_lexeme().encode(ENCODING))))
Ejemplo n.º 9
0
def check_attr_range(value, range, msg, mapping=None):
    """! @brief Check that attribute value is in specified range.
    @param value The attribute value to check.
    @param range A Python set giving the range of allowed values.
    @param msg The message to display if value is out-of-range.
    @param mapping A Python dictionary giving mapping between values (i.e. from MDF to LMF)
    @return The value to set, or None if out-of-range.
    """
    # Value already allowed: nothing to convert
    if value in range:
        return value
    # Out-of-range and no mapping available: warn the user
    if mapping is None:
        print(Warning(msg))
        return None
    # Out-of-range: try to convert the value through the mapping
    try:
        converted = mapping[value]
    except KeyError:
        # No conversion available either
        print(Warning(msg))
        return None
    return converted
Ejemplo n.º 10
0
def compare_sd(x, y):
    """Compare 2 semantic domains between each other.
    Ranks are looked up in the module-level 'sd_order' table.
    @param x The first semantic domain.
    @param y The second semantic domain.
    @return 0 if equal, -1 if x sorts before y, 1 if x sorts after y.
    """
    try:
        # Both equal => do nothing
        if sd_order[x] == sd_order[y]:
            return 0
        # If the 1st one is lower than the 2nd one, its rank is decremented
        if sd_order[x] < sd_order[y]:
            return -1
        # If the 1st one is greater than the 2nd one, its rank is incremented
        elif sd_order[x] > sd_order[y]:
            return 1
    except KeyError:
        # Unknown semantic domain: warn (print() call for Python 3
        # compatibility) and arbitrarily rank x first
        print(Warning("Cannot compare " + x.encode(ENCODING) + " and " +
                      y.encode(ENCODING)))
        return -1
Ejemplo n.º 11
0
 def compare(x, y):
     """Compare 2 elements between each other.
     'sort_order' is either a callable returning a rank for a character, or a
     dictionary mapping characters to ranks (dispatch below on its type).
     @return 0 if equal, -1 if x sorts before y, 1 if x sorts after y.
     """
     # Before comparing, remove acute accents from strings if any
     x = x.replace(u"\u0301", '').replace(u"\u0302", '')
     y = y.replace(u"\u0301", '').replace(u"\u0302", '')
     for i in range(min(len(x), len(y))):
         try:
             if type(sort_order) is not type(dict()):
                 if sort_order(x[i]) == sort_order(y[i]):
                     continue
                 # If the 1st one is lower than the 2nd one, its rank is decremented
                 if sort_order(x[i]) < sort_order(y[i]):
                     return -1
                 # If the 1st one is greater than the 2nd one, its rank is incremented
                 elif sort_order(x[i]) > sort_order(y[i]):
                     return 1
             else:
                 if sort_order[x[i]] == sort_order[y[i]]:
                     continue
                 # If the 1st one is lower than the 2nd one, its rank is decremented
                 if sort_order[x[i]] < sort_order[y[i]]:
                     return -1
                 # If the 1st one is greater than the 2nd one, its rank is incremented
                 elif sort_order[x[i]] > sort_order[y[i]]:
                     return 1
         # Handle other characters: fall back to native character comparison
         except KeyError:
             if options.verbose:
                 # print() call for Python 3 compatibility
                 print(Warning("Cannot compare " +
                               x[i].encode(ENCODING) + " and " +
                               y[i].encode(ENCODING)))
             if x[i] == y[i]:
                 continue
             if x[i] < y[i]:
                 return -1
             elif x[i] > y[i]:
                 return 1
     # If both strings do not have the same length, they do not equal => the smallest string is the shortest one
     if len(x) < len(y):
         return -1
     elif len(x) > len(y):
         return 1
     # If all characters match, both equal => do nothing
     return 0
Ejemplo n.º 12
0
 def compare(x, y):
     """Compare 2 elements between each other.
     When both entries carry the same item and 'condition' holds, order them
     by homonym number; warn if neither entry has one.
     @return -1/1 to order by homonym number, 0 otherwise.
     """
     if items(x) == items(y) and condition(x):
         # Classify similar entries by homonym number (missing number => 0)
         nb_x = x.get_homonymNumber()
         if nb_x is None:
             nb_x = 0
         nb_y = y.get_homonymNumber()
         if nb_y is None:
             nb_y = 0
         # If the 1st one is lower than the 2nd one, its rank is decremented
         if nb_x < nb_y:
             return -1
         # If the 1st one is greater than the 2nd one, its rank is incremented
         elif nb_x > nb_y:
             return 1
         else:
             # print() call for Python 3 compatibility
             print(Warning(
                 "Several lexical entries '%s' exist. Please solve this issue by specifying the homonym number."
                 % items(x).encode(ENCODING)))
     # Do nothing
     return 0
 def setUp(self):
     """Create the Warning fixture shared by the test cases of this class."""
     # Instantiate a Warning object
     self.warning = Warning("This is a warning.")
Ejemplo n.º 14
0
 def compare_lx(x, y):
     """Compare 2 lexemes by decomposing them into syllables (initial, rime, tone).
     Ranks come from the module-level 'sort_order' table; 'initials', 'rimes'
     and 'tones' are module-level character classes. At most 5 syllables are
     compared.
     @return 0 if equal, -1 if x sorts before y, 1 if x sorts after y.
     """
     unknown = set(["xxxx", "???", ""])
     cmp_x = x
     cmp_y = y
     # A syllable is: up to 3 initials ('j' and 'w' excluded), 1-2 rimes,
     # optional '#', up to 2 tones, optional separators, then the rest
     pattern = "^([" + initials.replace('j', '').replace(
         'w', ''
     ) + "]{0,3})([" + rimes + "]{1,2})#?([" + tones + "]{0,2})[$#]?[123]?(.*)$"
     n = 5
     while (n > 0):
         initial_x = ""
         rime_x = ""
         tone_x = ""
         initial_y = ""
         rime_y = ""
         tone_y = ""
         char_x = []
         char_y = []
         found = re.match(pattern, cmp_x)
         if found is None:
             # Not a full syllable: a single leftover character may still be
             # an isolated initial or rime
             if len(cmp_x) == 1:
                 if cmp_x in initials:
                     initial_x = cmp_x
                     rime_x = ""
                 elif cmp_x in rimes:
                     initial_x = ""
                     rime_x = cmp_x
                 tone_x = ""
                 cmp_x = ""
             else:
                 if cmp_x not in unknown:
                     # print() call for Python 3 compatibility
                     print(Warning("Cannot sort " + cmp_x.encode(ENCODING)))
                 return 1
         else:
             initial_x = found.group(1)
             rime_x = found.group(2)
             tone_x = found.group(3)
             cmp_x = found.group(4)
             # Before comparing, handle combining tilde of 'ɻ̃' if any
             if rime_x == u"\u0303":
                 initial_x += rime_x
                 rime_x = ""
         found = re.match(pattern, cmp_y)
         if found is None:
             # Same single-character fallback as for x
             if len(cmp_y) == 1:
                 if cmp_y in initials:
                     initial_y = cmp_y
                     rime_y = ""
                 elif cmp_y in rimes:
                     initial_y = ""
                     rime_y = cmp_y
                 tone_y = ""
                 cmp_y = ""
             else:
                 if cmp_y not in unknown:
                     # print() call for Python 3 compatibility
                     print(Warning("Cannot sort " + cmp_y.encode(ENCODING)))
                 return -1
         else:
             initial_y = found.group(1)
             rime_y = found.group(2)
             tone_y = found.group(3)
             cmp_y = found.group(4)
             # Before comparing, handle combining tilde of 'ɻ̃' if any
             if rime_y == u"\u0303":
                 initial_y += rime_y
                 rime_y = ""
         if len(initial_x) != 0:
             char_x.append(initial_x)
         if len(rime_x) != 0:
             char_x.append(rime_x)
         if len(initial_y) != 0:
             char_y.append(initial_y)
         if len(rime_y) != 0:
             char_y.append(rime_y)
         try:
             # An empty decomposition sorts first
             try:
                 char_x[0]
             except IndexError:
                 return -1
             try:
                 char_y[0]
             except IndexError:
                 return 1
             # If the 1st one is lower than the 2nd one, its rank is decremented
             if sort_order[char_x[0]] < sort_order[char_y[0]]:
                 return -1
             # If the 1st one is greater than the 2nd one, its rank is incremented
             elif sort_order[char_x[0]] > sort_order[char_y[0]]:
                 return 1
             else:  # sort_order[char_x[0]] == sort_order[char_y[0]]
                 single = False
                 try:
                     char_x[1]
                 except IndexError:
                     single = True
                 try:
                     char_y[1]
                 except IndexError:
                     single = True
                 if not single:
                     # If the 1st one is lower than the 2nd one, its rank is decremented
                     if sort_order[char_x[1]] < sort_order[char_y[1]]:
                         return -1
                     # If the 1st one is greater than the 2nd one, its rank is incremented
                     elif sort_order[char_x[1]] > sort_order[char_y[1]]:
                         return 1
                 # sort_order[char_x[1]] == sort_order[char_y[1]]
                 # If the 1st one is lower than the 2nd one, its rank is decremented
                 if sort_order[tone_x] < sort_order[tone_y]:
                     return -1
                 # If the 1st one is greater than the 2nd one, its rank is incremented
                 elif sort_order[tone_x] > sort_order[tone_y]:
                     return 1
                 else:  # sort_order[tone_x] == sort_order[tone_y]
                     if cmp_x == "":
                         return -1
                     if cmp_y == "":
                         return 1
                     n -= 1
                     if n == 0:
                         # If all characters match, both equal => do nothing
                         return 0
         except KeyError:
             # Unknown character in the rank table: warn (print() call for
             # Python 3 compatibility) and treat the lexemes as equal
             print(Warning("Cannot compare " + x.encode(ENCODING) +
                           " and " + y.encode(ENCODING)))
             return 0
Ejemplo n.º 15
0
def tex_write(object,
              filename,
              preamble=None,
              introduction=None,
              lmf2tex=lmf_to_tex,
              font=None,
              items=lambda lexical_entry: lexical_entry.get_lexeme(),
              sort_order=None,
              paradigms=[],
              tables=[],
              title=None,
              tex_language=None,
              tex_other_languages=[]):
    """! @brief Write a LaTeX file.
    Note that the lexicon must already be ordered at this point. Here, parameters 'items' and 'sort_order' are only used to define chapters.
    @param object The LMF instance to convert into LaTeX output format.
    @param filename The name of the LaTeX file to write with full path, for instance 'user/output.tex'.
    @param preamble The name of the LaTeX file with full path containing the LaTeX header of the document, for instance 'user/config/japhug.tex'. Default value is None.
    @param introduction The name of the LaTeX file with full path containing the LaTeX introduction of the document, for instance 'user/config/introduction.tex'. Default value is None.
    @param lmf2tex A function giving the mapping from LMF representation information that must be written to LaTeX commands, in a defined order. Default value is 'lmf_to_tex' function defined in 'pylmflib/config/tex.py'. Please refer to it as an example.
    @param font A Python dictionary giving the vernacular, national, regional fonts to apply to a text in LaTeX format.
    @param items Lambda function giving the item to sort. Default value is 'lambda lexical_entry: lexical_entry.get_lexeme()', which means that the items to sort are lexemes.
    @param sort_order Default value is 'None', which means that the LaTeX output is alphabetically ordered.
    @param paradigms A Python list of LaTeX filenames with full path containing the paradigms in LaTeX format. Default value is an empty list.
    @param tables A Python list of LaTeX filenames with full path containing tables to append at the end of the LaTeX document. Default value is an empty list.
    @param title A Python string containing the title of the LaTeX document. Default value is None.
    @param tex_language A Python string giving the default language to set in LaTeX.
    @param tex_other_languages A Python list of additional language names to declare in XeLaTeX. Default value is an empty list.
    """
    import string, os
    # Define font
    if font is None:
        font = pylmflib.config.xml.font
    tex_file = open_write(filename)
    # Add file header if any
    tex_file.write(file_read(preamble))
    # Continue the header if needed
    if title is not None:
        tex_file.write("\\title{" + title + "}" + EOL)
    if tex_language is not None:
        tex_file.write("\\setdefaultlanguage{" + tex_language + "}" + EOL)
    if tex_other_languages is not None:  # Added by Benjamin to properly handle several languages in the XeLaTeX file.
        tex_file.write("\\setotherlanguages{" + ", ".join(tex_other_languages) +
                       "}" + EOL)
    # Insert LaTeX commands to create a document
    tex_file.write(EOL + "\\begin{document}" + EOL)
    tex_file.write("\\maketitle" + EOL)
    tex_file.write("\\newpage" + EOL)
    # Add introduction if any
    if introduction is not None:
        tex_file.write("\\markboth{INTRODUCTION}{}" + EOL * 2)
    tex_file.write(file_read(introduction))
    # Add command for small caps
    tex_file.write(EOL + "\\def\\mytextsc{\\bgroup\\obeyspaces\\mytextscaux}" +
                   EOL)
    tex_file.write(
        "\\def\\mytextscaux#1{\\mytextscauxii #1\\relax\\relax\\egroup}" + EOL)
    tex_file.write("\\def\\mytextscauxii#1{%" + EOL)
    tex_file.write(
        "\\ifx\\relax#1\\else \\ifcat#1\\@sptoken{} \\expandafter\\expandafter\\expandafter\\mytextscauxii\\else"
        + EOL)
    tex_file.write(
        "\\ifnum`#1=\\uccode`#1 {\\normalsize #1}\\else {\\footnotesize \\uppercase{#1}}\\fi \\expandafter\\expandafter\\expandafter\\mytextscauxii\\expandafter\\fi\\fi}"
        + EOL * 2)
    # Configure space indent
    tex_file.write("\\setlength\\parindent{0cm}" + EOL)
    # Insert data path configuration
    # Unix-style paths
    audio_path = pylmflib.config.xml.audio_path
    graphic_path = os.path.abspath('.')
    if os.name != 'posix':
        # Windows-style paths
        audio_path = audio_path.replace("\\", "/")
        graphic_path = graphic_path.replace("\\", "/")
    tex_file.write(EOL + "\\addmediapath{" + audio_path.rstrip("/") + "}" +
                   EOL)
    tex_file.write("\\addmediapath{" + audio_path + "mp3}" + EOL)
    tex_file.write("\\addmediapath{" + audio_path + "wav}" + EOL)
    tex_file.write("\\graphicspath{{" + graphic_path +
                   "/pylmflib/output/img/}}" + EOL * 2)
    # Configure 2 columns
    tex_file.write("\\newpage" + EOL)
    tex_file.write("\\begin{multicols}{2}" + EOL * 2)
    if sort_order is None:
        # Lowercase and uppercase letters must have the same rank.
        # 'string.ascii_lowercase'/'ascii_uppercase' exist on both Python 2
        # and Python 3, unlike the removed 'string.lowercase'/'uppercase'.
        sort_order = dict([(c, ord(c)) for c in string.ascii_lowercase])
        up = dict([(c, ord(c) + 32) for c in string.ascii_uppercase])
        sort_order.update(up)
        sort_order.update({'': 0, ' ': 0})
    # For each element to write, get the corresponding LMF value
    if object.__class__.__name__ == "LexicalResource":
        for lexicon in object.get_lexicons():
            previous_character = ''
            current_character = ''
            # Lexicon is already ordered
            for lexical_entry in lexicon.get_lexical_entries():
                # Consider only main entries (subentries and components will be written as parts of the main entry)
                if lexical_entry.find_related_forms(
                        "main entry"
                ) == [] and lexical_entry.get_independentWord() is not False:
                    # Check if current element is a lexeme starting with a different character than previous lexeme
                    try:
                        current_character = items(lexical_entry)[0]
                        if sort_order[items(lexical_entry)[0:1]]:
                            current_character = items(lexical_entry)[0:1]
                        if sort_order[items(lexical_entry)[0:2]]:
                            current_character = items(lexical_entry)[0:2]
                    except IndexError:
                        pass
                    except KeyError:
                        pass
                    except TypeError:
                        pass
                    try:
                        if ( (type(sort_order) is not type(dict())) and ((previous_character == '') or (sort_order(current_character) != sort_order(previous_character))) ) \
                            or ( (type(sort_order) is type(dict())) and (int(sort_order[current_character]) != int(sort_order[previous_character])) ):
                            # Do not consider special characters
                            previous_character = current_character
                            tex_file.write("\\newpage" + EOL)
                            title = ''
                            if type(sort_order) is not type(dict()):
                                title += ' ' + font[NATIONAL](
                                    current_character)
                            else:
                                for key, value in sorted(sort_order.items(),
                                                         key=lambda x: x[1]):
                                    if int(value) == int(
                                            sort_order[current_character]):
                                        title += ' ' + font[VERNACULAR](key)
                            tex_file.write("\\section*{\\centering-" +
                                           handle_reserved(title) + " -}" +
                                           EOL)
                            #tex_file.write("\\pdfbookmark[1]{" + title + " }{" + title + " }" + EOL)
                        tex_file.write(lmf2tex(lexical_entry, font))
                        if len(paradigms) != 0:
                            tex_file.write(insert_references(lexical_entry))
                        tex_file.write("\\lhead{\\firstmark}" + EOL)
                        tex_file.write("\\rhead{\\botmark}" + EOL)
                        # Separate lexical entries from each others with a blank line
                        tex_file.write(EOL)
                        # Handle subentries
                        for related_form in lexical_entry.get_related_forms(
                                "subentry"):
                            if related_form.get_lexical_entry() is not None:
                                tex_file.write(
                                    lmf2tex(related_form.get_lexical_entry(),
                                            font))
                                if len(paradigms) != 0:
                                    tex_file.write(
                                        insert_references(
                                            related_form.get_lexical_entry()))
                                # Separate sub-entries from each others with a blank line
                                tex_file.write(EOL)
                    except KeyError:
                        # print() call for Python 3 compatibility
                        print(Warning("Cannot sort item %s" %
                                      items(lexical_entry).encode(ENCODING)))
                    except IndexError:
                        # Item is an empty string
                        pass
    else:
        raise OutputError(object,
                          "Object to write must be a Lexical Resource.")
    # Insert LaTeX commands to finish the document properly
    tex_file.write("\\end{multicols}" + EOL)
    # Insert paradigms if any
    for filename in paradigms:
        tex_file.write(EOL)
        tex_file.write("\\newpage" + EOL)
        tex_file.write("\\markboth{paradigms}{}" + EOL)
        tex_file.write(file_read(filename))
        tex_file.write(EOL)
    # Insert other tables if any
    for filename in tables:
        tex_file.write(EOL)
        tex_file.write("\\newpage" + EOL)
        tex_file.write(file_read(filename))
        tex_file.write(EOL)
    tex_file.write("\\end{document}" + EOL)
    tex_file.close()
Ejemplo n.º 16
0
def format_audio(lexical_entry, font):
    """! @brief Embed sound file into PDF.
    @param lexical_entry The current Lexical Entry LMF instance.
    @param font A Python dictionary giving the vernacular, national, regional fonts to apply to a text in LaTeX format.
    @return A string embedding sound in LaTeX format.
    """
    import os
    from os.path import basename, isfile
    # To access options
    from pylmflib import options
    global options
    result = ""
    # Audio embedding disabled => nothing to produce
    if not options.audio:
        return result
    for form_representation in lexical_entry.get_form_representations():
        if form_representation.get_audio() is not None:
            # Embed local sound file
            # \includemedia[<options>]{<poster text>}{<main Flash (SWF) file or URL  |  3D (PRC, U3D) file>}
            # To include audio file in PDF, replace WAV extension by MP3 extension and search in audio, MP3 and WAV folders
            file_name = form_representation.get_audio().get_fileName().replace(
                ".wav", ".mp3")
            file_path = []
            if os.name == 'posix':
                # Unix-style paths
                file_path.append(pylmflib.config.xml.audio_path + file_name)
                file_path.append(pylmflib.config.xml.audio_path + "mp3/" +
                                 file_name)
                file_path.append(pylmflib.config.xml.audio_path + "wav/" +
                                 file_name)
            else:
                # Windows-style paths
                audio_path = pylmflib.config.xml.audio_path.replace("/", "\\")
                file_path.append(audio_path + file_name)
                file_path.append(audio_path + "mp3\\" + file_name)
                file_path.append(audio_path + "wav\\" + file_name)
            # Keep the first candidate path that actually exists
            exist = False
            for audio_file in file_path:
                if isfile(audio_file):
                    exist = True
                    break
            if not exist:
                # print() call for Python 3 compatibility
                print(Warning(
                    "Sound file '%s' encountered for lexeme '%s' does not exist"
                    % (file_name.encode(ENCODING),
                       lexical_entry.get_lexeme().encode(ENCODING))))
                return result
            # Escape hyphens so LaTeX accepts them inside the media file name
            file_name = file_name.replace('-', '\string-')
            result += "\includemedia[" + EOL +\
                "\taddresource=" + file_name + "," + EOL +\
                "\tflashvars={" + EOL +\
                    "\t\tsource=" + file_name + EOL +\
                    "\t\t&autoPlay=true" + EOL +\
                    "\t\t&autoRewind=true" + EOL +\
                    "\t\t&loop=false" + EOL +\
                    "\t\t&hideBar=true" + EOL +\
                    "\t\t&volume=1.0" + EOL +\
                    "\t\t&balance=0.0" + EOL +\
                "}]{\includegraphics[scale=0.5]{sound.jpg}}{APlayer.swf}"
            # \mediabutton[<options>]{<normal button text or graphic>}
            result += " \\hspace{0.1cm}" + EOL
    return result
Ejemplo n.º 17
0
def mdf_read(filename=None,
             mdf2lmf=mdf_lmf,
             lexicon=None,
             id=None,
             encoding=ENCODING):
    """! @brief Read an MDF file and convert it into an LMF Lexicon.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'. If None, the filename recorded in the lexicon's entry source is used instead.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read. If None, a new Lexicon is created.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, user has to precise the native encoding of its document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    # Determine the file to read: an explicit filename wins and is recorded
    # on the lexicon; otherwise fall back to the lexicon's entry source
    if filename is None:
        filename = lexicon.get_entrySource()
    else:
        # Set lexicon attribute
        lexicon.set_entrySource(filename)
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    # MDF syntax is the following: '\marker value', optionally with
    # attributes: '\marker <attr="val" ...> value'
    mdf_pattern = """^\\\(\w*) (<(.*)>)? ?(.*)$"""
    # Parser state: the entry currently being filled ('current_entry'),
    # plus bookkeeping for subentries ('se' marker), multiword-expression
    # components ('\lf Component...' lines) and the main entry that a
    # subentry points back to
    current_entry = None
    sub_entry = None
    component = None
    main_entry = None
    for line in mdf_file.readlines():
        # Do not parse empty lines
        if line != EOL:
            result = re.match(mdf_pattern, line)
            if result is None:
                # Line does not match the '\marker value' shape => skip it
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Do not consider markers starting with an underscore character (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if marker[0] == '_':
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current entry is a multiword expression:
            # a '\lf Component<n> = <lexeme>' line introduces component <n>
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                if lf[0].startswith("Component"):
                    # NOTE(review): lstrip("Component") strips any run of
                    # those characters, not the literal prefix — fine for
                    # 'Component3', but verify for unusual component labels
                    component_nb = lf[0].lstrip("Component")
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier from the lexeme value
                uid = uni2sampa(value)
                if marker == "se":
                    # Create a subentry
                    sub_entry = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry: remember it on the first subentry,
                    # restore it for every following subentry of the same lexeme
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Link the subentry back to its main entry, using the
                    # homonym number (if any) to disambiguate the lexeme
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    sub_entry.create_and_add_related_form(
                        current_entry.get_lexeme() + homonym_nb, "main entry")
                elif is_mwe:
                    # Create a component entry for the multiword expression
                    component = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, component)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(component)
                    # The component belongs to the subentry when one is open
                    if sub_entry is not None:
                        current_entry = sub_entry
                    # Register the component on the owning entry and link it
                    # back as part of the complex predicate
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    current_entry.create_and_add_component(component_nb, value)
                    component.create_and_add_related_form(
                        current_entry.get_lexeme() + homonym_nb,
                        "complex predicate")
                    component.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes: parse 'key=value' pairs
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        attributes.update(
                            {attr.split('=')[0]: attr.split('=')[1]})
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                # After mapping, restore the entry stack: a pending subentry
                # becomes current again; a pending component takes over while
                # its owner is remembered as the subentry
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
                if component is not None:
                    sub_entry = current_entry
                    current_entry = component
                    component = None
            except KeyError:
                # Marker has no mapping in 'mdf2lmf'.
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding:
                print Warning(
                    "MDF marker '%s' encountered for lexeme '%s' is not defined in configuration"
                    % (marker.encode(ENCODING),
                       current_entry.get_lexeme().encode(ENCODING)))
            except Error as exception:
                # Project-defined error: delegate to its own handler
                exception.handle()
    mdf_file.close()
    return lexicon
Ejemplo n.º 18
0
 def check_cross_references(self):
     """! @brief Check all cross-references in the lexicon.
     Fill the private attribute '__lexicalEntry' of each RelatedForm instance for all lexical entries, and resolve Component targets to their LexicalEntry instances.
     @return Lexicon instance.
     """
     # NOTE(review): 'os' appears unused in this method — confirm before removing
     import os
     from string import digits
     if self.__checked:
         return self
     # Verify cross-references only once
     self.__checked = True
     for lexical_entry in self.get_lexical_entries():
         for related_form in lexical_entry.get_related_forms():
             # From RelatedForm targets attribute, retrieve the pointed LexicalEntry instance
             related_lexeme = related_form.get_lexeme()
             # Check if there is an homonym number at the end of the related lexeme
             # NOTE(review): only a single trailing digit is recognized as a
             # homonym number; multi-digit numbers are not handled here
             related_homonym_number = None
             if related_lexeme[-1] in digits:
                 related_homonym_number = related_lexeme[-1]
                 related_lexeme = related_lexeme[:-1]
             found_entry = self.find_lexical_entries(
                 lambda lexical_entry: lexical_entry.get_lexeme(
                 ) == related_lexeme)
             # Remove duplicate subentries from check if any: when exactly
             # one of the two matches is a subentry, keep only the subentry
             if len(found_entry) == 2:
                 if found_entry[0].is_subentry(
                 ) and not found_entry[1].is_subentry():
                     # Keep only the first subentry
                     found_entry = found_entry[:1]
                 elif not found_entry[0].is_subentry(
                 ) and found_entry[1].is_subentry():
                     # Keep only the second subentry
                     found_entry = found_entry[1:]
             if len(found_entry) < 1:
                 # No lexical entry with this lexeme exists
                 print Warning(
                     "Lexical entry '%s' does not exist. Please solve this issue by checking the related form of lexical entry '%s'."
                     % (related_lexeme.encode(ENCODING),
                        lexical_entry.get_lexeme().encode(ENCODING)))
             elif len(found_entry) > 1:
                 # Several lexical entries with this lexeme exist => consider homonym number if any
                 related_homonym = []
                 if related_homonym_number is not None:
                     for related_entry in found_entry:
                         if related_entry.get_homonymNumber(
                         ) == related_homonym_number:
                             related_homonym.append(related_entry)
                 if len(related_homonym) != 1:
                     # Still ambiguous (or no homonym number given) => warn
                     print Warning(
                         "Several lexical entries '%s' exist. Please solve this issue by renaming lexical entries correctly or by specifying the homonym number."
                         % related_lexeme.encode(ENCODING))
                 else:
                     # Save the found lexical entry
                     related_form.set_lexical_entry(related_homonym[0])
             else:
                 # Save the found lexical entry
                 related_form.set_lexical_entry(found_entry[0])
         for component in lexical_entry.get_components():
             # From Component targets attribute, retrieve the pointed LexicalEntry instance
             found_entries = self.find_lexical_entries(
                 lambda lexical_entry: lexical_entry.get_lexeme(
                 ) == component.get_lexeme())
             # Keep the first match that is itself a component entry
             for found_entry in found_entries:
                 # Check that the found entry is a component
                 if found_entry.is_component():
                     # Save the found lexical entry
                     component.set_lexical_entry(found_entry)
                     break
     return self