def mdf_read(filename=None, mdf2lmf=mdf_lmf, lexicon=None, id=None, encoding=ENCODING):
    """! @brief Read an MDF file.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, user has to precise the native encoding of its document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    if filename is None:
        filename = lexicon.get_entrySource()
    else:
        # Set lexicon attribute
        lexicon.set_entrySource(filename)
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    # MDF syntax is the following: '\marker value'
    # (raw string so the backslash escape is explicit and unambiguous)
    mdf_pattern = r"""^\\(\w*) (<(.*)>)? ?(.*)$"""
    # Parser state: current_entry is the entry being filled; sub_entry /
    # component / main_entry track MDF subentries ('\se') and multiword
    # expression components ('\lf Component...').
    current_entry = None
    sub_entry = None
    component = None
    main_entry = None
    for line in mdf_file.readlines():
        # Do not parse empty lines
        if line != EOL:
            result = re.match(mdf_pattern, line)
            if result is None:
                # Line does not match MDF syntax => continue parsing next line
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Do not consider markers starting with an underscore character
            # (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if marker[0] == '_':
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current entry is a multiword expression
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                if lf[0].startswith("Component"):
                    # BUGFIX: was lstrip("Component"), which strips a character
                    # *set* (C,o,m,p,n,e,t) and could eat leading characters of
                    # the component number; slice off the literal prefix instead.
                    component_nb = lf[0][len("Component"):]
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier
                uid = uni2sampa(value)
                if marker == "se":
                    # Create a subentry; an MDF subentry corresponds to an LMF lexical entry
                    sub_entry = LexicalEntry(uid)
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Set main entry
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "main entry")
                elif is_mwe:
                    # Create a component; an MDF subentry corresponds to an LMF lexical entry
                    component = LexicalEntry(uid)
                    mdf2lmf["lx"](value, component)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(component)
                    # Manage current entry
                    if sub_entry is not None:
                        current_entry = sub_entry
                    # Set component
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    current_entry.create_and_add_component(component_nb, value)
                    component.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "complex predicate")
                    component.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        attributes.update({attr.split('=')[0]: attr.split('=')[1]})
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
                if component is not None:
                    sub_entry = current_entry
                    current_entry = component
                    component = None
            except KeyError:
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding.
                # BUGFIX: print statement -> single-argument print() call, valid on
                # both Python 2 and Python 3.
                print(Warning("MDF marker '%s' encountered for lexeme '%s' is not defined in configuration" % (marker.encode(ENCODING), current_entry.get_lexeme().encode(ENCODING))))
            except Error as exception:
                exception.handle()
    mdf_file.close()
    return lexicon
class TestLexiconFunctions(unittest.TestCase):
    """Unit tests for Lexicon attribute accessors and lexical-entry management."""

    def setUp(self):
        # Instantiate a fresh Lexicon object for each test
        self.lexicon = Lexicon()

    def tearDown(self):
        # Release instantiated objects
        del self.lexicon

    def test_init(self):
        # A new Lexicon has every attribute unset and an empty entry list
        self.assertIsNone(self.lexicon.language)
        self.assertIsNone(self.lexicon.languageScript)
        self.assertIsNone(self.lexicon.id)
        self.assertIsNone(self.lexicon.label)
        self.assertIsNone(self.lexicon.lexiconType)
        self.assertIsNone(self.lexicon.entrySource)
        self.assertIsNone(self.lexicon.vowelHarmony)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        self.assertIsNone(self.lexicon.localPath)

    def test_set_id(self):
        id = "English lexicon"
        # Setters are fluent: they return the lexicon itself
        self.assertEqual(self.lexicon.set_id(id), self.lexicon)
        self.assertEqual(self.lexicon.id, id)

    def test_get_id(self):
        self.assertIs(self.lexicon.get_id(), self.lexicon.id)

    def test_set_language(self):
        language = "eng"
        self.assertEqual(self.lexicon.set_language(language), self.lexicon)
        self.assertEqual(self.lexicon.language, language)

    def test_get_language(self):
        self.assertIs(self.lexicon.get_language(), self.lexicon.language)

    def test_set_languageScript(self):
        script = "latn"
        self.assertEqual(self.lexicon.set_languageScript(script), self.lexicon)
        self.assertEqual(self.lexicon.languageScript, script)

    def test_get_languageScript(self):
        self.assertIs(self.lexicon.get_languageScript(), self.lexicon.languageScript)

    def test_set_label(self):
        label = "online dictionary"
        self.assertEqual(self.lexicon.set_label(label), self.lexicon)
        self.assertEqual(self.lexicon.label, label)

    def test_get_label(self):
        self.assertIs(self.lexicon.get_label(), self.lexicon.label)

    def test_set_lexiconType(self):
        type = "bilingual dictionary"
        self.assertEqual(self.lexicon.set_lexiconType(type), self.lexicon)
        self.assertEqual(self.lexicon.lexiconType, type)

    def test_get_lexiconType(self):
        self.assertIs(self.lexicon.get_lexiconType(), self.lexicon.lexiconType)

    def test_set_entrySource(self):
        source = "test.txt"
        self.assertEqual(self.lexicon.set_entrySource(source), self.lexicon)
        self.assertEqual(self.lexicon.entrySource, source)

    def test_get_entrySource(self):
        self.assertIs(self.lexicon.get_entrySource(), self.lexicon.entrySource)

    def test_set_vowelHarmony(self):
        # vowelHarmony is not implemented: the setter must raise
        test = False
        try:
            self.lexicon.set_vowelHarmony(None)
        except NotImplementedError:
            test = True
        self.assertTrue(test)

    def test_get_vowelHarmony(self):
        # vowelHarmony is not implemented: the getter must raise
        test = False
        try:
            self.lexicon.get_vowelHarmony()
        except NotImplementedError:
            test = True
        self.assertTrue(test)

    def test_set_localPath(self):
        path = "/full/local/path/to/audio/files/"
        self.assertEqual(self.lexicon.set_localPath(path), self.lexicon)
        self.assertEqual(self.lexicon.localPath, path)

    def test_get_localPath(self):
        self.assertIs(self.lexicon.get_localPath(), self.lexicon.localPath)

    def test_get_lexical_entries(self):
        # Create lexical entries and add them to the lexicon
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test get lexical entries
        self.assertListEqual(self.lexicon.get_lexical_entries(), [entry1, entry2])
        # Duplicates are allowed and preserved
        self.lexicon.lexical_entry.append(entry1)
        self.assertListEqual(self.lexicon.get_lexical_entries(), [entry1, entry2, entry1])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_add_lexical_entry(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Test add entries to the lexicon (fluent API)
        self.assertEqual(self.lexicon.add_lexical_entry(entry1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry1])
        self.assertEqual(self.lexicon.add_lexical_entry(entry2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry1, entry2])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_remove_lexical_entry(self):
        # Create lexical entries and add them to the lexicon
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test remove lexical entries
        self.assertEqual(self.lexicon.remove_lexical_entry(entry1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry2])
        self.assertEqual(self.lexicon.remove_lexical_entry(entry2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        # Release LexicalEntry instances
        del entry1, entry2

    def test_count_lexical_entries(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        self.lexicon.lexical_entry = [entry1]
        # Test count lexical entries
        self.assertEqual(self.lexicon.count_lexical_entries(), 1)
        self.lexicon.lexical_entry.append(entry2)
        self.assertEqual(self.lexicon.count_lexical_entries(), 2)
        # Duplicates count as separate entries
        self.lexicon.lexical_entry.append(entry1)
        self.assertEqual(self.lexicon.count_lexical_entries(), 3)
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_sort_homonym_numbers(self):
        # Create several lexical entries, some sharing a lexeme
        entry1 = LexicalEntry().set_lexeme("aa").set_homonymNumber("2")
        entry2 = LexicalEntry().set_lexeme("aa").set_homonymNumber("1")
        entry3 = LexicalEntry().set_lexeme("ab")
        entry4 = LexicalEntry().set_lexeme("ba")
        entry5 = LexicalEntry().set_lexeme("bb").set_homonymNumber("6")
        entry6 = LexicalEntry().set_lexeme("bb").set_homonymNumber("5")
        self.lexicon.lexical_entry = [entry1, entry2, entry3, entry4, entry5, entry6]
        # Entries sharing a lexeme are reordered by homonym number, in place
        self.assertListEqual(self.lexicon.sort_homonym_numbers(),
                             [entry2, entry1, entry3, entry4, entry6, entry5])
        self.assertListEqual(self.lexicon.lexical_entry,
                             [entry2, entry1, entry3, entry4, entry6, entry5])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5, entry6

    def test_sort_lexical_entries(self):
        # Create several lexical entries with different lexemes
        entry1 = LexicalEntry().set_lexeme("aa")
        entry2 = LexicalEntry().set_lexeme("ab")
        entry3 = LexicalEntry().set_lexeme("ba")
        entry4 = LexicalEntry().set_lexeme("bb")
        self.lexicon.lexical_entry = [entry4, entry1, entry2, entry3]
        # Test sort lexical entries (default lexicographic order, in place)
        self.assertListEqual(self.lexicon.sort_lexical_entries(),
                             [entry1, entry2, entry3, entry4])
        self.assertListEqual(self.lexicon.lexical_entry,
                             [entry1, entry2, entry3, entry4])
        # Provide a sort order mapping characters to ranks
        # BUGFIX: removed the dead 'my_unicode_order' construction — it was
        # never used and its str.decode() call crashes under Python 3.
        my_order = dict({'A': 1.1, 'a': 1.2, 'B': 2.1, 'b': 2.2})
        entry5 = LexicalEntry().set_lexeme("Aa")
        entry6 = LexicalEntry().set_lexeme("bB")
        self.lexicon.lexical_entry.append(entry5)
        self.lexicon.lexical_entry.append(entry6)
        self.assertListEqual(self.lexicon.sort_lexical_entries(sort_order=my_order),
                             [entry5, entry1, entry2, entry3, entry6, entry4])
        self.assertListEqual(self.lexicon.lexical_entry,
                             [entry5, entry1, entry2, entry3, entry6, entry4])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5, entry6

    def test_find_lexical_entries(self):
        # Create several lexical entries with different lexemes
        entry1 = LexicalEntry().set_lexeme("Hello")
        entry2 = LexicalEntry().set_lexeme("world!")
        entry3 = LexicalEntry().set_lexeme("hello")
        entry4 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry = [entry1, entry2, entry3, entry4]
        # Test find lexical entries with an exact-match filter
        self.assertListEqual(
            self.lexicon.find_lexical_entries(lambda entry: entry.get_lexeme() == "Hello"),
            [entry1])

        def test_filter(entry):
            return entry.get_lexeme().lower() == "hello"
        # List is randomly ordered => create a set to avoid random results
        self.assertEqual(set(self.lexicon.find_lexical_entries(test_filter)),
                         set([entry1, entry3]))
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4

    def test_check_cross_references(self):
        # Create lexical entries with lexemes and related lexemes
        entry1 = LexicalEntry().set_lexeme("Hello").create_and_add_related_form("world!", "main entry")
        entry2 = LexicalEntry().set_lexeme("world!").create_and_add_related_form("Hello", "subentry")
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test check cross references: related forms get resolved to entries
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry1.related_form[0].get_lexical_entry(), entry2)
        self.assertIs(entry2.related_form[0].get_lexical_entry(), entry1)
        # Test warning case: entry not found
        entry3 = LexicalEntry().set_lexeme("hello").create_and_add_related_form("world", "main entry")
        self.lexicon.lexical_entry.append(entry3)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Retrieve nominal case
        entry4 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(entry4)
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry3.related_form[0].get_lexical_entry(), entry4)
        # Test warning case: several entries found
        entry5 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(entry5)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Test check cross references with homonym number disambiguation
        entry3.related_form[0].set_lexical_entry(None)
        entry3.related_form[0].targets = "world2"
        entry4.homonymNumber = "1"
        entry5.homonymNumber = "2"
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry3.related_form[0].get_lexical_entry(), entry5)
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5

    def test_convert_to_latex(self):
        pass
def mdf_read(filename=None, mdf2lmf=mdf_lmf, lexicon=None, id=None, encoding=ENCODING):
    """! @brief Read an MDF file.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, user has to precise the native encoding of its document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    if filename is None:
        filename = lexicon.get_entrySource()
    else:
        # Set lexicon attribute
        lexicon.set_entrySource(filename)
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    # MDF syntax is the following: '\marker value'
    # (raw string so the backslash escape is explicit and unambiguous)
    mdf_pattern = r"""^\\(\w*) (<(.*)>)? ?(.*)$"""
    # Parser state for entries, subentries ('\se') and MWE components ('\lf')
    current_entry = None
    sub_entry = None
    component = None
    main_entry = None
    for line in mdf_file.readlines():
        # Do not parse empty lines
        if line != EOL:
            result = re.match(mdf_pattern, line)
            if result is None:
                # Line does not match MDF syntax => continue parsing next line
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Do not consider markers starting with an underscore character
            # (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if marker[0] == '_':
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current entry is a multiword expression
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                if lf[0].startswith("Component"):
                    # BUGFIX: was lstrip("Component"), which strips a character
                    # *set* and could eat leading characters of the component
                    # number; slice off the literal prefix instead.
                    component_nb = lf[0][len("Component"):]
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier
                uid = uni2sampa(value)
                if marker == "se":
                    # Create a subentry; an MDF subentry corresponds to an LMF lexical entry
                    sub_entry = LexicalEntry(uid)
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Set main entry
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "main entry")
                elif is_mwe:
                    # Create a component; an MDF subentry corresponds to an LMF lexical entry
                    component = LexicalEntry(uid)
                    mdf2lmf["lx"](value, component)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(component)
                    # Manage current entry
                    if sub_entry is not None:
                        current_entry = sub_entry
                    # Set component
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    current_entry.create_and_add_component(component_nb, value)
                    component.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "complex predicate")
                    component.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        attributes.update({attr.split('=')[0]: attr.split('=')[1]})
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
                if component is not None:
                    sub_entry = current_entry
                    current_entry = component
                    component = None
            except KeyError:
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding.
                # BUGFIX: print statement -> single-argument print() call, valid on
                # both Python 2 and Python 3.
                print(Warning("MDF marker '%s' encountered for lexeme '%s' is not defined in configuration" % (marker.encode(ENCODING), current_entry.get_lexeme().encode(ENCODING))))
            except Error as exception:
                exception.handle()
    mdf_file.close()
    return lexicon
def config_read(filename):
    """! @brief Read an XML file giving the user configuration.
    @param filename The name of the XML file to read with full path, for instance 'pylmflib/pylmflib/config/default/config.xml'.
    @return A Lexical Resource.
    @raise InputError If a top-level XML element has an unknown tag.
    """
    import os
    import config.xml
    configuration = parse_xml(filename)
    # Parse XML elements ('section' renamed from 'format' to avoid shadowing the builtin)
    for section in configuration:
        if section.tag == "Language":
            # XML element "Language" has several XML subelements "lang", each with
            # 2 XML attributes: the nature of the language ("att") and the
            # language code ("val").
            # BUGFIX: was exec() on a string-built assignment; setattr is
            # equivalent, safer, and works on Python 2 and 3.
            for lang in section:
                setattr(config.xml, lang.attrib["att"], lang.attrib["val"])
        elif section.tag == "Font":
            config.xml.font = dict()
            # XML element "Font" has several XML subelements "font", each with
            # 2 XML attributes: the nature of the language ("att") and the
            # lambda variable name ("var").
            for font in section:
                # BUGFIX: was 'exec("l = lambda ...")', which relies on Python 2
                # exec-binds-locals semantics and silently breaks on Python 3;
                # eval of a lambda expression is equivalent and portable.
                # NOTE(review): this still evaluates configuration text as code —
                # the configuration file must be trusted.
                l = eval("lambda " + font.attrib['var'] + ": " + font.text)
                config.xml.font.update({font.attrib['att']: l})
        elif section.tag == "LMF":
            # Create lexical resource and set DTD version
            lexical_resource = LexicalResource(section[0].attrib["dtdVersion"])
            for item in section[0]:
                if item.tag == "GlobalInformation":
                    # Set global information via a dispatch table; unknown
                    # attributes are ignored, as in the original if/elif chain.
                    setters = {
                        "languageCode": lexical_resource.set_language_code,
                        "author": lexical_resource.set_author,
                        "version": lexical_resource.set_version,
                        "lastUpdate": lexical_resource.set_last_update,
                        "license": lexical_resource.set_license,
                        "characterEncoding": lexical_resource.set_character_encoding,
                        "dateCoding": lexical_resource.set_date_coding,
                        "creationDate": lexical_resource.set_creation_date,
                        "projectName": lexical_resource.set_project_name,
                        "description": lexical_resource.set_description,
                    }
                    for feat in item:
                        setter = setters.get(feat.attrib["att"])
                        if setter is not None:
                            setter(feat.attrib["val"])
                elif item.tag == "Lexicon":
                    # Create lexicon and set identifier
                    lexicon = Lexicon(item.attrib["id"])
                    # Set lexicon attributes
                    for feat in item:
                        att = feat.attrib["att"]
                        val = feat.attrib["val"]
                        if att == "language":
                            lexicon.set_language(val)
                        elif att == "languageScript":
                            lexicon.set_languageScript(val)
                        elif att == "label":
                            lexicon.set_label(val)
                        elif att == "lexiconType":
                            lexicon.set_lexiconType(val)
                        elif att == "entrySource":
                            lexicon.set_entrySource(val)
                        elif att == "localPath":
                            lexicon.set_localPath(val)
                            # Set absolute path to audio files
                            config.xml.audio_path = os.path.abspath(os.path.abspath('.') + "/" + val) + "/"
                    # Attach lexicon to the lexical resource
                    lexical_resource.add_lexicon(lexicon)
        elif section.tag == "MDF":
            for mdf in section:
                if mdf.tag == "mdf_lmf":
                    # XML elements "mdf_lmf" have 2 XML attributes: the marker
                    # name ("marker") and the lambda variable name ("var").
                    l = eval("lambda " + mdf.attrib['var'] + ": " + mdf.text)
                    mdf_lmf.update({mdf.attrib['marker']: l})
                elif mdf.tag == "ps_partOfSpeech":
                    # XML elements "ps_partOfSpeech" have 2 XML attributes: the
                    # MDF value ("ps") and the LMF value ("partOfSpeech").
                    ps_partOfSpeech.update({mdf.attrib['ps']: mdf.attrib['partOfSpeech']})
                    # Also automatically update range of possible values allowed
                    # for LMF part of speech LexicalEntry attribute ...
                    partOfSpeech_range.add(mdf.attrib['partOfSpeech'])
                    # ... and automatically update the reverse operation
                    partOfSpeech_tex.update({mdf.attrib['partOfSpeech']: mdf.attrib['ps']})
                elif mdf.tag == "pdl_paradigmLabel":
                    # XML elements "pdl_paradigmLabel" have 2 XML attributes: the
                    # MDF value ("pdl") and the LMF value ("paradigmLabel").
                    pdl_paradigmLabel.update({mdf.attrib['pdl']: mdf.attrib['paradigmLabel']})
                    # Also automatically update range of possible values allowed
                    # for LMF paradigm label Paradigm attribute ...
                    paradigmLabel_range.add(mdf.attrib['paradigmLabel'])
                    # ... and automatically update the reverse operation
                    paradigmLabel_tex.update({mdf.attrib['paradigmLabel']: mdf.attrib['pdl']})
                elif mdf.tag == "lmf_mdf":
                    # XML elements "lmf_mdf" have 2 XML attributes: the marker
                    # name ("marker") and the lambda variable name ("var").
                    l = eval("lambda " + mdf.attrib['var'] + ": " + mdf.text)
                    lmf_mdf.update({mdf.attrib['marker']: l})
                elif mdf.tag == "mdf_order":
                    mdf_order = []
                    for element in mdf:
                        mdf_order.append(element.tag)
                        list1 = []
                        for level1 in element:
                            list1.append(level1.tag)
                            list2 = []
                            for level2 in level1:
                                list2.append(level2.tag)
                            if len(list2) != 0:
                                list1.append(list2)
                        if len(list1) != 0:
                            mdf_order.append(list1)
                    # NOTE(review): 'mdf_order' is a local variable and is
                    # discarded when this function returns — presumably it
                    # should update a module-level structure like mdf_lmf /
                    # lmf_mdf above; confirm intended behavior.
        elif section.tag == "LaTeX":
            for param in section:
                if param.tag == "partOfSpeech_tex":
                    # XML elements "partOfSpeech_tex" have 2 or 3 XML attributes:
                    # the LMF value ("partOfSpeech"), the LaTeX value ("tex"),
                    # and an optional language ("lang").
                    try:
                        partOfSpeech_tex.update({(param.attrib['lang'], param.attrib['partOfSpeech']): param.attrib['tex']})
                    except KeyError:
                        partOfSpeech_tex.update({param.attrib['partOfSpeech']: param.attrib['tex']})
                    # Also automatically update range of possible values allowed
                    # for LMF part of speech LexicalEntry attribute
                    partOfSpeech_range.add(param.attrib['partOfSpeech'])
                elif param.tag == "paradigmLabel_tex":
                    # XML elements "paradigmLabel_tex" have 2 XML attributes: the
                    # LMF value ("paradigmLabel") and the LaTeX value ("tex").
                    paradigmLabel_tex.update({param.attrib['paradigmLabel']: param.attrib['tex']})
                    # Also automatically update range of possible values allowed
                    # for LMF paradigm label Paradigm attribute
                    paradigmLabel_range.add(param.attrib['paradigmLabel'])
        else:
            raise InputError(module_name + ".py", "XML file '%s' is not well-formatted." % filename)
    return lexical_resource
class TestLexiconFunctions(unittest.TestCase):
    """Tests covering Lexicon accessors and lexical-entry handling."""

    def setUp(self):
        # A fresh Lexicon instance for every test
        self.lexicon = Lexicon()

    def tearDown(self):
        # Drop the instance created in setUp
        del self.lexicon

    def test_init(self):
        # Every attribute of a brand-new lexicon starts out unset
        lex = self.lexicon
        self.assertIsNone(lex.language)
        self.assertIsNone(lex.languageScript)
        self.assertIsNone(lex.id)
        self.assertIsNone(lex.label)
        self.assertIsNone(lex.lexiconType)
        self.assertIsNone(lex.entrySource)
        self.assertIsNone(lex.vowelHarmony)
        self.assertListEqual(lex.lexical_entry, [])
        self.assertIsNone(lex.localPath)

    def test_set_id(self):
        value = "English lexicon"
        returned = self.lexicon.set_id(value)
        # The setter is fluent and stores the value
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.id, value)

    def test_get_id(self):
        self.assertIs(self.lexicon.get_id(), self.lexicon.id)

    def test_set_language(self):
        value = "eng"
        returned = self.lexicon.set_language(value)
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.language, value)

    def test_get_language(self):
        self.assertIs(self.lexicon.get_language(), self.lexicon.language)

    def test_set_languageScript(self):
        value = "latn"
        returned = self.lexicon.set_languageScript(value)
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.languageScript, value)

    def test_get_languageScript(self):
        self.assertIs(self.lexicon.get_languageScript(), self.lexicon.languageScript)

    def test_set_label(self):
        value = "online dictionary"
        returned = self.lexicon.set_label(value)
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.label, value)

    def test_get_label(self):
        self.assertIs(self.lexicon.get_label(), self.lexicon.label)

    def test_set_lexiconType(self):
        value = "bilingual dictionary"
        returned = self.lexicon.set_lexiconType(value)
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.lexiconType, value)

    def test_get_lexiconType(self):
        self.assertIs(self.lexicon.get_lexiconType(), self.lexicon.lexiconType)

    def test_set_entrySource(self):
        value = "test.txt"
        returned = self.lexicon.set_entrySource(value)
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.entrySource, value)

    def test_get_entrySource(self):
        self.assertIs(self.lexicon.get_entrySource(), self.lexicon.entrySource)

    def test_set_vowelHarmony(self):
        # The vowelHarmony setter is not implemented and must raise
        with self.assertRaises(NotImplementedError):
            self.lexicon.set_vowelHarmony(None)

    def test_get_vowelHarmony(self):
        # The vowelHarmony getter is not implemented and must raise
        with self.assertRaises(NotImplementedError):
            self.lexicon.get_vowelHarmony()

    def test_set_localPath(self):
        value = "/full/local/path/to/audio/files/"
        returned = self.lexicon.set_localPath(value)
        self.assertEqual(returned, self.lexicon)
        self.assertEqual(self.lexicon.localPath, value)

    def test_get_localPath(self):
        self.assertIs(self.lexicon.get_localPath(), self.lexicon.localPath)

    def test_get_lexical_entries(self):
        e1, e2 = LexicalEntry(), LexicalEntry()
        self.lexicon.lexical_entry = [e1, e2]
        self.assertListEqual(self.lexicon.get_lexical_entries(), [e1, e2])
        # Duplicates are preserved
        self.lexicon.lexical_entry.append(e1)
        self.assertListEqual(self.lexicon.get_lexical_entries(), [e1, e2, e1])
        del self.lexicon.lexical_entry[:]
        del e1, e2

    def test_add_lexical_entry(self):
        e1, e2 = LexicalEntry(), LexicalEntry()
        # Adding is fluent and appends in order
        self.assertEqual(self.lexicon.add_lexical_entry(e1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [e1])
        self.assertEqual(self.lexicon.add_lexical_entry(e2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [e1, e2])
        del self.lexicon.lexical_entry[:]
        del e1, e2

    def test_remove_lexical_entry(self):
        e1, e2 = LexicalEntry(), LexicalEntry()
        self.lexicon.lexical_entry = [e1, e2]
        # Removing is fluent and deletes the given entry
        self.assertEqual(self.lexicon.remove_lexical_entry(e1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [e2])
        self.assertEqual(self.lexicon.remove_lexical_entry(e2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        del e1, e2

    def test_count_lexical_entries(self):
        e1, e2 = LexicalEntry(), LexicalEntry()
        self.lexicon.lexical_entry = [e1]
        self.assertEqual(self.lexicon.count_lexical_entries(), 1)
        self.lexicon.lexical_entry.append(e2)
        self.assertEqual(self.lexicon.count_lexical_entries(), 2)
        # A duplicate counts as another entry
        self.lexicon.lexical_entry.append(e1)
        self.assertEqual(self.lexicon.count_lexical_entries(), 3)
        del self.lexicon.lexical_entry[:]
        del e1, e2

    def test_sort_homonym_numbers(self):
        # Entries sharing a lexeme carry homonym numbers to order them
        e1 = LexicalEntry().set_lexeme("aa").set_homonymNumber("2")
        e2 = LexicalEntry().set_lexeme("aa").set_homonymNumber("1")
        e3 = LexicalEntry().set_lexeme("ab")
        e4 = LexicalEntry().set_lexeme("ba")
        e5 = LexicalEntry().set_lexeme("bb").set_homonymNumber("6")
        e6 = LexicalEntry().set_lexeme("bb").set_homonymNumber("5")
        self.lexicon.lexical_entry = [e1, e2, e3, e4, e5, e6]
        expected = [e2, e1, e3, e4, e6, e5]
        # Sorting is in place and also returns the ordered list
        self.assertListEqual(self.lexicon.sort_homonym_numbers(), expected)
        self.assertListEqual(self.lexicon.lexical_entry, expected)
        del self.lexicon.lexical_entry[:]
        del e1, e2, e3, e4, e5, e6

    def test_sort_lexical_entries(self):
        e1 = LexicalEntry().set_lexeme("aa")
        e2 = LexicalEntry().set_lexeme("ab")
        e3 = LexicalEntry().set_lexeme("ba")
        e4 = LexicalEntry().set_lexeme("bb")
        self.lexicon.lexical_entry = [e4, e1, e2, e3]
        # Default sort is lexicographic, in place
        self.assertListEqual(self.lexicon.sort_lexical_entries(), [e1, e2, e3, e4])
        self.assertListEqual(self.lexicon.lexical_entry, [e1, e2, e3, e4])
        # Provide a sort order mapping characters to ranks
        my_order = dict({'A': 1.1, 'a': 1.2, 'B': 2.1, 'b': 2.2})
        my_unicode_order = ({})
        for key in my_order.keys():
            my_unicode_order.update({key.decode(encoding='utf8'): my_order[key]})
        e5 = LexicalEntry().set_lexeme("Aa")
        e6 = LexicalEntry().set_lexeme("bB")
        self.lexicon.lexical_entry.append(e5)
        self.lexicon.lexical_entry.append(e6)
        ranked = [e5, e1, e2, e3, e6, e4]
        self.assertListEqual(self.lexicon.sort_lexical_entries(sort_order=my_order), ranked)
        self.assertListEqual(self.lexicon.lexical_entry, ranked)
        del self.lexicon.lexical_entry[:]
        del e1, e2, e3, e4, e5, e6

    def test_find_lexical_entries(self):
        e1 = LexicalEntry().set_lexeme("Hello")
        e2 = LexicalEntry().set_lexeme("world!")
        e3 = LexicalEntry().set_lexeme("hello")
        e4 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry = [e1, e2, e3, e4]
        # Exact-match filter finds a single entry
        self.assertListEqual(
            self.lexicon.find_lexical_entries(lambda entry: entry.get_lexeme() == "Hello"),
            [e1])

        def case_insensitive(entry):
            return entry.get_lexeme().lower() == "hello"
        # Result order is unspecified => compare as sets
        self.assertEqual(set(self.lexicon.find_lexical_entries(case_insensitive)),
                         set([e1, e3]))
        del self.lexicon.lexical_entry[:]
        del e1, e2, e3, e4

    def test_check_cross_references(self):
        # Entries whose related forms reference each other by lexeme
        e1 = LexicalEntry().set_lexeme("Hello").create_and_add_related_form("world!", "main entry")
        e2 = LexicalEntry().set_lexeme("world!").create_and_add_related_form("Hello", "subentry")
        self.lexicon.lexical_entry = [e1, e2]
        # Cross-reference resolution links the related forms to the entries
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(e1.related_form[0].get_lexical_entry(), e2)
        self.assertIs(e2.related_form[0].get_lexical_entry(), e1)
        # Warning case: target entry does not exist yet
        e3 = LexicalEntry().set_lexeme("hello").create_and_add_related_form("world", "main entry")
        self.lexicon.lexical_entry.append(e3)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Nominal case again once the target exists
        e4 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(e4)
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(e3.related_form[0].get_lexical_entry(), e4)
        # Warning case: several candidate entries share the lexeme
        e5 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(e5)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Homonym numbers disambiguate the target
        e3.related_form[0].set_lexical_entry(None)
        e3.related_form[0].targets = "world2"
        e4.homonymNumber = "1"
        e5.homonymNumber = "2"
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(e3.related_form[0].get_lexical_entry(), e5)
        del self.lexicon.lexical_entry[:]
        del e1, e2, e3, e4, e5

    def test_convert_to_latex(self):
        pass