def test_get_lexicons(self):
    # Create lexicons
    lexicon1 = Lexicon()
    lexicon2 = Lexicon()
    # Add lexicons to the lexical resource
    self.lexical_resource.lexicon = [lexicon1, lexicon2]
    # Test get lexicons
    self.assertListEqual(self.lexical_resource.get_lexicons(),
                         [lexicon1, lexicon2])
    # Release Lexicon instances
    del self.lexical_resource.lexicon[:]
    del lexicon1, lexicon2
def test_get_lexicon(self):
    # Create lexicons
    lexicon1 = Lexicon("lexicon1")
    lexicon2 = Lexicon("lexicon2")
    # Add lexicons to the lexical resource
    self.lexical_resource.lexicon = [lexicon1, lexicon2]
    # Test get lexicon
    self.assertIsNone(
        self.lexical_resource.get_lexicon("unknown identifier"))
    self.assertEqual(self.lexical_resource.get_lexicon("lexicon2"),
                     lexicon2)
    # Release Lexicon instances
    del lexicon1, lexicon2
def test_remove_lexicon(self):
    # Create lexicons
    lexicon1 = Lexicon()
    lexicon2 = Lexicon()
    # Add lexicons to the lexical resource
    self.lexical_resource.lexicon = [lexicon1, lexicon2]
    # Test remove lexicons
    self.assertEqual(self.lexical_resource.remove_lexicon(lexicon1),
                     self.lexical_resource)
    self.assertListEqual(self.lexical_resource.lexicon, [lexicon2])
    self.assertEqual(self.lexical_resource.remove_lexicon(lexicon2),
                     self.lexical_resource)
    self.assertListEqual(self.lexical_resource.lexicon, [])
    # Release Lexicon instances
    del lexicon1, lexicon2
def test_add_lexicon(self):
    # Create lexicons
    lexicon1 = Lexicon()
    lexicon2 = Lexicon()
    # Test add lexicons to the lexical resource
    self.assertEqual(self.lexical_resource.add_lexicon(lexicon1),
                     self.lexical_resource)
    self.assertListEqual(self.lexical_resource.lexicon, [lexicon1])
    self.assertEqual(self.lexical_resource.add_lexicon(lexicon2),
                     self.lexical_resource)
    self.assertListEqual(self.lexical_resource.lexicon,
                         [lexicon1, lexicon2])
    # Release Lexicon instances
    del self.lexical_resource.lexicon[:]
    del lexicon1, lexicon2
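# A minimal sketch of the LexicalResource lexicon API exercised by the four
# tests above; the method names come straight from those tests, while the
# "fra" identifier is a hypothetical example.
resource = LexicalResource()
resource.add_lexicon(Lexicon("fra"))            # chainable: returns the resource
assert resource.get_lexicon("fra") is not None  # lookup by identifier
assert resource.get_lexicon("unknown") is None  # unknown id -> None
resource.remove_lexicon(resource.get_lexicon("fra"))
assert resource.get_lexicons() == []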
def get_system(name, args, schema=None, timed=False, model_path=None):
    if name in ('rulebased', 'neural'):
        lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words,
                          lexicon_path=args.lexicon)
        if args.inverse_lexicon:
            realizer = InverseLexicon.from_file(args.inverse_lexicon)
        else:
            realizer = DefaultInverseLexicon()
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'neural':
        assert args.model_path
        return NeuralSystem(schema, lexicon, args.model_path, args.fact_check,
                            args.decoding, realizer=realizer)
    elif name == 'cmd':
        return CmdSystem()
    else:
        raise ValueError('Unknown system %s' % name)
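# Hedged usage sketch for the get_system variant above: `args` is assumed to
# come from an argparse parser exposing the attributes read in the function
# body (learned_lex, stop_words, lexicon, inverse_lexicon, templates, policy,
# model_path, fact_check, decoding); the schema path is a placeholder.
schema = Schema('data/schema.json')  # hypothetical path
bot = get_system('rulebased', args, schema=schema, timed=False)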
def get_data_generator(args, model_args, schema, test=False):
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.dataset import read_dataset
    from cocoa.core.util import read_json
    from core.scenario import Scenario
    from core.lexicon import Lexicon
    from preprocess import DataGenerator, Preprocessor
    import os.path

    # TODO: move this to dataset
    dataset = read_dataset(args, Scenario)
    mappings_path = model_args.mappings

    lexicon = Lexicon(schema.values['item'])
    preprocessor = Preprocessor(schema, lexicon, model_args.entity_encoding_form,
                                model_args.entity_decoding_form,
                                model_args.entity_target_form,
                                model=model_args.model)

    if test:
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        train, dev, test = dataset.train_examples, dataset.test_examples, None
    data_generator = DataGenerator(train, dev, test, preprocessor, args, schema,
                                   mappings_path, cache=args.cache,
                                   ignore_cache=args.ignore_cache,
                                   num_context=model_args.num_context,
                                   batch_size=args.batch_size,
                                   model=model_args.model)
    return data_generator
def get_system(name, args, schema=None, timed=False):
    lexicon = Lexicon(schema.values['owner'])
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'cmd':
        return CmdSystem()
    # elif name == 'neural':
    #     return NeuralSystem(args.model_file, args.temperature, timed_session=timed, gpu=args.gpu)
    else:
        raise ValueError('Unknown system %s' % name)
def get_system(name, args, schema=None, timed=False, model_path=None):
    lexicon = Lexicon(schema.values['item'])
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'hybrid':
        assert model_path
        templates = Templates.from_pickle(args.templates)
        manager = PytorchNeuralSystem(args, schema, lexicon, model_path, timed)
        generator = Generator(templates)
        return HybridSystem(lexicon, generator, manager, timed)
    elif name == 'cmd':
        return CmdSystem()
    elif name == 'fb-neural':
        assert model_path
        return FBNeuralSystem(model_path, args.temperature, timed_session=timed, gpu=False)
    elif name == 'pt-neural':
        assert model_path
        return PytorchNeuralSystem(args, schema, lexicon, model_path, timed)
    else:
        raise ValueError('Unknown system %s' % name)
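# The three get_system variants above differ mainly in how the Lexicon is
# built (schema plus CLI flags vs. schema.values[...]) and in which backends
# they can wire up. A hedged dispatch sketch for this last variant:
# 'checkpoint.pt' and the schema path are hypothetical, and `args` must carry
# the templates/policy/temperature attributes read above.
schema = Schema('data/craigslist-schema.json')  # hypothetical path
system = get_system('pt-neural', args, schema=schema, model_path='checkpoint.pt')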
def test_odt_write(self):
    import sys, os
    # Create LMF objects
    lexical_entry = LexicalEntry()
    lexical_entry.lemma = Lemma()
    lexical_entry.partOfSpeech = "toto"
    lexical_entry.status = "draft"
    lexical_entry.lemma.lexeme = "hello"
    lexicon = Lexicon()
    lexicon.add_lexical_entry(lexical_entry)
    lexical_resource = LexicalResource()
    lexical_resource.add_lexicon(lexicon)
    # Write document file and test result
    utest_path = sys.path[0] + '/'
    odt_filename = utest_path + "output.odt"
    odt_write(lexical_resource, odt_filename)
    odt_file = open(odt_filename, "r")
    odt_file.readlines()
    odt_file.close()
    # Customize mapping
    def lmf2odt(lexicon, document, items, sort_order, paradigms, reverse):
        return "test"
    # Write document file and test result
    odt_write(lexical_resource, odt_filename, None, lmf2odt)
    odt_file = open(odt_filename, "r")
    odt_file.readlines()
    odt_file.close()
    del lexical_entry.lemma
    lexical_entry.lemma = None
    del lexical_entry, lexicon
    lexicon = None
    del lexical_resource
    # Remove document file
    os.remove(odt_filename)
def test_mdf_write(self):
    import sys, os
    # Create LMF objects
    lexical_entry = LexicalEntry()
    lexical_entry.lemma = Lemma()
    lexical_entry.partOfSpeech = "toto"
    lexical_entry.status = "draft"
    lexical_entry.lemma.lexeme = "hello"
    lexicon = Lexicon()
    lexicon.add_lexical_entry(lexical_entry)
    # Write MDF file and test result
    utest_path = sys.path[0] + '/'
    mdf_filename = utest_path + "output.txt"
    mdf_write(lexicon, mdf_filename)
    mdf_file = open(mdf_filename, "r")
    expected_lines = ["\\lx hello" + EOL, "\\ps toto" + EOL,
                      "\\st draft" + EOL, EOL]
    self.assertListEqual(expected_lines, mdf_file.readlines())
    mdf_file.close()
    # Customize mapping
    lmf2mdf = dict({
        "lx": lambda lexical_entry: lexical_entry.get_status(),
        "ps": lambda lexical_entry: lexical_entry.get_partOfSpeech(),
        "st": lambda lexical_entry: lexical_entry.get_lexeme()
    })
    order = ["st", "lx", "ps"]
    # Write MDF file and test result
    mdf_write(lexicon, mdf_filename, lmf2mdf, order)
    mdf_file = open(mdf_filename, "r")
    expected_lines = ["\\st hello" + EOL, "\\lx draft" + EOL,
                      "\\ps toto" + EOL, EOL]
    self.assertListEqual(expected_lines, mdf_file.readlines())
    mdf_file.close()
    del lexical_entry.lemma
    lexical_entry.lemma = None
    del lexical_entry, lexicon
    # Remove MDF file
    os.remove(mdf_filename)
parser = argparse.ArgumentParser()
parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
parser.add_argument('--max-examples', default=-1, type=int)
parser.add_argument('--templates', help='Path to load templates')
parser.add_argument('--policy', help='Path to load model')
parser.add_argument('--schema-path', help='Path to schema')
parser.add_argument('--agent', help='Only consider examples with the given type of agent')
add_lexicon_arguments(parser)
args = parser.parse_args()

schema = Schema(args.schema_path)
lexicon = Lexicon(schema, False, stop_words=args.stop_words,
                  lexicon_path=args.lexicon)
#templates = Templates.from_pickle(args.templates)
templates = Templates()
manager = Manager.from_pickle(args.policy)
analyzer = Analyzer(lexicon)

examples = read_examples(args.transcripts, args.max_examples, Scenario)
agent = args.agent
if agent is not None:
    examples = [e for e in examples if agent in e.agents.values()]
analyzer.example_stats(examples, agent=agent)
#import sys; sys.exit()

parsed_dialogues = []
for example in examples:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--policy', help='Path to load model')
    parser.add_argument('--schema-path', help='Path to schema')
    parser.add_argument('--agent', help='Only consider examples with the given type of agent')
    args = parser.parse_args()

    schema = Schema(args.schema_path)
    lexicon = Lexicon(schema.values['item'])
    #templates = Templates.from_pickle(args.templates)
    templates = Templates()
    manager = Manager.from_pickle(args.policy)
    analyzer = Analyzer(lexicon)

    # TODO: skip examples
    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    agent = args.agent
    if agent is not None:
        examples = [e for e in examples if agent in e.agents.values()]
    analyzer.example_stats(examples, agent=agent)
    #import sys; sys.exit()

    parsed_dialogues = []
    for example in examples:
def test_tex_write(self):
    import sys, os
    # Create LMF objects
    lexical_entry = LexicalEntry()
    lexical_entry.lemma = Lemma()
    lexical_entry.partOfSpeech = "toto"
    lexical_entry.status = "draft"
    lexical_entry.lemma.lexeme = "hello"
    lexicon = Lexicon()
    lexicon.add_lexical_entry(lexical_entry)
    lexical_resource = LexicalResource()
    lexical_resource.add_lexicon(lexicon)
    # Write LaTeX file and test result
    utest_path = sys.path[0] + '/'
    tex_filename = utest_path + "output.tex"
    tex_write(lexical_resource, tex_filename)
    tex_file = open(tex_filename, "r")
    begin_lines = [
        EOL,
        "\\begin{document}" + EOL,
        "\\maketitle" + EOL,
        "\\newpage" + EOL,
        EOL,
        "\\def\\mytextsc{\\bgroup\\obeyspaces\\mytextscaux}" + EOL,
        "\\def\\mytextscaux#1{\\mytextscauxii #1\\relax\\relax\\egroup}" + EOL,
        "\\def\\mytextscauxii#1{%" + EOL,
        "\\ifx\\relax#1\\else \\ifcat#1\\@sptoken{} \\expandafter\\expandafter\\expandafter\\mytextscauxii\\else" + EOL,
        "\\ifnum`#1=\\uccode`#1 {\\normalsize #1}\\else {\\footnotesize \\uppercase{#1}}\\fi \\expandafter\\expandafter\\expandafter\\mytextscauxii\\expandafter\\fi\\fi}" + EOL,
        EOL,
        "\\setlength\\parindent{0cm}" + EOL,
        EOL,
        "\\addmediapath{.}" + EOL,
        "\\addmediapath{./mp3}" + EOL,
        "\\addmediapath{./wav}" + EOL,
        "\\graphicspath{{" + os.path.abspath('.') + "/pylmflib/output/img/}}" + EOL,
        EOL,
        "\\newpage" + EOL,
        "\\begin{multicols}{2}" + EOL,
        EOL
    ]
    end_lines = [
        "\end{multicols}" + EOL,
        "\end{document}" + EOL
    ]
    expected_lines = [
        "\\newpage" + EOL,
        "\\section*{\\centering- \\textbf{\ipa{H}} \\textbf{\ipa{h}} -}" + EOL,
        #"\\pdfbookmark[1]{\ipa{ H h }}{ H h }" + EOL,
        "\\paragraph{\\hspace{-0.5cm} \\textbf{\ipa{hello}}} \\hypertarget{01}{}" + EOL,
        "\markboth{\\textbf{\\ipa{hello}}}{}" + EOL,
        "\\textit{Status:} draft" + EOL,
        "\lhead{\\firstmark}" + EOL,
        "\\rhead{\\botmark}" + EOL,
        EOL
    ]
    self.assertListEqual(begin_lines + expected_lines + end_lines,
                         tex_file.readlines())
    tex_file.close()
    # Customize mapping
    my_lmf_tex = dict({
        "Lemma.lexeme": lambda lexical_entry:
            "is " + lexical_entry.get_lexeme() + "." + EOL,
        "LexicalEntry.id": lambda lexical_entry:
            "The lexical entry " + str(lexical_entry.get_id()) + " ",
        "LexicalEntry.partOfSpeech": lambda lexical_entry:
            "Its grammatical category is " + lexical_entry.get_partOfSpeech() + "." + EOL,
        "LexicalEntry.status": lambda lexical_entry:
            "Warning: " + lexical_entry.get_status() + " version!" + EOL
    })
    my_order = ["LexicalEntry.id", "Lemma.lexeme",
                "LexicalEntry.partOfSpeech", "LexicalEntry.status"]
    def lmf2tex(entry, font):
        result = ""
        for attribute in my_order:
            result += my_lmf_tex[attribute](entry)
        return result
    # Write LaTeX file and test result
    tex_write(lexical_resource, tex_filename, None, None, lmf2tex, font)
    tex_file = open(tex_filename, "r")
    expected_lines = [
        "\\newpage" + EOL,
        "\\section*{\\centering- \\textbf{\ipa{H}} \\textbf{\ipa{h}} -}" + EOL,
        #"\\pdfbookmark[1]{\ipa{ H h }}{ H h }" + EOL,
        "The lexical entry 01 is hello." + EOL,
        "Its grammatical category is toto." + EOL,
        "Warning: draft version!" + EOL,
        "\lhead{\\firstmark}" + EOL,
        "\\rhead{\\botmark}" + EOL,
        EOL
    ]
    self.assertListEqual(begin_lines + expected_lines + end_lines,
                         tex_file.readlines())
    tex_file.close()
    del lexical_entry.lemma
    lexical_entry.lemma = None
    del lexical_entry, lexicon
    lexicon = None
    del lexical_resource
    # Remove LaTeX file
    os.remove(tex_filename)
def mdf_read(filename=None, mdf2lmf=mdf_lmf, lexicon=None, id=None, encoding=ENCODING):
    """! @brief Read an MDF file.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, user has to precise the native encoding of its document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    if filename is None:
        filename = lexicon.get_entrySource()
    else:
        # Set lexicon attribute
        lexicon.set_entrySource(filename)
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    # MDF syntax is the following: '\marker value'
    mdf_pattern = """^\\\(\w*) (<(.*)>)? ?(.*)$"""
    # Add each lexical entry to the lexicon
    current_entry = None
    sub_entry = None
    component = None
    main_entry = None
    for line in mdf_file.readlines():
        # Do not parse empty lines
        if line != EOL:
            result = re.match(mdf_pattern, line)
            if result is None:
                # Line is empty => continue parsing next line
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Do not consider markers starting with an underscore character (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if marker[0] == '_':
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current entry is a multiword expression
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                if lf[0].startswith("Component"):
                    component_nb = lf[0].lstrip("Component")
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier
                uid = uni2sampa(value)
                if marker == "se":
                    # Create a subentry
                    sub_entry = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Set main entry
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    sub_entry.create_and_add_related_form(
                        current_entry.get_lexeme() + homonym_nb, "main entry")
                elif is_mwe:
                    # Create a subentry
                    component = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, component)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(component)
                    # Manage current entry
                    if sub_entry is not None:
                        current_entry = sub_entry
                    # Set component
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    current_entry.create_and_add_component(component_nb, value)
                    component.create_and_add_related_form(
                        current_entry.get_lexeme() + homonym_nb, "complex predicate")
                    component.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        attributes.update({attr.split('=')[0]: attr.split('=')[1]})
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
                if component is not None:
                    sub_entry = current_entry
                    current_entry = component
                    component = None
            except KeyError:
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding
                print Warning("MDF marker '%s' encountered for lexeme '%s' is not defined in configuration"
                              % (marker.encode(ENCODING), current_entry.get_lexeme().encode(ENCODING)))
            except Error as exception:
                exception.handle()
    mdf_file.close()
    return lexicon
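# Hedged usage sketch for mdf_read: parse an MDF file into a Lexicon with the
# default marker mapping, then walk the entries. The input path mirrors the
# docstring example; get_lexical_entries() is assumed from the Lexicon API
# used elsewhere in this codebase, not shown in this section.
lexicon = mdf_read("user/input.txt")
for entry in lexicon.get_lexical_entries():
    print entry.get_lexeme()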
parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
parser.add_argument('--max-examples', default=-1, type=int)
parser.add_argument('--templates', help='Path to load templates')
parser.add_argument('--templates-output', help='Path to save templates')
parser.add_argument('--model', help='Path to load model')
parser.add_argument('--model-output', help='Path to save the dialogue manager model')
args = parser.parse_args()

examples = read_examples(args.transcripts, args.max_examples, Scenario)
parsed_dialogues = []
templates = Templates()
lexicon = Lexicon(['ball', 'hat', 'book'])
for example in examples:
    utterances = parse_example(example, lexicon, templates)
    parsed_dialogues.append(utterances)

templates.finalize()
templates.save(args.templates_output)
templates.dump(n=10)

# Train n-gram model
sequences = []
for d in parsed_dialogues:
    sequences.append([u.lf.intent for u in d])
manager = Manager.from_train(sequences)
manager.save(args.model_output)
def setUp(self):
    # Instantiate a Lexicon object
    self.lexicon = Lexicon()
def config_read(filename):
    """! @brief Read an XML file giving the user configuration.
    @param filename The name of the XML file to read with full path, for instance 'pylmflib/pylmflib/config/default/config.xml'.
    @return A Lexical Resource.
    """
    import os
    import config.xml
    configuration = parse_xml(filename)
    # Parse XML elements
    for format in configuration:
        if format.tag == "Language":
            # XML element "Language" has several XML subelements "lang"
            for lang in format:
                # XML elements "lang" have 2 XML attributes: one for the nature of the language ("att"), a second for the language code ("val")
                exec("config.xml." + lang.attrib["att"] + " = '" + lang.attrib["val"] + "'")
        elif format.tag == "Font":
            config.xml.font = dict()
            # XML element "Font" has several XML subelements "font"
            for font in format:
                # XML elements "font" have 2 XML attributes: one for the nature of the language ("att"), a second for the variable name ("var")
                exec("l = lambda " + font.attrib['var'] + ": " + font.text)
                config.xml.font.update({font.attrib['att']: l})
        elif format.tag == "LMF":
            # Create lexical resource and set DTD version
            lexical_resource = LexicalResource(format[0].attrib["dtdVersion"])
            for object in format[0]:
                if object.tag == "GlobalInformation":
                    # Set global information
                    for feat in object:
                        if feat.attrib["att"] == "languageCode":
                            lexical_resource.set_language_code(feat.attrib["val"])
                        elif feat.attrib["att"] == "author":
                            lexical_resource.set_author(feat.attrib["val"])
                        elif feat.attrib["att"] == "version":
                            lexical_resource.set_version(feat.attrib["val"])
                        elif feat.attrib["att"] == "lastUpdate":
                            lexical_resource.set_last_update(feat.attrib["val"])
                        elif feat.attrib["att"] == "license":
                            lexical_resource.set_license(feat.attrib["val"])
                        elif feat.attrib["att"] == "characterEncoding":
                            lexical_resource.set_character_encoding(feat.attrib["val"])
                        elif feat.attrib["att"] == "dateCoding":
                            lexical_resource.set_date_coding(feat.attrib["val"])
                        elif feat.attrib["att"] == "creationDate":
                            lexical_resource.set_creation_date(feat.attrib["val"])
                        elif feat.attrib["att"] == "projectName":
                            lexical_resource.set_project_name(feat.attrib["val"])
                        elif feat.attrib["att"] == "description":
                            lexical_resource.set_description(feat.attrib["val"])
                elif object.tag == "Lexicon":
                    # Create lexicon and set identifier
                    lexicon = Lexicon(object.attrib["id"])
                    # Set lexicon attributes
                    for feat in object:
                        if feat.attrib["att"] == "language":
                            lexicon.set_language(feat.attrib["val"])
                        elif feat.attrib["att"] == "languageScript":
                            lexicon.set_languageScript(feat.attrib["val"])
                        elif feat.attrib["att"] == "label":
                            lexicon.set_label(feat.attrib["val"])
                        elif feat.attrib["att"] == "lexiconType":
                            lexicon.set_lexiconType(feat.attrib["val"])
                        elif feat.attrib["att"] == "entrySource":
                            lexicon.set_entrySource(feat.attrib["val"])
                        elif feat.attrib["att"] == "localPath":
                            lexicon.set_localPath(feat.attrib["val"])
                            # Set absolute path to audio files
                            config.xml.audio_path = os.path.abspath(os.path.abspath('.') + "/" + feat.attrib["val"]) + "/"
                    # Attach lexicon to the lexical resource
                    lexical_resource.add_lexicon(lexicon)
        elif format.tag == "MDF":
            for mdf in format:
                if mdf.tag == "mdf_lmf":
                    # XML elements "mdf_lmf" have 2 XML attributes: one for the name of the marker ("marker"), a second for the variable name ("var")
                    exec("l = lambda " + mdf.attrib['var'] + ": " + mdf.text)
                    mdf_lmf.update({mdf.attrib['marker']: l})
                elif mdf.tag == "ps_partOfSpeech":
                    # XML elements "ps_partOfSpeech" have 2 XML attributes: one for the MDF value ("ps"), a second for the LMF value ("partOfSpeech")
                    ps_partOfSpeech.update({mdf.attrib['ps']: mdf.attrib['partOfSpeech']})
                    # Also automatically update the range of possible values allowed for the LMF part-of-speech LexicalEntry attribute
                    partOfSpeech_range.add(mdf.attrib['partOfSpeech'])
                    # And automatically update the reverse operation
                    partOfSpeech_tex.update({mdf.attrib['partOfSpeech']: mdf.attrib['ps']})
                elif mdf.tag == "pdl_paradigmLabel":
                    # XML elements "pdl_paradigmLabel" have 2 XML attributes: one for the MDF value ("pdl"), a second for the LMF value ("paradigmLabel")
                    pdl_paradigmLabel.update({mdf.attrib['pdl']: mdf.attrib['paradigmLabel']})
                    # Also automatically update the range of possible values allowed for the LMF paradigm label Paradigm attribute
                    paradigmLabel_range.add(mdf.attrib['paradigmLabel'])
                    # And automatically update the reverse operation
                    paradigmLabel_tex.update({mdf.attrib['paradigmLabel']: mdf.attrib['pdl']})
                elif mdf.tag == "lmf_mdf":
                    # XML elements "lmf_mdf" have 2 XML attributes: one for the name of the marker ("marker"), a second for the variable name ("var")
                    exec("l = lambda " + mdf.attrib['var'] + ": " + mdf.text)
                    lmf_mdf.update({mdf.attrib['marker']: l})
                elif mdf.tag == "mdf_order":
                    mdf_order = []
                    for element in mdf:
                        mdf_order.append(element.tag)
                        list1 = []
                        for level1 in element:
                            list1.append(level1.tag)
                            list2 = []
                            for level2 in level1:
                                list2.append(level2.tag)
                            if len(list2) != 0:
                                list1.append(list2)
                        if len(list1) != 0:
                            mdf_order.append(list1)
        elif format.tag == "LaTeX":
            for param in format:
                if param.tag == "partOfSpeech_tex":
                    # XML elements "partOfSpeech_tex" have 2 or 3 XML attributes: one for the LMF value ("partOfSpeech"), a second for the LaTeX value ("tex"), and an optional one to define the language
                    try:
                        partOfSpeech_tex.update({(param.attrib['lang'], param.attrib['partOfSpeech']): param.attrib['tex']})
                    except KeyError:
                        partOfSpeech_tex.update({param.attrib['partOfSpeech']: param.attrib['tex']})
                    # Also automatically update the range of possible values allowed for the LMF part-of-speech LexicalEntry attribute
                    partOfSpeech_range.add(param.attrib['partOfSpeech'])
                elif param.tag == "paradigmLabel_tex":
                    # XML elements "paradigmLabel_tex" have 2 XML attributes: one for the LMF value ("paradigmLabel"), a second for the LaTeX value ("tex")
                    paradigmLabel_tex.update({param.attrib['paradigmLabel']: param.attrib['tex']})
                    # Also automatically update the range of possible values allowed for the LMF paradigm label Paradigm attribute
                    paradigmLabel_range.add(param.attrib['paradigmLabel'])
        else:
            raise InputError(module_name + ".py", "XML file '%s' is not well-formatted." % filename)
    return lexical_resource
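# Hedged usage sketch for config_read: load the default configuration (the
# path comes from the docstring above) and list the declared lexicons.
# get_lexicons() matches the LexicalResource API tested earlier in this
# section; get_id() is assumed by analogy with LexicalEntry.get_id().
lexical_resource = config_read("pylmflib/pylmflib/config/default/config.xml")
for lexicon in lexical_resource.get_lexicons():
    print lexicon.get_id()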
    return parsed_utterances


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--templates-output', help='Path to save templates')
    parser.add_argument('--model', help='Path to load model')
    parser.add_argument('--model-output', help='Path to save the dialogue manager model')
    parser.add_argument('--schema-path', help='Path to schema')
    add_lexicon_arguments(parser)
    args = parser.parse_args()

    schema = Schema(args.schema_path)
    lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words,
                      lexicon_path=args.lexicon)
    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    parsed_dialogues = []
    templates = Templates()

    for idx, example in enumerate(examples):
        utterances = parse_example(example, lexicon, templates)
        parsed_dialogues.append(utterances)
    #sample_intents(parsed_dialogues, "unknown", 30)
    #intent_breakdown(parsed_dialogues)

    templates.finalize()
    templates.save(args.templates_output)
    templates.dump(n=10)