def parse(options, filename, annotated_sentences, tmp_dir):
    """Write the tagged sentences as CoNLL and run the external parser on them.

    Three file names are derived from ``filename`` inside ``tmp_dir`` via
    ``output_filename``: the parser input (``tag.conll``), the parser output
    (``conll``) and a log file (``log``) that receives the parser's stdout
    and stderr.

    Args:
        options: Object providing ``malt`` (path to the parser jar; a leading
            ``~`` is expanded) and ``parsing_model`` (only its base name is
            passed to the parser via ``-c``).
        filename: Name of the data file the output file names are derived
            from.
        annotated_sentences: Tagged sentences, handed unchanged to
            ``tagged_to_tagged_conll``.
        tmp_dir: Directory for the generated files; also passed to the
            parser as ``-w``.

    Returns:
        The file name the parser was told to write its output to.

    Raises:
        SystemExit: If the parser process exits with a non-zero status.
    """
    tagged_conll_filename = output_filename(tmp_dir, filename, "tag.conll")
    parsed_filename = output_filename(tmp_dir, filename, "conll")
    log_filename = output_filename(tmp_dir, filename, "log")

    # The parser command line is dependent on the input and
    # output files, so we build that one for each data file
    parser_cmdline = [
        "java", "-Xmx2000m",
        "-jar", os.path.expanduser(options.malt),
        "-m", "parse",
        "-i", tagged_conll_filename,
        "-o", parsed_filename,
        "-w", tmp_dir,
        "-c", os.path.basename(options.parsing_model),
    ]

    # Conversion from .tag file to tagged.conll (input format for the parser).
    # The context manager guarantees the file is flushed and closed before
    # the parser starts, even if the conversion raises.
    with open(tagged_conll_filename, "w", encoding="utf-8") as tagged_conll_file:
        tagged_to_tagged_conll(annotated_sentences, tagged_conll_file)

    # Run the parser, sending both of its output streams to the log file
    with open(log_filename, "w", encoding="utf-8") as log_file:
        returncode = Popen(
            parser_cmdline, stdout=log_file, stderr=log_file
        ).wait()

    if returncode:
        sys.exit("Parsing failed! Log file may contain more information: %s" % log_filename)

    return parsed_filename
def test_verb(self):
    """Check the rows ``conll.tagged_to_tagged_conll`` writes for one sentence.

    Each input token is a four-item list: form, lemma, a combined
    ``UPOS|features`` field and a language-specific tag. The expected rows
    carry a running index followed by form, lemma, UPOS, the
    language-specific tag and the features as separate columns.
    """
    annotated_sentences = [[
        'Jag jag PRON|Case=Nom|Definite=Def|Gender=Com|Number=Sing PN|UTR|SIN|DEF|SUB'.split(" "),
        'har ha VERB|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act VB|PRS|AKT'.split(" "),
        'en en DET|Definite=Ind|Gender=Com|Number=Sing DT|UTR|SIN|IND'.split(" "),
        'dröm dröm NOUN|Case=Nom|Definite=Ind|Gender=Com|Number=Sing NN|UTR|SIN|IND|NOM'.split(" "),
        '. . PUNCT|_ MAD'.split(" "),
    ]]

    # Pin the encoding: the fixture contains non-ASCII text ("dröm"), so
    # relying on the locale default makes the test platform-dependent.
    with tempfile.TemporaryFile(mode="w+", encoding="utf-8") as outfile:
        conll.tagged_to_tagged_conll(annotated_sentences, outfile)
        outfile.seek(0)
        file_data = outfile.read()

    # NOTE(review): the column separators below are reproduced exactly as
    # they appear in the reviewed source (single spaces). Confirm they match
    # what tagged_to_tagged_conll emits -- CoNLL columns are usually
    # tab-separated.
    expected_lines = textwrap.dedent("""
        0 Jag jag PRON PN|UTR|SIN|DEF|SUB Case=Nom|Definite=Def|Gender=Com|Number=Sing
        1 har ha VERB VB|PRS|AKT Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act
        2 en en DET DT|UTR|SIN|IND Definite=Ind|Gender=Com|Number=Sing
        3 dröm dröm NOUN NN|UTR|SIN|IND|NOM Case=Nom|Definite=Ind|Gender=Com|Number=Sing
        4 . . PUNCT MAD _
    """).lstrip().splitlines()

    self.assertEqual(file_data.splitlines(), expected_lines)
def parse(options, filename, annotated_sentences, tmp_dir):
    """Convert tagged sentences to CoNLL input and parse them with the jar.

    The input, output and log file names are all built by
    ``output_filename`` from ``tmp_dir`` and ``filename`` with the suffixes
    ``tag.conll``, ``conll`` and ``log`` respectively.

    Args:
        options: Object providing ``malt`` (path to the parser jar; a leading
            ``~`` is expanded) and ``parsing_model`` (only its base name is
            passed to the parser via ``-c``).
        filename: Name of the data file the output file names are derived
            from.
        annotated_sentences: Tagged sentences, handed unchanged to
            ``tagged_to_tagged_conll``.
        tmp_dir: Directory for the generated files; also passed to the
            parser as ``-w``.

    Returns:
        The file name the parser was told to write its output to.

    Raises:
        SystemExit: If the parser process exits with a non-zero status.
    """
    tagged_conll_filename = output_filename(tmp_dir, filename, "tag.conll")
    parsed_filename = output_filename(tmp_dir, filename, "conll")
    log_filename = output_filename(tmp_dir, filename, "log")

    # The parser command line is dependent on the input and
    # output files, so we build that one for each data file
    parser_cmdline = [
        "java", "-Xmx2000m",
        "-jar", os.path.expanduser(options.malt),
        "-m", "parse",
        "-i", tagged_conll_filename,
        "-o", parsed_filename,
        "-w", tmp_dir,
        "-c", os.path.basename(options.parsing_model),
    ]

    # Conversion from .tag file to tagged.conll (input format for the parser).
    # Using ``with`` closes the file even when the conversion fails, so no
    # handle is leaked and the data is on disk before the parser runs.
    with open(tagged_conll_filename, "w", encoding="utf-8") as tagged_conll_file:
        tagged_to_tagged_conll(annotated_sentences, tagged_conll_file)

    # Run the parser; stdout and stderr both go to the log file
    with open(log_filename, "w", encoding="utf-8") as log_file:
        parser_process = Popen(parser_cmdline, stdout=log_file, stderr=log_file)
        returncode = parser_process.wait()

    if returncode:
        sys.exit("Parsing failed! See log file: %s" % log_filename)

    return parsed_filename
def test_verb(self):
    """Verify the CoNLL rows produced for a single five-token sentence.

    The input tokens are ``[form, lemma, "UPOS|features", tag]`` lists. The
    expected output has one row per token: an index, then form, lemma, UPOS,
    the language-specific tag and the features as separate columns.
    """
    annotated_sentences = [[
        ('Jag jag PRON|Case=Nom|Definite=Def|Gender=Com|Number=Sing PN|UTR|SIN|DEF|SUB').split(" "),
        ('har ha VERB|Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act VB|PRS|AKT').split(" "),
        ('en en DET|Definite=Ind|Gender=Com|Number=Sing DT|UTR|SIN|IND').split(" "),
        ('dröm dröm NOUN|Case=Nom|Definite=Ind|Gender=Com|Number=Sing NN|UTR|SIN|IND|NOM').split(" "),
        ('. . PUNCT|_ MAD').split(" "),
    ]]

    # Explicit UTF-8 keeps the round trip independent of the locale's
    # default encoding; the sentence contains the non-ASCII word "dröm".
    with tempfile.TemporaryFile(mode="w+", encoding="utf-8") as outfile:
        conll.tagged_to_tagged_conll(annotated_sentences, outfile)
        outfile.seek(0)
        file_data = outfile.read()

    # NOTE(review): column separators are kept as the single spaces visible
    # in the reviewed source; verify against the real output of
    # tagged_to_tagged_conll (CoNLL columns are usually tab-separated).
    self.assertEqual(file_data.splitlines(), textwrap.dedent("""
        0 Jag jag PRON PN|UTR|SIN|DEF|SUB Case=Nom|Definite=Def|Gender=Com|Number=Sing
        1 har ha VERB VB|PRS|AKT Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act
        2 en en DET DT|UTR|SIN|IND Definite=Ind|Gender=Com|Number=Sing
        3 dröm dröm NOUN NN|UTR|SIN|IND|NOM Case=Nom|Definite=Ind|Gender=Com|Number=Sing
        4 . . PUNCT MAD _
    """).lstrip().splitlines())