def testWriteConfigurationFile(self):
    """Convert a two-segment OTPL file and verify the written configuration file.

    After a successful conversion, ``write_config_file`` is asked to write to
    a temporary ``.conf`` path; every line of that file (decoded as UTF-8 and
    stripped) is compared with the expected ``[entities]``, ``[relations]``,
    ``[events]`` and ``[attributes]`` sections.
    """
    self.text_file.write("T1 T2 T3 T4\nT1T2-T3\n")
    self.text_file.close()
    self.otpl_file.write(
        "T1 ent1 att1 1 rel1 1 2 evt1 att1\n"
        "T2 ent2 NULL 3 rel2 0 3 NULL NULL\n"
        "T3 ent1 NULL 4 rel2 3 4 evt1 att2\n"
        "T4 ent2 att1 3 rel3 4 2 evt2 att1\n\n"
        "T1 ent3 NULL 1 rel1 3 1 evt1 att2\n"
        "T2 ent1 NULL 3 rel1 3 2 evt2 NULL\n"
        "T3 ent2 att1 2 rel3 3 0 evt2 att2\n\n"
    )
    self.otpl_file.close()

    # Only the path is needed here: the handle is closed right away and the
    # file is kept on disk (delete=False) so it can be written to by name.
    config_file = NamedTemporaryFile(suffix=".conf", delete=False)
    config_file.close()

    try:
        converter = OtplBratConverter()
        converter.set_colspec(guess_colspec(self.segments))
        self.assertTrue(
            converter.convert(self.segments, self.text_file.name, self.brat_file.name)
        )
        converter.write_config_file(config_file.name)

        expected = [
            "",
            "[entities]",
            "",
            "ent1",
            "ent2",
            "ent3",
            "",
            "[relations]",
            "",
            "rel1\tArg1:<ENTITY>, Arg2:<ENTITY>",
            "rel2\tArg1:<ENTITY>, Arg2:<ENTITY>",
            "rel3\tArg1:<ENTITY>, Arg2:<ENTITY>",
            "",
            "[events]",
            "",
            "evt1\tCol7:<ENTITY>",
            "evt2\tCol7?:<ENTITY>",
            "",
            "[attributes]",
            "",
            "att1\tArg:<ANY>",
            "att2\tArg:<EVENT>",
        ]

        # Close the handle deterministically so the file can be removed
        # afterwards (an open handle blocks deletion on some platforms).
        with open(config_file.name, "rb") as config_stream:
            for lno, raw in enumerate(config_stream):
                line = raw.decode("utf-8").strip()
                self.assertEqual(expected[lno], line)
    finally:
        # Always clean up the temporary file, even when an assertion fails.
        remove(config_file.name)
def testDefault(self):
    """Convert a small OTPL sample with a comment line and check the brat output.

    The segment filter ``^%`` is set before the OTPL content (which contains
    one line starting with ``%``) is written. After a successful conversion,
    each line of the brat file is compared with the expected annotation lines.
    """
    # might spam your console...
    root_logger = logging.getLogger()
    console_handler = logging.StreamHandler()
    root_logger.addHandler(console_handler)
    # Detach the handler once the test is over so that handlers do not
    # accumulate on the root logger across test runs.
    self.addCleanup(root_logger.removeHandler, console_handler)

    self.interceptLogs("otplc.converter")
    self.text_file.write("This is Florianʼs weird test.")
    self.text_file.close()
    self.segments.filter = r"^%"
    self.otpl_file.write(
        "This DT 6 nsubj B-NP NULL\n"
        "is VBZ 6 cop B-VP NULL\n"
        "% a comment line in %\n"
        "Florian NNP 6 nn B-NP NULL\n"
        "ʼs POS 3 pos I-NP db:id\n"
        "weird JJ 6 amod I-NP NULL\n"
        "test NN 0 root I-NP NULL\n"
        ". DOT 6 punct O NULL\n\n"
    )
    expected = [
        "T1 DT 0 4 This",
        "T2 VBZ 5 7 is",
        "T3 NNP 8 15 Florian",
        "T4 POS 15 17 ʼs",
        "T5 JJ 18 23 weird",
        "T6 NN 24 28 test",
        "T7 DOT 28 29 .",
        "T8 NP 0 4 This",
        "T9 VP 5 7 is",
        "T10 NP 8 28 Florianʼs weird test",
        "R1 nsubj Arg1:T1 Arg2:T6",
        "R2 cop Arg1:T2 Arg2:T6",
        "R3 nn Arg1:T3 Arg2:T6",
        "R4 pos Arg1:T4 Arg2:T3",
        "R5 amod Arg1:T5 Arg2:T6",
        "R6 punct Arg1:T7 Arg2:T6",
        "N1 Reference T10 db:id db:id",
    ]
    self.otpl_file.close()
    self.brat_file.close()

    converter = OtplBratConverter()
    converter.set_colspec(guess_colspec(self.segments))
    self.assertTrue(
        converter.convert(self.segments, self.text_file.name, self.brat_file.name)
    )

    # NOTE(review): the file is read with the platform default encoding, as
    # before; confirm this matches the encoding the converter writes with.
    with open(self.brat_file.name) as brat_stream:
        for lno, line in enumerate(brat_stream):
            line = line.strip("\r\n")
            self.assertEqual(expected[lno], line)
def testUnmatchedTokens(self):
    """Check that a token absent from the text makes the conversion fail.

    The OTPL input contains the token ``anti-test`` while the text only has
    ``test``; ``convert`` is expected to return a false value and a WARNING
    record describing the unmatched token is expected in the intercepted log.
    """
    self.interceptLogs("otplc.converter")

    self.text_file.write("This is Florianʼs weird test.")
    self.text_file.close()

    # One entry per OTPL row; the final row also carries the blank line that
    # terminates the segment.
    otpl_rows = (
        "This DT 6 nsubj B-NP NULL\n",
        "is VBZ 6 cop B-VP NULL\n",
        "Florian NNP 6 nn B-NP NULL\n",
        "ʼs POS 3 pos I-NP mailto:[email protected]\n",
        "weird JJ 6 amod I-NP NULL\n",
        "anti-test NN 0 root I-NP NULL\n",
        ". . 6 punct O NULL\n\n",
    )
    self.otpl_file.write("".join(otpl_rows))
    self.otpl_file.close()
    self.brat_file.close()

    brat_converter = OtplBratConverter()
    guessed_colspec = guess_colspec(self.segments)
    brat_converter.set_colspec(guessed_colspec)
    succeeded = brat_converter.convert(
        self.segments, self.text_file.name, self.brat_file.name
    )
    self.assertFalse(succeeded)

    expected_detail = 'token "anti-test" from line 6 not found at " test." (23)'
    self.test_log.assertMatches(
        "failed - %s", levelname="WARNING", args=(expected_detail,)
    )
def otpl_to_text(configuration):
    """
    Extract the text using the tokens of the OTPL files
    and store the results into separate plain-text files.

    For every input file, the token column of each segment is printed as one
    space-separated line to a text file whose path is derived from the input
    path and ``configuration.text_suffix``. If ``configuration.colspec`` is
    unset, it is guessed from the first readable file and stored on the
    configuration object, so it is reused for the remaining files.

    :param configuration: a :class:`otplc.settings.Configuration` object
    :return: The number of failed conversions for the input files.
    :raises AssertionError: if the output text path equals the input OTPL path
    """
    same_path_msg = "output text file and input OTPL file have the same path " \
                    "(ensure the OTPL file does not use the extension '{}')"
    errors = 0

    for otpl_file in configuration.input_files:
        text_file = make_path_to(otpl_file, configuration.text_suffix)

        # Raised explicitly instead of using an ``assert`` statement: asserts
        # are stripped when Python runs with -O, and without this guard the
        # input file would be opened for writing (and truncated) below.
        if otpl_file == text_file:
            raise AssertionError(same_path_msg.format(configuration.text_suffix))

        segments = configure_reader(otpl_file, configuration)

        if segments is None:
            # The reader could not be set up for this file; count and move on.
            errors += 1
            continue

        if configuration.colspec is None:
            configuration.colspec = guess_colspec(segments)

        token = configuration.colspec.token

        try:
            with open(text_file, encoding=configuration.encoding, mode='wt') as out_stream:
                for seg in segments:
                    # One output line per segment: its tokens joined by spaces.
                    print(*(row[token] for row in seg), file=out_stream)
        except IOError as e:
            L.error('I/O error while extracting %s to %s: %s', otpl_file, text_file, str(e))
            errors += 1

    return errors