Example #1
0
 def testDefault(self):
     """Check that ``otpl_to_text`` emits one space-joined line per segment.

     Writes two blank-line-separated segments to ``self.otpl_file``, runs the
     extraction, and compares the generated text file against the expected
     token lines.
     """
     # Echo log records to the console while the test runs (might spam your
     # console...), but detach the handler afterwards so it does not pile up
     # on the root logger and leak into other tests.
     console_handler = logging.StreamHandler()
     root_logger = logging.getLogger()
     root_logger.addHandler(console_handler)
     self.addCleanup(root_logger.removeHandler, console_handler)
     self.interceptLogs('otplc.extractor')
     self.otpl_file.write(
         "This    DT  6 nsubj B-NP NULL\n"
         "is      VBZ 6 cop   B-VP NULL\n"
         "Florian NNP 6 nn    B-NP NULL\n"
         "ʼs      POS 3 pos   I-NP db:id\n"
         "weird   JJ  6 amod  I-NP NULL\n"
         "test    NN  0 root  I-NP NULL\n"
         ".       DOT 6 punct O    NULL\n\n"
         "And    DT  6 nsubj B-NP NULL\n"
         "another      VBZ 6 cop   B-VP NULL\n"
         "one    NN  0 root  I-NP NULL\n"
         ".       DOT 6 punct O    NULL\n\n"
     )
     # Close the input first so its content is flushed before it is read back.
     self.otpl_file.close()
     expected = 'This is Florian ʼs weird test .\nAnd another one .\n'
     # A return value of 0 means no input file failed to convert.
     self.assertEqual(0, otpl_to_text(Configuration([self.otpl_file.name])))
     text_path = make_path_to(self.otpl_file.name, Configuration.TEXT_SUFFIX)
     # Use a context manager so the result file handle is always closed.
     with open(text_path) as text_file:
         result = text_file.read()
     self.assertEqual(expected, result)
Example #2
0
def otpl_to_text(configuration):
    """
    Extract the text using the tokens of the OTPL files and store the results
    into separate plain-text files.

    For every path in ``configuration.input_files`` a text file path is
    derived with ``make_path_to`` and ``configuration.text_suffix``. Each
    segment of the input is written as one line, with the segment's token
    values separated by single spaces.

    A file counts as failed when ``configure_reader`` returns ``None`` for it
    or when an I/O error occurs while writing its text file; processing then
    continues with the next input file.

    :param configuration: a :class:`otplc.settings.Configuration` object
    :return: The number of failed conversion for the input files.
    :raises AssertionError: if the text file path equals the input OTPL path
    """
    # The template does not depend on the file, so build it once up front.
    msg = "output text file and input OTPL file have the same path " \
          "(ensure the OTPL file does not use the extension '{}')"
    errors = 0

    for otpl_file in configuration.input_files:
        text_file = make_path_to(otpl_file, configuration.text_suffix)

        # Explicit raise instead of an ``assert`` statement: asserts are
        # stripped under ``python -O``, and skipping this guard would let the
        # output overwrite the input. The exception type stays AssertionError
        # so existing callers see no difference.
        if otpl_file == text_file:
            raise AssertionError(msg.format(configuration.text_suffix))

        segments = configure_reader(otpl_file, configuration)

        if segments is None:
            errors += 1
            continue

        # NOTE(review): the guessed colspec is stored on the shared
        # configuration, so it is reused for all later input files as well --
        # confirm this is intended.
        if configuration.colspec is None:
            configuration.colspec = guess_colspec(segments)

        token = configuration.colspec.token

        try:
            with open(text_file,
                      encoding=configuration.encoding,
                      mode='wt') as out_stream:
                for seg in segments:
                    # print() joins the tokens with spaces and ends the line.
                    print(*[row[token] for row in seg], file=out_stream)
        except IOError as e:
            L.error('I/O error while extracting %s to %s: %s',
                    otpl_file, text_file, str(e))
            errors += 1

    return errors