Ejemplo n.º 1
0
def parse_section_of_a_file(inobj, outobj, datadir, start=None, end=None,
    convert_from_trees=False, keep_temporary_section=False, **parse_args):
    """Parse just a portion of a file and put the parser output into outobj.

    This function is the workhorse of ParseHog.parse.

    inobj and outobj are either file-like objects or filenames.

    start and end are 0-based line numbers and are both inclusive.  Either
    may be None to mean the beginning or the end of the file, respectively.

    The selected lines are copied into a temporary file (created with
    keepable_tempfile(keep=keep_temporary_section)) which is then handed to
    ECParser.parse_sgml_file along with outobj, datadir and any remaining
    parse_args.

    If convert_from_trees is true, the input is wrapped in a TreeConverter
    before slicing and parse_args['already_tokenized'] is forced to True.

    Nothing is returned; the parser output goes to outobj."""
    input_file = open_file_or_filename(inobj)
    if convert_from_trees:
        input_file = TreeConverter(input_file)
        parse_args['already_tokenized'] = True # set -K

    sliced_file = keepable_tempfile(keep=keep_temporary_section)

    # The bounds are tested against None explicitly rather than relying on
    # how None and strings happen to order against integers.
    for current_line_number, line in enumerate(input_file):
        if end is not None and current_line_number > end:
            # past the requested section, no need to read any further
            break
        if start is None or current_line_number >= start:
            sliced_file.write(line)
    # the parser reads the slice by filename, so it must be on disk first
    sliced_file.flush()

    parser = ECParser()
    parser.parse_sgml_file(sliced_file, output=outobj,
        datadir=datadir, **parse_args)
Ejemplo n.º 2
0
    def parse_sgml_file(self, sgml_fileobj_or_name, output=None,
        debug=False, skip_blank_lines=False, logstream=sys.stdout,
        error_messages_are_failures=True, **parse_args):
        """Run the parser as a child process on an SGML (<s>) file.

        sgml_fileobj_or_name is either a filename or a file object which
        must have a filename (i.e. tempfile.TemporaryFile is not okay but
        a tempfile.NamedTemporaryFile is fine).  Only the name is used: it
        is passed to get_parser_command() together with parse_args to
        build the command line.  See get_parser_command() for a
        description of parse_args.

        output, if given, is a file-like object or filename that receives
        the parser's stdout.  With skip_blank_lines, lines that are empty
        or whitespace-only are left out of what is written to output.

        debug logs the command (and the redirection target) to logstream
        and also dumps the parser's stdout there after the run.

        Returns the parser's stdout as a string, or the placeholder
        "(redirected to <output>)" when output was given.  The same value
        is stored in self.last_output; the stripped stderr text is stored
        in self.last_error and the child's exit status in
        self.last_status.

        Raises ParserPrintedErrorMessages if the parser wrote anything to
        stderr and error_messages_are_failures is true; otherwise raises
        BadParserExitCode if the exit status is nonzero."""

        # TODO don't we already have something like this?
        if isinstance(sgml_fileobj_or_name, basestring):
            name = sgml_fileobj_or_name
        else:
            name = sgml_fileobj_or_name.name

        cmd = get_parser_command(name, **parse_args)
        if debug:
            print >>logstream, "ECParser: Running %r" % cmd
            if output:
                print >>logstream, "ECParser: Redirecting output to %r" % output
        logstream.flush()

        # fork and run the parser as a child.  Popen3 objects will give us
        # their stdin/stdout/stderr file handles and PID.
        self.parser = Popen3(cmd, capturestderr=True)
        self.pid = self.parser.pid

        # NOTE(review): stdout is read to EOF before stderr is touched.  If
        # the child writes enough to stderr to fill its pipe while still
        # producing stdout, this could block -- confirm the parser never
        # emits large amounts of stderr.
        data = self.parser.fromchild.read()
        if output: # redirect output
            output_file = open_file_or_filename(output, 'w')
            if skip_blank_lines:
                # splitlines() drops the line terminators, so put one back
                # on every line we keep
                for line in data.splitlines():
                    if line.strip():
                        output_file.write(line + "\n")
            else:
                output_file.write(data)
            # only flushed, not closed: output may be the caller's own file
            output_file.flush()
            self.last_output = "(redirected to %r)" % output
            # NOTE: checking the number of output lines (and raising
            # MissingScores) used to happen here; it was taken out because
            # it did not handle nbest or LM output.
        else:
            self.last_output = data

        self.last_error = self.parser.childerr.read().strip()

        # Both pipes are at EOF now, so reap the child before any of the
        # checks below can raise; otherwise self.pid would keep pointing
        # at a process that has already finished.
        self.last_status = self.parser.wait()
        self.pid = None

        if self.last_error:
            print >>logstream, "Parser stderr -----"
            print >>logstream, self.last_error
            print >>logstream, "End parser stderr -----"
            logstream.flush()
            if error_messages_are_failures:
                raise ParserPrintedErrorMessages(self.last_error)

        if self.last_status != 0 or debug:
            print >>logstream, "Parser stdout -----"
            print >>logstream, self.last_output
            print >>logstream, "End parser stdout -----"
            logstream.flush()
            if self.last_status != 0:
                raise BadParserExitCode(self.last_status)

        return self.last_output