Ejemplo n.º 1
0
def _write_reject(where, dropped_from, errcomms, opcode, rejection):
    """
    Append one rejection record to a ".rejects" file.

    NOTE: the entire body is guarded by ``if False:``, so calling this
    function currently does nothing and returns None.  The guarded
    code is kept so it can be switched back on; everything below
    describes that guarded code.

    where is either:
      ['fname', fname]
    or
      ['docid', document_id, data_sort]

    In the 'fname' form the record is appended to fname + ".rejects".
    In the 'docid' form the base name comes from
    output_file_name(document_id, data_sort) and its directory is
    created with mkdirs.

    dropped_from is a key into the ERRS table.  ERRS[dropped_from][0]
    is used as the prefix of every error number and
    ERRS[dropped_from][1] maps each errcode to a pair whose first item
    completes the error number and whose second item is the message.

    errcomms is a list in the form:
      [[errcode, comments], ... ]

    where 'errcode' and 'comments' are defined as:

      comments -- list of comment lines.  If there are literal
                  newlines in a comment line, that's fine and the new
                  line will be indented.  Comments may be the empty
                  list for one or all errorcodes.

      errcode  -- a short string to be looked up in the ERRS table.

    opcode and rejection are written on the final line of the record
    as "<opcode> <errnum>,<errnum>,... <rejection>".

    """

    if False:  # body disabled: see the NOTE in the docstring
        if where[0] == "fname":
            fname_base = where[1]
        else:
            assert where[0] == "docid"
            document_id, data_sort = where[1:]
            try:
                fname_base = output_file_name(document_id, data_sort)
            except IndexError:
                # Single-argument parenthesised print emits the same
                # text under Python 2 and Python 3.
                print("%s %s" % (document_id, data_sort))
                raise

            out_dir = os.path.dirname(fname_base)
            try:
                mkdirs(out_dir)
            except Exception:
                # Only "already exists" (or no directory part at all)
                # is harmless; let any other failure surface here
                # instead of as a confusing error from codecs.open.
                if out_dir and not os.path.isdir(out_dir):
                    raise

        fname = fname_base + ".rejects"
        indent = " " * 6

        with codecs.open(fname, "a", "utf8") as outf:
            errnums = []

            try:
                for errcode, comments in errcomms:
                    errnum = "%s%s" % (ERRS[dropped_from][0],
                                       ERRS[dropped_from][1][errcode][0])
                    errmsg = ERRS[dropped_from][1][errcode][1]
                    errnums.append(errnum)
                    outf.write("; %s %s\n" % (errnum, errmsg))
                    for line in comments:
                        line = unicode2buckwalter(line)

                        try:
                            # Continuation lines of a multi-line
                            # comment get three extra spaces.
                            line = indent + line.replace(
                                "\n", "\n" + indent + "   ")
                        except Exception:
                            print("%r" % line)
                            raise
                        for subline in line.split("\n"):
                            outf.write("; %s\n" % subline)
            except ValueError:
                # Typically a malformed [errcode, comments] entry;
                # dump the input before re-raising.
                pprint.pprint(errcomms)
                raise

            outf.write("%s %s %s\n;\n;\n" %
                       (opcode, ",".join(errnums), rejection))
def callisto_to_sgml(fname,
                     out_sgml=None,
                     buckit=False,
                     language="unknown",
                     wrap=True):
    """
    given the fname of a callisto xml file, produce either fname.coref
    or fname.name depending on whether the file represents name or
    coref annotation.

    if out_sgml is given, write there instead of to fname.coref or
    fname.name

    if buckit, then run everything through unicode2buckwalter before
    writing out

    if wrap, wrap the whole thing in <DOC ...> </DOC>

    language is accepted but not used by this function.

    The output text is fully prepared before the output file is
    opened, so a failure while preparing it neither creates nor
    truncates the output file.

    Raises:
      OnCommonUtilNeededError -- on.common.util could not be imported
      FileContainsBothNameAndCorefAnnotationException -- the file has
          both name and coref annotations
      NoAnnotationFoundException -- the file has no annotations
      DeSubtokenizationFailedException -- desubtokenize_annotations
          raised; the original exception is passed along

    """

    try:
        from on.common.util import unicode2buckwalter, desubtokenize_annotations
    except ImportError:
        raise OnCommonUtilNeededError("callisto_to_sgml")

    document_id, annotation_opens, annotation_closes, source_text_raw = parse_callisto_xml(
        fname)

    # One list item per element of the raw text, so tags can be
    # spliced in front of the element at a given index.
    source_text = list(source_text_raw)

    names = corefs = 0

    for open_annotation in annotation_opens:
        idx = open_annotation[0]
        if open_annotation[1] == "name":
            open_annotation_str = "<%s>" % open_annotation[2]
            names += 1
        else:
            subtype = open_annotation[4]
            subtype_str = '-SUBTYPE="%s"' % subtype if subtype else ""
            open_annotation_str = '<COREF-ID="%s"-TYPE="%s"%s>' % (
                open_annotation[2], open_annotation[3], subtype_str)
            corefs += 1

        source_text[idx] = "%s%s" % (open_annotation_str, source_text[idx])

    for close_annotation in annotation_closes:
        idx = close_annotation[0]
        if close_annotation[1] == "name":
            close_annotation_str = "</%s>" % close_annotation[2]
        else:
            close_annotation_str = "</COREF>"

        source_text[idx] = "%s%s" % (close_annotation_str, source_text[idx])

    if corefs and not names:
        ext = "coref"
    elif names and not corefs:
        ext = "name"
    elif names and corefs:
        raise FileContainsBothNameAndCorefAnnotationException(fname)
    else:
        raise NoAnnotationFoundException(fname)

    filename = out_sgml if out_sgml else fname + "." + ext

    # Do every step that can fail before touching the output file.
    n_text = "".join(source_text)

    try:
        # Second return value is not needed here.
        n_text, _ = desubtokenize_annotations(
            n_text, add_offset_notations=True)
    except Exception as e:
        raise DeSubtokenizationFailedException(fname, e)

    if buckit:
        n_text = unicode2buckwalter(n_text, sgml_safe=True)

    n_text = n_text.replace("\r", " ").strip()

    with codecs.open(filename, "w", "utf8") as out_f:
        if wrap:
            out_f.write('<DOC DOCNO="%s">\n' % document_id)

        out_f.write(n_text)
        out_f.write("\n")

        if wrap:
            out_f.write('</DOC>\n')