Ejemplos de unicode2buckwalter en Python

Lenguaje de programación: Python

Namespace/Package Name: on.common.util

Método / Función: unicode2buckwalter

Ejemplos en hotexamples.com: 2

Python unicode2buckwalter - 2 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de on.common.util.unicode2buckwalter extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: log.py Proyecto: liushui9404/coref-1

def _write_reject(where, dropped_from, errcomms, opcode, rejection):
    """
    Append a rejection record to a ".rejects" file.

    NOTE: the entire body is guarded by ``if False:``, so as written
    this function performs no work and always returns None.  The
    description below documents what the guarded code does if the
    guard is removed.

    where is either:
      ['fname', fname]
    or
      ['docid', document_id, data_sort]

    In the 'fname' form the given name is used as the base of the
    output file name; in the 'docid' form the base is obtained from
    output_file_name(document_id, data_sort) and its directory is
    created if needed.  The record is appended to base + ".rejects".

    dropped_from is a key into the ERRS table: ERRS[dropped_from][0]
    is an error-number prefix and ERRS[dropped_from][1] maps each
    errcode to a pair (number suffix, message).

    errcomms is a list in the form:
      [[errcode, comments], ... ]

    where 'errcode' and 'comments' are defined as:

      comments -- list of comment lines.  If there are literal
                  newlines in a comment line, that's fine and new line
                  will be indented.  Comments may be the empty list
                  for one or all errorcodes.

      errcode  -- a short string to be looked up in the ERRS table.

    opcode and rejection are written on the final line of the record,
    separated by the comma-joined list of error numbers.
    """

    # Constant-false guard: nothing below ever runs.
    if False:
        if where[0] == "fname":
            fname_base = where[1]
        else:
            assert where[0] == "docid"
            document_id, data_sort = where[1:]
            try:
                fname_base = output_file_name(document_id, data_sort)
            except IndexError:
                # Show which document could not be mapped to a file name.
                # Single-argument print(...) is valid on Python 2 and 3.
                print("%s %s" % (document_id, data_sort))
                raise
            try:
                mkdirs(os.path.dirname(fname_base))
            except Exception:
                pass  # already exists

        fname = fname_base + ".rejects"

        with codecs.open(fname, "a", "utf8") as outf:
            errnums = []

            try:
                for errcode, comments in errcomms:
                    errnum = "%s%s" % (ERRS[dropped_from][0],
                                       ERRS[dropped_from][1][errcode][0])
                    errmsg = ERRS[dropped_from][1][errcode][1]
                    errnums.append(errnum)
                    outf.write("; %s %s\n" % (errnum, errmsg))
                    for line in comments:
                        line = unicode2buckwalter(line)

                        indent = " " * 6
                        try:
                            # Continuation lines get three extra spaces.
                            line = indent + line.replace(
                                "\n", "\n" + indent + "   ")
                        except Exception:
                            # Dump the offending value before re-raising.
                            print("%r" % line)
                            raise
                        for subline in line.split("\n"):
                            outf.write("; %s\n" % subline)
            except ValueError:
                # An errcomms entry did not unpack as (errcode, comments).
                pprint.pprint(errcomms)
                raise

            outf.write("%s %s %s\n;\n;\n" %
                       (opcode, ",".join(errnums), rejection))

Ejemplo n.º 2

Mostrar archivo

Archivo: callisto_converter.py Proyecto: Varal7/ontonotes-tools-py3

def callisto_to_sgml(fname,
                     out_sgml=None,
                     buckit=False,
                     language="unknown",
                     wrap=True):
    """
    given the fname of a callisto xml file, produce either fname.coref
    or fname.name depending on whether the file represents name or
    coref annotation.

    fname    -- path of the callisto xml file; passed to
                parse_callisto_xml.
    out_sgml -- if given (and truthy), write to this path instead of
                fname + ".coref" / fname + ".name".
    buckit   -- if true, run everything through unicode2buckwalter
                (with sgml_safe=True) before writing out.
    language -- accepted for interface compatibility; not used by this
                function.
    wrap     -- if true, wrap the whole thing in <DOC ...> </DOC>.

    Raises OnCommonUtilNeededError if on.common.util cannot be
    imported, FileContainsBothNameAndCorefAnnotationException if both
    kinds of annotation are present, NoAnnotationFoundException if
    neither is, and DeSubtokenizationFailedException if
    desubtokenize_annotations fails.

    The output text is fully prepared before the output file is
    opened, so a failure during conversion does not create or
    truncate the output file.
    """

    try:
        from on.common.util import unicode2buckwalter, desubtokenize_annotations
    except ImportError:
        raise OnCommonUtilNeededError("callisto_to_sgml")

    document_id, annotation_opens, annotation_closes, source_text_raw = parse_callisto_xml(
        fname)

    # One list slot per character, so tags can be spliced in by index.
    source_text = list(source_text_raw)

    names = corefs = 0

    for open_annotation in annotation_opens:
        idx = open_annotation[0]
        if open_annotation[1] == "name":
            open_annotation_str = "<%s>" % open_annotation[2]
            names += 1
        else:
            # The subtype attribute is emitted only when a subtype is set.
            subtype_str = ('-SUBTYPE="%s"' % open_annotation[4]
                           if open_annotation[4] else "")
            open_annotation_str = '<COREF-ID="%s"-TYPE="%s"%s>' % (
                open_annotation[2], open_annotation[3], subtype_str)
            corefs += 1

        source_text[idx] = "%s%s" % (open_annotation_str, source_text[idx])

    for close_annotation in annotation_closes:
        idx = close_annotation[0]
        if close_annotation[1] == "name":
            close_annotation_str = "</%s>" % close_annotation[2]
        else:
            close_annotation_str = "</COREF>"

        # Closers go in after openers, so at a shared index the closing
        # tag ends up in front of the opening tag.
        source_text[idx] = "%s%s" % (close_annotation_str, source_text[idx])

    if corefs and not names:
        ext = "coref"
    elif names and not corefs:
        ext = "name"
    elif names and corefs:
        raise FileContainsBothNameAndCorefAnnotationException(fname)
    else:
        raise NoAnnotationFoundException(fname)

    filename = out_sgml if out_sgml else fname + "." + ext

    # Do all fallible text processing before touching the output file;
    # opening in "w" mode truncates, so a failure afterwards would leave
    # a header-only file behind.
    n_text = "".join(source_text)

    try:
        n_text, _num_fixed = desubtokenize_annotations(
            n_text, add_offset_notations=True)
    except Exception as e:
        raise DeSubtokenizationFailedException(fname, e)

    if buckit:
        n_text = unicode2buckwalter(n_text, sgml_safe=True)

    n_text = n_text.replace("\r", " ")
    n_text = n_text.strip()

    with codecs.open(filename, "w", "utf8") as out_f:
        if wrap:
            out_f.write('<DOC DOCNO="%s">\n' % document_id)

        out_f.write(n_text)
        out_f.write("\n")

        if wrap:
            out_f.write('</DOC>\n')