Beispiel #1
0
def filter(vertical, struct, attr, match="all"):
    """Yield the ``raw`` form of ``struct`` structures selected by attributes.

    Only structures of type ``struct`` are emitted; everything above them in
    the vertical is dropped. ``attr`` is an iterable of ``(key, val)`` pairs
    and ``match`` selects how those pairs are compared with the attribute
    pairs of each structure:

    - ``"all"``: every pair in ``attr`` must be present on the structure,
    - ``"any"``: at least one pair in ``attr`` must be present,
    - ``"none"``: no pair in ``attr`` may be present.

    Raises ``RuntimeError`` for any other value of ``match``. Since this is a
    generator, the error only surfaces once iteration starts.

    """
    # TODO: reimplement this as a regex match on a string generated from the
    # sorted attr list → will allow for wildcard matching
    wanted = set(attr)
    # One set operation per strategy, each called as op(struct_pairs, wanted).
    # ``intersection`` returns a set, which is truthy exactly when the two
    # sets share at least one pair.
    strategies = {
        "all": set.issuperset,
        "any": set.intersection,
        "none": set.isdisjoint,
    }
    try:
        test = strategies[match]
    except (KeyError, TypeError):
        # unknown (or unhashable) strategy
        test = None
    if test is None:
        raise RuntimeError("Unsupported matching strategy: {}.".format(match))
    for candidate in pyvert.iterstruct(vertical, struct=struct):
        if test(set(candidate.attr.items()), wanted):
            yield candidate.raw
Beispiel #2
0
def group(vertical, target, attr, parent=None, unique=False, as_struct="group"):
    """Group ``target`` structures by attribute inside each ``parent``.

    Each ``parent`` structure of the vertical is processed in turn: its
    ``target`` structures are grouped according to one or more of their
    ``attr``ibute values and the serialized result is yielded as a string.

    Structures above parent and between parent and target are discarded.
    Groups are represented as structures with tag <``as_struct``> and an @id
    attribute with the same value as the original attr. Other attributes are
    copied over from the first target falling into the given group, and from
    the parent.

    If no ``parent`` is given, groups are constructed at the top level of the
    vertical.

    Unless ``unique`` is true, a per-parent fallback root id of the form
    ``__autoid<index>__`` is handed to the grouping routine; with ``unique``
    set, ``None`` is passed instead.

    """
    parents = pyvert.iterstruct(vertical, struct=parent)
    for index, parent_struct in enumerate(parents):
        if unique:
            fallback_id = None
        else:
            fallback_id = "__autoid{}__".format(index)
        grouped = parent_struct.group(
            target=target,
            attr=attr,
            as_struct=as_struct,
            fallback_root_id=fallback_id,
        )
        xml_text = etree.tounicode(grouped)
        if parent is None:
            # Strip the helper <root/> struct that wraps the whole vertical to
            # make it valid XML: 7 leading and 8 trailing characters are cut.
            # NOTE(review): the offsets presumably cover the tag plus one
            # adjacent newline on each side -- confirm against pyvert.
            xml_text = xml_text[7:-8]
        yield xml_text
Beispiel #3
0
def project(vertical, parent, child):
    """Copy metadata of each ``parent`` structure onto its ``child``ren.

    Projected attributes are prefixed with the parent structure's name and,
    where needed, postfixed with underscores so that they do not collide with
    attributes already present on the child structure.

    Yields one serialized string per ``parent`` structure found in the
    vertical.

    """
    for parent_struct in pyvert.iterstruct(vertical, struct=parent):
        # The return value of ``project()`` is not used; the result is read
        # back from the structure's ``xml`` attribute afterwards.
        parent_struct.project(child=child)
        serialized = etree.tounicode(parent_struct.xml)
        yield serialized
Beispiel #4
0
def identify(vertical, struct, base="id_", attr="id"):
    """Give every ``struct`` in vertical a unique identifier attribute, and
    hoist the struct to the top level of the vertical.

    The identifier is written to attribute ``attr`` (overwriting any existing
    value) and consists of ``base`` immediately followed by a zero-based
    running index, e.g. ``id_0``, ``id_1``, ... with the defaults.

    Yields one serialized string per ``struct`` found.

    """
    # TODO: iterate over lines instead so as not to drop structures above
    # ``struct`` (→ change docstring when it's done)
    matches = pyvert.iterstruct(vertical, struct=struct)
    for index, found in enumerate(matches):
        identifier = base + str(index)
        found.xml.attrib[attr] = identifier
        yield etree.tounicode(found.xml)
Beispiel #5
0
def wrap(vertical, target, attr, name="wrap"):
    """Wrap ``target`` structures in a parent with tag ``name``.

    Adjacent structures are put under the same parent for as long as their
    values for all attributes listed in ``attr`` stay the same. Each parent
    gets an ``id`` attribute built from the comma-joined attribute values and
    the index of the first structure in the run.

    Yields the opening tag, the ``raw`` form of each wrapped structure and the
    closing tag as separate strings. If the vertical contains no ``target``
    structures, nothing is yielded.

    Raises ``RuntimeError`` if a structure lacks one of the attributes in
    ``attr``.

    """
    closing_tag = "</{}>\n".format(name)
    # ``None`` means no wrapper has been opened yet; after the first structure
    # this always holds a string (possibly empty).
    last_attr = None
    for i, struct in enumerate(pyvert.iterstruct(vertical, struct=target)):
        try:
            new_attr = ",".join(struct.attr[a] for a in attr)
        except KeyError as e:
            raise RuntimeError("Structure does not contain specified "
                               "attribute.") from e
        if new_attr != last_attr:
            # attribute values changed: close the previous run (if any) and
            # start a new wrapper
            if last_attr is not None:
                yield closing_tag
            # NOTE(review): attribute values are interpolated verbatim; if
            # they can contain quotes or markup characters the id would need
            # escaping -- confirm what pyvert guarantees here.
            yield '<{} id="{}_{}">\n'.format(name, new_attr, i)
        yield struct.raw
        last_attr = new_attr
    # Only close a wrapper that was actually opened; an empty input must not
    # produce a stray closing tag.
    if last_attr is not None:
        yield closing_tag
Beispiel #6
0
def chunk(vertical, ancestor, child, name="chunk", minmax=(2000, 5000)):
    """Split a vertical into chunks of a given size.

    The output is the same vertical separated into chunks. All structures
    other than ``ancestor``, the chunks themselves and ``child`` are
    discarded.

    ``ancestor`` is the existing structure the chunks are based on; its
    metadata is copied over to the newly created chunks.

    ``child`` is the structure which forms the immediate children of the
    chunks and whose boundaries the chunks respect.

    ``name`` is passed on as the chunk name and ``minmax`` as the size range.
    Note that ``minmax`` may be violated when the given ancestor structure is
    shorter, or when the next child boundary occurs some positions after the
    maximum limit.

    Yields one serialized string per ``ancestor`` structure.

    """
    # The chunking should be randomized within the minmax range but replicable
    # across runs on the same data, hence the fixed seed. This reseeds the
    # module-level ``random`` generator once iteration starts.
    random.seed(1)
    ancestors = pyvert.iterstruct(vertical, struct=ancestor)
    for index, ancestor_struct in enumerate(ancestors):
        fallback_id = "__autoid{}__".format(index)
        chunked = ancestor_struct.chunk(
            child=child,
            name=name,
            minmax=minmax,
            fallback_orig_id=fallback_id,
        )
        yield etree.tounicode(chunked)