def filter(vertical, struct, attr, match="all"):
    """Yield the raw form of ``struct`` structures selected by attribute.

    Structures above ``struct`` are discarded; the output consists only of
    structures of type ``struct`` whose ``(key, val)`` attribute pairs
    satisfy the conditions in ``attr`` under the chosen strategy:

    - ``"all"``: every pair in ``attr`` is present on the structure;
    - ``"any"``: at least one pair in ``attr`` is present on the structure;
    - ``"none"``: no pair in ``attr`` is present on the structure.

    This is a generator, so an unsupported ``match`` value only raises
    ``RuntimeError`` once iteration starts.

    """
    # TODO: reimplement this as a regex match on a string generated from the
    # sorted attr list → will allow for wildcard matching
    wanted = set(attr)
    # each strategy maps onto the set method whose (truthy) result means
    # "keep this structure"
    strategies = (
        ("all", "issuperset"),
        ("any", "intersection"),
        ("none", "isdisjoint"),
    )
    for strategy, set_method in strategies:
        if match == strategy:
            break
    else:
        raise RuntimeError("Unsupported matching strategy: {}.".format(match))
    for candidate in pyvert.iterstruct(vertical, struct=struct):
        pairs = set(candidate.attr.items())
        # "intersection" returns a set, which is truthy only when non-empty;
        # the other two methods return a bool directly
        keep = getattr(pairs, set_method)(wanted)
        if keep:
            yield candidate.raw
def group(vertical, target, attr, parent=None, unique=False,
          as_struct="group"):
    """Group structures in vertical according to an attribute.

    Within each ``parent`` structure, all ``target`` structures are grouped
    according to one or more of their ``attr``ibute values. Structures above
    parent and between parent and target are discarded.

    Groups are represented as structures with tag <``as_struct``> and an @id
    attribute with the same value as the original attr. Other attributes are
    copied over from the first target falling into the given group, and from
    the parent.

    If no ``parent`` is given, groups are constructed at the top level of the
    vertical.

    When ``unique`` is false, an index-based ``__autoid<N>__`` string is
    passed to the grouping routine as ``fallback_root_id``; when it is true,
    ``None`` is passed instead.

    """
    top_level = parent is None
    for index, node in enumerate(pyvert.iterstruct(vertical, struct=parent)):
        if unique:
            fallback_id = None
        else:
            fallback_id = "__autoid{}__".format(index)
        tree = node.group(
            target=target,
            attr=attr,
            as_struct=as_struct,
            fallback_root_id=fallback_id,
        )
        text = etree.tounicode(tree)
        # without a parent, the vertical is wrapped in a helper <root/>
        # struct to make it valid XML as a whole; drop that wrapper again by
        # cutting the first 7 and last 8 characters of the serialization
        yield text[7:-8] if top_level else text
def project(vertical, parent, child):
    """Project metadata from ``parent`` structure onto ``child`` structure.

    Projected attributes are prefixed with the parent structure's name and,
    where needed, postfixed with underscores so that they do not collide
    with attributes already present on the child structure.

    Yields the serialized XML of each ``parent`` structure after projection.

    """
    parents = pyvert.iterstruct(vertical, struct=parent)
    for parent_struct in parents:
        # projection is performed by the structure's own ``project`` method
        # before the structure is serialized
        parent_struct.project(child=child)
        yield etree.tounicode(parent_struct.xml)
def identify(vertical, struct, base="id_", attr="id"):
    """Add a unique identifier attribute to each ``struct`` in vertical, and
    hoist the struct to the top level of the vertical.

    The identifier is stored in attribute ``attr`` (overwriting any existing
    value) and is ``base`` immediately followed by the structure's zero-based
    index, e.g. ``id_0``, ``id_1``, ... with the default ``base``. No
    separator is inserted, so any underscore must be part of ``base``.

    """
    # TODO: iterate over lines instead so as not to drop structures above
    # ``struct`` (→ change docstring when it's done)
    matches = pyvert.iterstruct(vertical, struct=struct)
    for index, node in enumerate(matches):
        identifier = base + str(index)
        node.xml.attrib[attr] = identifier
        yield etree.tounicode(node.xml)
def wrap(vertical, target, attr, name="wrap"):
    """Wrap ``target`` structures in a parent with tag ``name``.

    Adjacent structures are put under the same parent for as long as their
    values for all attributes listed in ``attr`` stay the same. Each wrapper
    gets an ``id`` built from the comma-joined attribute values and the
    index of the first structure it contains.

    ``attr`` is an iterable of attribute names. If the vertical contains no
    ``target`` structures, nothing is yielded.

    Raises ``RuntimeError`` (chained to the original ``KeyError``) if a
    structure lacks one of the attributes in ``attr``.

    """
    closing_tag = "</{}>\n".format(name)
    # None means "no wrapper has been opened yet"; attribute signatures are
    # always strings, so they can never be confused with this sentinel
    last_attr = None
    for i, struct in enumerate(pyvert.iterstruct(vertical, struct=target)):
        try:
            new_attr = ",".join(struct.attr[a] for a in attr)
        except KeyError as e:
            raise RuntimeError("Structure does not contain specified "
                               "attribute.") from e
        if new_attr != last_attr:
            if last_attr is not None:
                yield closing_tag
            # NOTE(review): the attribute values are interpolated as-is; if
            # they can contain quotes, ``<`` or ``&``, the id would need
            # escaping — confirm against what ``struct.attr`` holds
            yield '<{} id="{}_{}">\n'.format(name, new_attr, i)
        yield struct.raw
        last_attr = new_attr
    # only close a wrapper if one was actually opened; otherwise an empty
    # input would produce a stray closing tag
    if last_attr is not None:
        yield closing_tag
def chunk(vertical, ancestor, child, name="chunk", minmax=(2000, 5000)):
    """Split a vertical into chunks of a given size.

    The output is the same vertical, separated into chunks. All structures
    other than ``ancestor``, the chunks themselves and ``child`` are
    discarded.

    ``ancestor`` is the existing structure on which the chunks are based;
    its metadata is copied over to the newly created chunks.

    ``child`` is the structure which forms the immediate children of the
    chunks and whose boundaries the chunks respect.

    Note that ``minmax`` may be violated when the given ancestor structure
    is shorter, or when the next child boundary occurs some positions after
    the maximum limit.

    """
    # chunk sizes are randomized within the minmax range; re-seeding the
    # global generator with a fixed value keeps runs on the same data
    # replicable
    random.seed(1)
    ancestors = pyvert.iterstruct(vertical, struct=ancestor)
    for index, node in enumerate(ancestors):
        fallback_id = "__autoid{}__".format(index)
        chunked = node.chunk(
            child=child,
            name=name,
            minmax=minmax,
            fallback_orig_id=fallback_id,
        )
        yield etree.tounicode(chunked)