def datastream(xmlfile):
    """Yield attribute rows from *xmlfile* when the attributes span two elements.

    The file is scanned line by line with regular expressions instead of
    being parsed as XML. Values found on a line of the outer (less deeply
    nested) element are remembered and combined with the values found on
    every following line of the inner element. Each yielded row is a list
    of strings ordered like ``attrs``.

    Relies on ``allElems``, ``elem2level``, ``attrs``, ``attr2elem`` and
    ``_open`` being provided by the surrounding scope.

    An inner element line is skipped when one of its own attributes is
    missing or when the current outer element did not define one of its
    attributes. A per-attribute summary of the skipped lines is written to
    stderr once the file has been read completely.
    """
    skippedLines = defaultdict(int)
    # the element with the lower nesting level is treated as the outer one
    elems = sorted(allElems, key=lambda e: elem2level[e])
    mE0 = "<%s " % elems[0]
    mE1 = "<%s " % elems[1]

    def attrPatterns(elem):
        # The name must not be preceded by another name character, otherwise
        # looking for 'x' would also match inside 'max="..."'.
        return [(a, re.compile(r'(?<![\w.:-])%s="([^"]*)"' % re.escape(a)))
                for a in attrs if attr2elem[a] == elem]

    mAs0 = attrPatterns(elems[0])
    mAs1 = attrPatterns(elems[1])

    values = {}  # attr -> most recently seen value
    # NOTE(review): the handle returned by _open is never closed explicitly;
    # _open is defined elsewhere - confirm whether that is required.
    for line in _open(xmlfile):
        if mE0 in line:
            for a, r in mAs0:
                m = r.search(line)
                if m:
                    values[a] = m.group(1)
                else:
                    # forget the value of the previous outer element so that
                    # it is not combined with the children of this one
                    values.pop(a, None)
        if mE1 in line:
            skip = False
            for a, r in mAs1:
                m = r.search(line)
                if m:
                    values[a] = m.group(1)
                else:
                    skip = True
                    skippedLines[a] += 1
            for a, _ in mAs0:
                if a not in values:
                    # no usable outer element has been seen for this line
                    skip = True
                    skippedLines[a] += 1
            if not skip:
                yield [values[a] for a in attrs]

    for attr, count in skippedLines.items():
        print(
            "Warning: Skipped %s lines because of missing attributes '%s'." % (count, attr),
            file=sys.stderr)
def datastream(xmlfile):
    """Yield attribute rows from *xmlfile* when one element holds all attributes.

    The file is scanned line by line with regular expressions instead of
    being parsed as XML. For every line that contains the opening tag of
    ``allElems[0]`` and defines all attributes in ``attrs``, a list with the
    attribute values (strings, ordered like ``attrs``) is yielded.

    Relies on ``allElems``, ``attrs`` and ``_open`` being provided by the
    surrounding scope.

    Element lines lacking one of the attributes are skipped; a per-attribute
    summary of the skipped lines is written to stderr once the file has been
    read completely.
    """
    # imported locally because this block did not use these names before
    import sys
    from collections import defaultdict

    skippedLines = defaultdict(int)
    mE = "<%s " % allElems[0]
    # The name must not be preceded by another name character, otherwise
    # looking for 'x' would also match inside 'max="..."'.
    mAs = [re.compile(r'(?<![\w.:-])%s="([^"]*)"' % re.escape(a)) for a in attrs]
    # NOTE(review): the handle returned by _open is never closed explicitly;
    # _open is defined elsewhere - confirm whether that is required.
    for line in _open(xmlfile):
        if mE in line:
            matches = [r.search(line) for r in mAs]
            if all(matches):
                yield [m.group(1) for m in matches]
            else:
                for a, m in zip(attrs, matches):
                    if m is None:
                        skippedLines[a] += 1

    for attr, count in skippedLines.items():
        print(
            "Warning: Skipped %s lines because of missing attributes '%s'." % (count, attr),
            file=sys.stderr)
def getDataStream(options):
    """Return a generator function that reads the selected attributes from a file.

    The first file in ``options.files`` is parsed with ElementTree to find
    out which element carries each requested attribute and at which nesting
    level that element occurs. When an attribute occurs in several elements,
    the more deeply nested element is used and a warning is printed.

    Fields read from *options*:
        attrOptions: names of the option fields that hold attribute names.
        attrElems: per option, the element the attribute must belong to, or
            ``None`` to accept any element.
        files: list of input files; only the first one is inspected here.

    Returns:
        A function ``datastream(xmlfile)`` that scans *xmlfile* line by line
        and yields one list of attribute values (strings, ordered like
        ``options.attrOptions``) per matching element line. Lines lacking a
        required attribute are skipped and summarized on stderr.

    Exits via ``sys.exit`` when an attribute is not found in the first file
    or when the attributes are not located in exactly one or two elements.
    """
    attrOptions = options.attrOptions
    attrs = [getattr(options, a) for a in attrOptions]
    # Several options may name the same attribute, so count distinct names
    # instead of assuming that exactly three attributes are looked up.
    numAttrs = len(set(attrs))
    attr2elem = {}  # attribute name -> tag of the element providing it
    elem2level = {}  # element tag -> nesting level at which it was seen
    level = 0
    # NOTE(review): the handle returned by _open is never closed explicitly;
    # _open is defined elsewhere - confirm whether that is required.
    for event, elem in ET.iterparse(_open(options.files[0]), ("start", "end")):
        if event == "start":
            level += 1
            for a, attr, e in zip(attrOptions, attrs, options.attrElems):
                if attr not in elem.keys():
                    continue
                if e is not None and e != elem.tag:
                    # the attribute was explicitly requested from another element
                    continue
                elem2level[elem.tag] = level
                if attr in attr2elem:
                    oldTag = attr2elem[attr]
                    if oldTag != elem.tag:
                        # prefer the more deeply nested element
                        if elem2level[oldTag] < level:
                            attr2elem[attr] = elem.tag
                        print(
                            "Warning: found %s '%s' in element '%s' (level %s) and element '%s' (level %s)."
                            " Using '%s'." % (
                                a, attr, oldTag, elem2level[oldTag], elem.tag, level, attr2elem[attr]))
                else:
                    attr2elem[attr] = elem.tag
            if len(attr2elem) == numAttrs:
                # all attributes have been seen
                break
        elif event == "end":
            level -= 1

    if len(attr2elem) != numAttrs:
        for a, attr in zip(attrOptions, attrs):
            if attr not in attr2elem:
                sys.exit("%s '%s' not found in %s" % (a, attr, options.files[0]))

    # sorted to make the element order independent of set iteration order
    allElems = sorted(set(attr2elem.values()))

    def attrPattern(attr):
        # The name must not be preceded by another name character, otherwise
        # looking for 'x' would also match inside 'max="..."'.
        return re.compile(r'(?<![\w.:-])%s="([^"]*)"' % re.escape(attr))

    def warnSkipped(skippedLines):
        for attr, count in skippedLines.items():
            print(
                "Warning: Skipped %s lines because of missing attributes '%s'." % (count, attr),
                file=sys.stderr)

    # The files are scanned line by line instead of being parsed, so only the
    # opening tag and the attributes on the same line are recognized.
    if len(allElems) == 2:
        # the element with the lower nesting level is treated as the outer one
        elems = sorted(allElems, key=lambda e: elem2level[e])
        mE0 = "<%s " % elems[0]
        mE1 = "<%s " % elems[1]
        mAs0 = [(a, attrPattern(a)) for a in attrs if attr2elem[a] == elems[0]]
        mAs1 = [(a, attrPattern(a)) for a in attrs if attr2elem[a] == elems[1]]

        def datastream(xmlfile):
            """Yield rows combining outer element values with each inner element line."""
            skippedLines = defaultdict(int)
            values = {}  # attr -> most recently seen value
            for line in _open(xmlfile):
                if mE0 in line:
                    for a, r in mAs0:
                        m = r.search(line)
                        if m:
                            values[a] = m.group(1)
                        else:
                            # forget the value of the previous outer element so
                            # it is not combined with the children of this one
                            values.pop(a, None)
                if mE1 in line:
                    skip = False
                    for a, r in mAs1:
                        m = r.search(line)
                        if m:
                            values[a] = m.group(1)
                        else:
                            skip = True
                            skippedLines[a] += 1
                    for a, _ in mAs0:
                        if a not in values:
                            # no usable outer element has been seen for this line
                            skip = True
                            skippedLines[a] += 1
                    if not skip:
                        yield [values[a] for a in attrs]
            warnSkipped(skippedLines)

        return datastream

    elif len(allElems) == 1:
        mE = "<%s " % allElems[0]
        mAs = [attrPattern(a) for a in attrs]

        def datastream(xmlfile):
            """Yield one row per element line that defines all attributes."""
            skippedLines = defaultdict(int)
            for line in _open(xmlfile):
                if mE in line:
                    matches = [r.search(line) for r in mAs]
                    if all(matches):
                        yield [m.group(1) for m in matches]
                    else:
                        for a, m in zip(attrs, matches):
                            if m is None:
                                skippedLines[a] += 1
            warnSkipped(skippedLines)

        return datastream

    else:
        sys.exit(
            "Found attributes at elements %s but at most 2 elements are supported" % allElems)