class StandardOutput(PmmlSed):
    """Output element that prints the serialized PMML document.

    Attributes missing from the element are filled in by ``post_validate``
    with the same defaults the schema advertises.
    """

    xsd = load_xsdElement(
        PmmlSed,
        (' <xs:element name="StandardOutput">'
         ' <xs:complexType>'
         ' <xs:attribute name="validate" type="xs:boolean" default="true" use="optional" />'
         ' <xs:attribute name="indent" type="xs:string" default=" " use="optional" />'
         ' <xs:attribute name="linesep" type="xs:string" default="%s" use="optional" />'
         ' </xs:complexType>'
         ' </xs:element> ') % os.linesep)

    def post_validate(self):
        """Fill in every optional attribute that was not supplied."""
        fallbacks = (
            ("validate", True),
            ("indent", " "),
            ("linesep", os.linesep),
        )
        for attributeName, fallback in fallbacks:
            if attributeName not in self.attrib:
                self[attributeName] = fallback

    def write(self, pmmlFile):
        """Print ``pmmlFile`` as XML, then validate it if requested.

        The document is printed first; validation (which is asked to raise
        on failure) only happens afterwards and only when the ``validate``
        attribute is true.
        """
        rendered = pmmlFile.xml(indent=self["indent"], linesep=self["linesep"])
        print(rendered)
        if self["validate"]:
            pmmlFile.validate(exception=True)
class Context(ScoresAwk):
    """Import a Python library and expose it as a name-to-object context.

    After validation ``self.context`` is a dictionary: either the module's
    own ``__dict__`` (when ``as="*"``) or a single entry mapping the ``as``
    name to the imported module.
    """

    xsd = load_xsdElement(
        ScoresAwk,
        ' <xs:element name="Context">'
        ' <xs:complexType>'
        ' <xs:attribute name="library" type="xs:string" use="required"/>'
        ' <xs:attribute name="as" type="xs:string" use="optional"/>'
        ' <xs:attribute name="path" type="xs:string" use="optional"/>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Import the configured library and build ``self.context``.

        Raises:
            XMLValidationError: if the library cannot be imported.
        """
        attrib = self.attrib
        if "as" not in attrib:
            attrib["as"] = attrib["library"]

        library = attrib["library"]

        # Only touch sys.path when the entry is not already there, so that
        # the clean-up below never removes a path somebody else registered.
        haspath = ("path" in attrib and attrib["path"] not in sys.path)
        if haspath:
            sys.path.append(attrib["path"])

        try:
            # __import__ returns the top-level package for dotted names, so
            # the requested (sub)module is fetched from sys.modules instead.
            __import__(library)
            module = sys.modules[library]
        except ImportError:
            raise XMLValidationError("Context element could not load library \"%s\"" % library)
        finally:
            # Undo the temporary path entry on failure as well as success;
            # previously a failed import left it behind in sys.path.
            if haspath and attrib["path"] in sys.path:
                sys.path.remove(attrib["path"])

        if attrib["as"] == "*":
            self.context = module.__dict__
        else:
            self.context = {attrib["as"]: module}
class Append(Replacement):
    """Replacement that adds constructed content to the end of a snippet."""

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="Append">'
        ' <xs:complexType>'
        ' <xs:complexContent mixed="true">'
        ' <xs:restriction base="xs:anyType">'
        ' <xs:sequence>'
        ' <xs:element minOccurs="0" maxOccurs="unbounded" ref="Context"/>'
        ' <xs:any minOccurs="0" maxOccurs="unbounded" processContents="skip" />'
        ' </xs:sequence>'
        ' </xs:restriction>'
        ' </xs:complexContent>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def evaluate(self, pmmlSnippet, matchedVariables, namedGroups):
        """Append the constructed children to ``pmmlSnippet`` and return it.

        Every child of this element except ``Context`` children is passed to
        ``self.construct`` together with the variables (``self.context``
        overlaid with ``matchedVariables``) and ``namedGroups``.  A list
        result contributes all of its items, anything else one item.
        """
        scope = dict(self.context)
        scope.update(matchedVariables)

        additions = []
        for node in self.children:
            if isinstance(node, Context):
                continue
            built = self.construct(node, scope, namedGroups)
            if isinstance(built, list):
                additions.extend(built)
            else:
                additions.append(built)

        pmmlSnippet.children.extend(additions)
        return pmmlSnippet
class FileInput(ScoresAwk):
    """Input element that opens a scores file with optional type casts."""

    xsd = load_xsdElement(
        ScoresAwk,
        ' <xs:element name="FileInput">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="Context" minOccurs="0" maxOccurs="unbounded"/>'
        ' <xs:element ref="CastAttribute" minOccurs="0" maxOccurs="unbounded"/>'
        ' <xs:element ref="CastContent" minOccurs="0" maxOccurs="unbounded"/>'
        ' </xs:sequence>'
        ' <xs:attribute name="fileName" type="xs:string" use="required"/>'
        ' <xs:attribute name="excludeTag" type="xs:string" use="optional"/>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Resolve the cast expressions and create ``self.file``.

        The ``type`` attribute of each ``CastAttribute``/``CastContent``
        child is evaluated as a Python expression whose globals are the
        merged contexts of the ``Context`` children.
        """
        namespace = {}
        for ctx in self.matches(Context):
            namespace.update(ctx.context)

        # CAREFUL: eval() runs whatever expression the configuration holds.
        self.castAttribute = attributeCasts = {}
        for cast in self.matches(CastAttribute):
            caster = eval(cast["type"], namespace)
            attributeCasts[cast["tag"] + "." + cast["attribute"]] = caster

        self.castContent = contentCasts = {}
        for cast in self.matches(CastContent):
            caster = eval(cast["type"], namespace)
            contentCasts[cast["tag"]] = caster

        if "excludeTag" not in self.attrib:
            self["excludeTag"] = None

        self.file = ScoresFile(
            self["fileName"],
            excludeTag=self["excludeTag"],
            attributeCast=self.castAttribute,
            contentCast=self.castContent)
class PythonFunction(PmmlSed):
    """Element holding Python source (in a CDATA section) defining callables.

    After validation ``self.func`` is the callable named by ``name``;
    ``self.begin`` and ``self.end`` are the callables named by the optional
    attributes of the same name (or ``None``), and ``self.deepestNode`` is
    the raw attribute value (or ``None``).
    """

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="PythonFunction">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element minOccurs="0" maxOccurs="unbounded" ref="Context"/>'
        ' </xs:sequence>'
        ' <xs:attribute name="name" type="xs:string" use="required"/>'
        ' <xs:attribute name="begin" type="xs:string" use="optional"/>'
        ' <xs:attribute name="end" type="xs:string" use="optional"/>'
        ' <xs:attribute name="deepestNode" type="xs:string" use="optional"/>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Execute the embedded code and bind the named callables.

        Raises:
            XMLValidationError: if there is not exactly one CDATA child, if
                the code has a syntax error, or if a named function is
                missing from the executed namespace or is not callable.
        """
        namespace = {"g": globalVariables}
        for ctx in self.matches(Context):
            namespace.update(ctx.context)

        cdataNodes = [node for node in self.children
                      if isinstance(node, xmlbase.XMLCDATA)]
        if len(cdataNodes) != 1:
            raise XMLValidationError("A PythonFunction object must contain exactly one CDATA")

        source = "".join(cdataNodes[0].text).strip()

        ## CAREFUL: evaluates whatever you give it!
        try:
            exec(source, namespace)
        except SyntaxError as err:
            raise XMLValidationError("PythonFunction could not be evaluated: %s" % str(err))

        # (instance attribute, XML attribute naming the function, required?)
        bindings = (
            ("func", "name", True),
            ("begin", "begin", False),
            ("end", "end", False),
        )
        for slot, attribute, required in bindings:
            if not required and attribute not in self.attrib:
                setattr(self, slot, None)
                continue
            try:
                setattr(self, slot, namespace[self[attribute]])
                # A non-callable object is reported exactly like a missing one.
                if not callable(getattr(self, slot)):
                    raise KeyError
            except KeyError:
                raise XMLValidationError("PythonFunction does not contain a function called \"%s\"" % self[attribute])

        self.deepestNode = self.attrib.get("deepestNode", None)
class Context(PmmlSed):
    """Import a Python library and expose it as a name-to-object context.

    After validation ``self.context`` is a dictionary: either the module's
    own ``__dict__`` (when ``as="*"``) or a single entry mapping the ``as``
    name to the imported module.
    """

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="Context">'
        ' <xs:complexType>'
        ' <xs:attribute name="library" type="xs:string" use="required"/>'
        ' <xs:attribute name="as" type="xs:string" use="optional"/>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Import the configured library and build ``self.context``.

        Raises:
            XMLValidationError: if the library cannot be imported.
        """
        # Local import: this block must not rely on a module-level
        # ``import sys`` that may sit outside the visible source.
        import sys

        if "as" not in self.attrib:
            self["as"] = self["library"]

        library = self["library"]
        try:
            # Import by name instead of exec()-ing a generated "import"
            # statement, so the attribute text is never run as code.
            # __import__ returns the top-level package for dotted names,
            # hence the sys.modules lookup for the requested module.
            __import__(library)
            module = sys.modules[library]
        except ImportError:
            raise XMLValidationError("Context element could not load library \"%s\"" % library)

        if self["as"] == "*":
            self.context = module.__dict__
        else:
            self.context = {self["as"]: module}
class LoadBalanceSplit(PmmlSplit):
    """Split the segments of a PMML document evenly over several files.

    ``how`` selects the strategy: ``sequential`` keeps the segment order,
    ``random`` shuffles the segments first, ``textBalanced`` is declared in
    the schema but not implemented.
    """

    # The advertised default for "how" is "random" so that the schema agrees
    # with the value post_validate actually applies when it is omitted.
    xsd = load_xsdElement(
        PmmlSplit,
        ' <xs:element name="LoadBalanceSplit">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="FileOutput" minOccurs="1" maxOccurs="unbounded"/>'
        ' </xs:sequence>'
        ' <xs:attribute name="how" default="random" use="optional">'
        ' <xs:simpleType>'
        ' <xs:restriction base="xs:string">'
        ' <xs:enumeration value="sequential"/>'
        ' <xs:enumeration value="random"/>'
        ' <xs:enumeration value="textBalanced"/>'
        ' </xs:restriction>'
        ' </xs:simpleType>'
        ' </xs:attribute>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Apply the default strategy and count the output files."""
        if "how" not in self.attrib:
            self["how"] = "random"
        self.numFiles = len(self.matches(FileOutput))

    def evaluate(self, data):
        """Write one copy of ``data`` per ``FileOutput`` with its share of segments.

        Args:
            data: the loaded PMML document; its ``Segmentation`` block is
                emptied here and copies of the emptied document receive the
                per-file segment lists.

        Raises:
            RuntimeError: if ``data`` has no ``Segmentation`` block.
            NotImplementedError: if ``how`` is ``textBalanced``.
            ValueError: if ``how`` is not a known strategy.
        """
        segmentation = data.descendant(pmml.Segmentation, exception=False)
        if segmentation is None:
            raise RuntimeError(
                "PMML file has no <Segmentation> block; cannot split!")

        # Reject unsupported strategies BEFORE touching the document, so a
        # failure no longer leaves ``data`` stripped of its segments.
        how = self["how"]
        if how == "textBalanced":
            raise NotImplementedError(
                "how == textBalanced not implemented yet")
        if how not in ("sequential", "random"):
            raise ValueError("unknown LoadBalanceSplit strategy \"%s\"" % how)

        # pull out the segments and drop them from the file
        segments = segmentation.matches(pmml.Segment)
        segmentation.children = []

        if how == "random":
            random.shuffle(segments)

        # Both strategies cut the list into numFiles consecutive chunks.
        stepSize = int(math.ceil(len(segments) / float(self.numFiles)))
        outputSegments = [segments[i * stepSize:(i + 1) * stepSize]
                          for i in range(self.numFiles)]

        for i, fileOutput in enumerate(self.matches(FileOutput)):
            output = data.copy()  # copied without the segments
            output.descendant(pmml.Segmentation).children = outputSegments[i]
            output.write(fileOutput["fileName"])
class PythonFunction(ScoresAwk):
    """Element holding Python source that defines a condition and an action.

    After validation ``self.action`` is the callable named by ``action`` and
    ``self.condition`` is one of the marker atoms ``BEGIN``/``EVENT``/``END``
    or the callable named by ``condition``.
    """

    xsd = load_xsdElement(
        ScoresAwk,
        ' <xs:element name="PythonFunction">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element minOccurs="0" maxOccurs="unbounded" ref="Context"/>'
        ' </xs:sequence>'
        ' <xs:attribute name="condition" type="xs:string" use="optional"/>'
        ' <xs:attribute name="action" type="xs:string" use="required"/>'
        ' </xs:complexType>'
        ' </xs:element> ')

    # Marker objects stored in self.condition instead of a callable.
    BEGIN = Atom("Begin")
    EVENT = Atom("Event")
    END = Atom("End")

    def post_validate(self):
        """Execute the embedded code and bind the condition and the action.

        Raises:
            XMLValidationError: if there is not exactly one CDATA child, if
                the code has a syntax error, or if a named function is
                missing from the executed namespace or is not callable.
        """
        namespace = {"g": globalVariables}
        for ctx in self.matches(Context):
            namespace.update(ctx.context)

        cdataNodes = [node for node in self.children
                      if isinstance(node, xmlbase.XMLCDATA)]
        if len(cdataNodes) != 1:
            raise XMLValidationError("A PythonFunction object must contain exactly one CDATA")

        source = "".join(cdataNodes[0].text).strip()

        ## CAREFUL: evaluates whatever you give it!
        try:
            exec(source, namespace)
        except SyntaxError as err:
            raise XMLValidationError("PythonFunction could not be evaluated: %s" % str(err))

        if "condition" not in self.attrib:
            # No condition given: stored as the EVENT marker.
            self.condition = self.EVENT
        elif self["condition"] == "BEGIN":
            self.condition = self.BEGIN
        elif self["condition"] == "END":
            self.condition = self.END
        else:
            try:
                self.condition = namespace[self["condition"]]
                # A non-callable object is reported exactly like a missing one.
                if not callable(self.condition):
                    raise KeyError
            except KeyError:
                raise XMLValidationError("PythonFunction does not contain a condition function called \"%s\"" % self["condition"])

        try:
            self.action = namespace[self["action"]]
            if not callable(self.action):
                raise KeyError
        except KeyError:
            raise XMLValidationError("PythonFunction does not contain an action function called \"%s\"" % self["action"])
class StandardOutput(ScoresAwk):
    """Output element whose target file object is ``sys.stdout``."""

    xsd = load_xsdElement(
        ScoresAwk,
        ' <xs:element name="StandardOutput">'
        ' <xs:complexType/>'
        ' </xs:element> ')

    def post_validate(self):
        """Expose standard output as ``self.file``."""
        self.file = sys.stdout
class CastContent(ScoresAwk):
    """Schema-only element carrying a ``tag`` and a ``type`` attribute."""

    xsd = load_xsdElement(
        ScoresAwk,
        ' <xs:element name="CastContent">'
        ' <xs:complexType>'
        ' <xs:attribute name="tag" type="xs:string" use="required"/>'
        ' <xs:attribute name="type" type="xs:string" use="required"/>'
        ' </xs:complexType>'
        ' </xs:element> ')
class FileOutput(PmmlSplit):
    """Schema-only element carrying the required ``fileName`` attribute."""

    xsd = load_xsdElement(
        PmmlSplit,
        ' <xs:element name="FileOutput">'
        ' <xs:complexType>'
        ' <xs:attribute name="fileName" type="xs:string" use="required"/>'
        ' </xs:complexType>'
        ' </xs:element> ')
class NamedGroupMatch(PmmlSed):
    """Schema-only element with a ``name`` and an optional ``maxMatch``."""

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="NamedGroupMatch">'
        ' <xs:complexType>'
        ' <xs:attribute name="name" type="xs:string" use="required" />'
        ' <xs:attribute name="maxMatch" type="xs:nonNegativeInteger" use="optional" />'
        ' </xs:complexType>'
        ' </xs:element> ')
class AugustusConfigurationFromFile(Workflow):
    """Workflow step whose Augustus configuration lives in another file."""

    xsd = load_xsdElement(
        Workflow,
        ' <xs:element name="AugustusConfigurationFromFile">'
        ' <xs:complexType>'
        ' <xs:attribute name="fileName" type="xs:string" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Load the referenced file into ``self.config``."""
        configPath = self["fileName"]
        self.config = xmlbase.loadfile(
            configPath, augustus.core.config.Config, lineNumbers=True)
class ScoresAwkFromFile(Workflow):
    """Workflow step whose ScoresAwk configuration lives in another file."""

    xsd = load_xsdElement(
        Workflow,
        ' <xs:element name="ScoresAwkFromFile">'
        ' <xs:complexType>'
        ' <xs:attribute name="fileName" type="xs:string" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Load the referenced file into ``self.config``."""
        configPath = self["fileName"]
        self.config = xmlbase.loadfile(
            configPath, augustus.applications.scoresAwk.root, lineNumbers=True)
class root(XTBL):
    """Top-level ``XTBL`` element: metadata, data dictionary, pages, footer."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="XTBL">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="MetaData" minOccurs="0" />'
        ' <xs:element ref="DataDictionary" />'
        ' <xs:element ref="Pages" />'
        ' <xs:element ref="SeekFooter" />'
        ' </xs:sequence>'
        ' <xs:attribute name="version" type="xs:string" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "XTBL"

    def __init__(self, fields, types, metadata=None, fieldMetadatas=None):
        """Build a version 1.0 document skeleton.

        Args:
            fields: passed through to ``DataDictionary``.
            types: passed through to ``DataDictionary``.
            metadata: optional mapping; a ``MetaData`` child is created only
                when it is non-empty.
            fieldMetadatas: optional mapping passed to ``DataDictionary``.
        """
        # Fresh dictionaries per call: the former ``{}`` defaults were one
        # shared object handed to every instance created without arguments.
        if metadata is None:
            metadata = {}
        if fieldMetadatas is None:
            fieldMetadatas = {}

        self.attrib = {"version": "1.0"}

        self.children = []
        if len(metadata) > 0:
            self.children.append(MetaData(metadata))
        self.children.extend([
            DataDictionary(fields, types, fieldMetadatas),
            Pages(),
            SeekFooter(0),
        ])

    def post_validate(self):
        """Check the version and that every page lists the dictionary fields.

        Raises:
            XMLValidationError: if the version is not ``"1.0"`` or if the
                ``PageFieldOffset`` names of any page differ from the
                ``DataField`` names of the data dictionary.
        """
        if self.attrib["version"] != "1.0":
            raise XMLValidationError(
                "XTBL version in this file is \"%s\" but this is a XTBL 1.0 interpreter"
                % self.attrib["version"])

        fields = set(dataField.attrib["name"]
                     for dataField in self.child(DataDictionary).matches(DataField))

        for page in self.child(Pages).matches(Page):
            fields2 = set(offset.attrib["name"]
                          for offset in page.matches(PageFieldOffset))
            if fields != fields2:
                raise XMLValidationError(
                    "PageFieldOffset fields (%s) do not match DataDictionary fields (%s)"
                    % (fields2, fields))
class Insert(Replacement):
    """Replacement that inserts constructed content at a given position.

    The ``at`` attribute is an integer or a comma-separated list of
    integers: all but the last form ``self.treeindex`` (used to index into
    the snippet), the last one is the position in that element's children.
    """

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="Insert">'
        ' <xs:complexType>'
        ' <xs:complexContent mixed="true">'
        ' <xs:restriction base="xs:anyType">'
        ' <xs:sequence>'
        ' <xs:element minOccurs="0" maxOccurs="unbounded" ref="Context"/>'
        ' <xs:any minOccurs="0" maxOccurs="unbounded" processContents="skip" />'
        ' </xs:sequence>'
        ' <xs:attribute name="at" type="xs:string" use="required"/>'
        ' </xs:restriction>'
        ' </xs:complexContent>'
        ' </xs:complexType>'
        ' </xs:element> ')

    # NOTE(review): this post_validate does not call a parent implementation,
    # and self.context (read by evaluate) is not set in this class -- confirm
    # where Replacement initialises it.
    def post_validate(self):
        """Parse ``at`` into ``self.treeindex`` and ``self.index``.

        Raises:
            XMLValidationError: if any component of ``at`` is not an integer.
        """
        try:
            # A real list (not a lazy map object) because it is sliced below.
            values = [int(piece) for piece in self["at"].split(",")]
        except ValueError:
            raise XMLValidationError("Insert's 'at' parameter must be an integer or a comma-separated list of integers, not \"%s\"" % self["at"])

        self.treeindex = values[:-1]
        self.index = values[-1]

    def evaluate(self, pmmlSnippet, matchedVariables, namedGroups):
        """Insert the constructed children and return the modified element.

        Every child of this element except ``Context`` children is passed to
        ``self.construct``; a list result contributes all of its items.  When
        ``self.treeindex`` is non-empty the insertion happens in (and the
        return value is) ``pmmlSnippet[self.treeindex]``.
        """
        variables = dict(self.context)
        variables.update(matchedVariables)

        insertion = []
        for child in self.children:
            if isinstance(child, Context):
                continue
            construction = self.construct(child, variables, namedGroups)
            if isinstance(construction, list):
                insertion.extend(construction)
            else:
                insertion.append(construction)

        # NOTE(review): as in the original, the descended element (not the
        # snippet root) is what gets returned -- confirm callers expect that.
        if len(self.treeindex) > 0:
            pmmlSnippet = pmmlSnippet[self.treeindex]

        # Fix: the original used the undefined name ``index`` here (NameError
        # on every call).  Slice assignment places the items at self.index in
        # document order.
        pmmlSnippet.children[self.index:self.index] = insertion
        return pmmlSnippet
class FileInput(PmmlSplit):
    """Input element that loads a PMML file without validating it."""

    xsd = load_xsdElement(
        PmmlSplit,
        ' <xs:element name="FileInput">'
        ' <xs:complexType>'
        ' <xs:attribute name="fileName" type="xs:string" use="required"/>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Load the file named by ``fileName`` into ``self.data``."""
        sourcePath = self["fileName"]
        self.data = xmlbase.loadfile(
            sourcePath, pmml.X_ODG_PMML, validation=False)
class LogicalSplit(PmmlSplit):
    """Split the segments of a PMML document by ``Contains`` predicates.

    Each ``Contains`` child claims the segments it matches among those not
    yet claimed; an optional ``FileOutput`` child receives the leftovers.
    """

    xsd = load_xsdElement(
        PmmlSplit,
        ' <xs:element name="LogicalSplit">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="Contains" minOccurs="1" maxOccurs="unbounded"/>'
        ' <xs:element ref="FileOutput" minOccurs="0" maxOccurs="1"/>'
        ' </xs:sequence>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Remember the optional catch-all ``FileOutput`` (or ``None``)."""
        self.fileOutput = self.child(FileOutput, exception=False)

    def evaluate(self, data):
        """Write one copy of ``data`` per ``Contains`` with its matched segments.

        Raises:
            RuntimeError: if ``data`` has no ``Segmentation`` block.
        """
        segmentation = data.descendant(pmml.Segmentation, exception=False)
        if segmentation is None:
            raise RuntimeError(
                "PMML file has no <Segmentation> block; cannot split!")

        # pull out the segments and drop them from the file
        remaining = segmentation.matches(pmml.Segment)
        segmentation.children = []

        for contains in self.matches(Contains):
            hits = []
            misses = []
            for segment in remaining:
                # assume first nonExtension is the predicate
                predicate = segment.child(pmml.nonExtension)
                if contains.evaluate(predicate):
                    hits.append(segment)
                else:
                    misses.append(segment)

            selected = data.copy()  # copied without the segments
            selected.descendant(pmml.Segmentation).children = hits
            selected.write(contains.fileOutput["fileName"])

            # next search will use only the remainder
            remaining = misses

        if self.fileOutput is not None:
            leftover = data.copy()  # copied without the segments
            leftover.descendant(pmml.Segmentation).children = remaining  # whatever's left
            leftover.write(self.fileOutput["fileName"])
class FileOutput(ScoresAwk):
    """Output element that opens the named file for writing or appending."""

    xsd = load_xsdElement(
        ScoresAwk,
        ' <xs:element name="FileOutput">'
        ' <xs:complexType>'
        ' <xs:attribute name="fileName" type="xs:string" use="required" />'
        ' <xs:attribute name="append" type="xs:boolean" default="false" use="optional" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Default ``append`` to false and open ``self.file``.

        The file is opened in mode ``"a"`` when ``append`` is true and in
        mode ``"w"`` otherwise.  It is not closed in this class.
        """
        if "append" not in self.attrib:
            self["append"] = False

        if self["append"]:
            mode = "a"
        else:
            mode = "w"

        # open() is the documented way to open a file; the file() constructor
        # used before exists only on Python 2.
        self.file = open(self["fileName"], mode)
class SeekFooter(XTBL):
    """Childless element carrying a ``byteOffset`` attribute."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="SeekFooter">'
        ' <xs:complexType>'
        ' <xs:attribute name="byteOffset" type="xs:nonNegativeInteger" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "SeekFooter"

    # NOTE(review): ``name`` is required but never used, so a single
    # positional argument binds to ``name`` and leaves byteOffset at 0.
    # Looks like a leftover from a sibling constructor -- confirm before
    # changing the signature.
    def __init__(self, name, byteOffset=0):
        """Create the element with the given ``byteOffset`` and no children."""
        self.children = []
        self.attrib = dict(byteOffset=byteOffset)
class PageFieldOffset(XTBL):
    """Childless element pairing a field ``name`` with a ``byteOffset``."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="PageFieldOffset">'
        ' <xs:complexType>'
        ' <xs:attribute name="name" type="xs:string" use="required" />'
        ' <xs:attribute name="byteOffset" type="xs:nonNegativeInteger" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "PageFieldOffset"

    def __init__(self, name, byteOffset):
        """Store ``name`` and ``byteOffset`` as attributes; no children."""
        self.children = []
        self.attrib = dict(name=name, byteOffset=byteOffset)
class MetaDataItem(XTBL):
    """Childless element holding one ``key``/``value`` pair."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="MetaDataItem">'
        ' <xs:complexType>'
        ' <xs:attribute name="key" type="xs:string" use="required" />'
        ' <xs:attribute name="value" type="xs:string" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "MetaDataItem"

    def __init__(self, key, value):
        """Store ``key`` and ``value`` as attributes; no children."""
        self.children = []
        self.attrib = dict(key=key, value=value)
class root(Workflow):
    """Top-level ``Workflow`` element: runs its children in document order."""

    xsd = load_xsdElement(
        Workflow,
        ' <xs:element name="Workflow">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:choice minOccurs="1" maxOccurs="unbounded">'
        ' <xs:element ref="AugustusConfiguration"/>'
        ' <xs:element ref="AugustusConfigurationFromFile"/>'
        ' <xs:element ref="PmmlSed"/>'
        ' <xs:element ref="PmmlSedFromFile"/>'
        ' <xs:element ref="PmmlSplit"/>'
        ' <xs:element ref="PmmlSplitFromFile"/>'
        ' <xs:element ref="ScoresAwk"/>'
        ' <xs:element ref="ScoresAwkFromFile"/>'
        ' </xs:choice>'
        ' </xs:sequence>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def evaluate(self):
        """Run every child step, dispatching on its tag.

        Augustus configurations are handed to ``augustus.engine.mainloop.main``;
        the other tools are run through their own ``evaluate``.  The
        ``...FromFile`` variants use the configuration stored in ``.config``.
        Children with any other tag are skipped.
        """
        inlineTools = ("PmmlSed", "PmmlSplit", "ScoresAwk")
        fileTools = ("PmmlSedFromFile", "PmmlSplitFromFile", "ScoresAwkFromFile")

        for step in self:
            if step.tag == "AugustusConfiguration":
                augustus.engine.mainloop.main(step)
            elif step.tag == "AugustusConfigurationFromFile":
                augustus.engine.mainloop.main(step.config)
            elif step.tag in inlineTools:
                step.evaluate()
            elif step.tag in fileTools:
                step.config.evaluate()
class StandardInput(PmmlSed):
    """Input element that reads a PMML document from standard input."""

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="StandardInput">'
        ' <xs:complexType>'
        ' <xs:attribute name="validate" type="xs:boolean" default="true" use="optional" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Read all of stdin and load it into ``self.data``.

        Raises:
            RuntimeError: if loading raises ``XMLValidationError``.
        """
        if "validate" not in self.attrib:
            self["validate"] = True

        try:
            self.data = xmlbase.load(
                sys.stdin.read(),
                pmml.X_ODG_PMML,
                validation=self["validate"])
        except XMLValidationError as err:
            raise RuntimeError("StandardInput PMML failed validation: %s" % str(err))
class MapMissing(XTBL):
    """Element naming the value that stands for "missing"."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="MapMissing">'
        ' <xs:complexType>'
        ' <xs:attribute name="value" type="xs:string" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    def setType(self, cast):
        """Replace the ``value`` attribute by ``cast(value)``."""
        self.attrib["value"] = cast(self.attrib["value"])

    def isMissing(self, value):
        """Return whether ``value`` equals the configured missing value."""
        return (value == self.attrib["value"])

    def __eq__(self, other):
        """Compare by the ``value`` attribute.

        Returns ``NotImplemented`` when ``other`` has no ``attrib["value"]``,
        instead of raising from inside the comparison.
        """
        try:
            otherValue = other.attrib["value"]
        except (AttributeError, KeyError, TypeError):
            return NotImplemented
        return self.attrib["value"] == otherValue

    def __ne__(self, other):
        """Negation of ``__eq__``.

        Python 2 does not derive ``!=`` from ``__eq__``, so without this
        method ``a != b`` could be true while ``a == b`` was also true.
        """
        outcome = self.__eq__(other)
        if outcome is NotImplemented:
            return outcome
        return not outcome
class FileInput(PmmlSed):
    """Input element that loads a PMML document from a named file."""

    xsd = load_xsdElement(
        PmmlSed,
        ' <xs:element name="FileInput">'
        ' <xs:complexType>'
        ' <xs:attribute name="fileName" type="xs:string" use="required" />'
        ' <xs:attribute name="validate" type="xs:boolean" default="true" use="optional" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    def post_validate(self):
        """Load the file named by ``fileName`` into ``self.data``.

        Raises:
            RuntimeError: if loading raises ``XMLValidationError``.
        """
        if "validate" not in self.attrib:
            self["validate"] = True

        try:
            self.data = xmlbase.loadfile(
                self["fileName"],
                pmml.X_ODG_PMML,
                validation=self["validate"])
        except XMLValidationError as err:
            raise RuntimeError("PMML file %s failed validation: %s" % (self["fileName"], str(err)))
class Pages(XTBL):
    """Container element whose children are the given pages."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="Pages">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="Page" minOccurs="1" maxOccurs="unbounded" />'
        ' </xs:sequence>'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "Pages"

    def __init__(self, pages=None):
        """Use ``pages`` as the child list; a new empty list when omitted."""
        self.attrib = {}
        # The caller's list object is stored as-is (not copied).
        self.children = pages if pages is not None else []
class root(PmmlSplit):
    """Top-level ``PmmlSplit`` element: one input followed by one splitter."""

    xsd = load_xsdElement(
        PmmlSplit,
        ' <xs:element name="PmmlSplit">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="FileInput"/>'
        ' <xs:choice>'
        ' <xs:element ref="LoadBalanceSplit"/>'
        ' <xs:element ref="LogicalSplit"/>'
        ' </xs:choice>'
        ' </xs:sequence>'
        ' </xs:complexType>'
        ' </xs:element> ')

    def evaluate(self):
        """Run the splitter on the data loaded by the ``FileInput`` child."""
        # evaluate the subelement with index=1, whatever that is (check XSD!)
        splitter = self.child(which=1)
        source = self.child(FileInput)
        splitter.evaluate(source.data)
class Page(XTBL):
    """Element with a ``length`` attribute and ``PageFieldOffset`` children."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="Page">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="PageFieldOffset" minOccurs="1" maxOccurs="unbounded" />'
        ' </xs:sequence>'
        ' <xs:attribute name="length" type="xs:nonNegativeInteger" use="required" />'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "Page"

    def __init__(self, length, pageFieldOffsets=None):
        """Store ``length``; use ``pageFieldOffsets`` as the child list.

        A new empty list is created when ``pageFieldOffsets`` is omitted;
        otherwise the caller's list object is stored as-is (not copied).
        """
        self.attrib = dict(length=length)
        self.children = pageFieldOffsets if pageFieldOffsets is not None else []
class MetaData(XTBL):
    """Container element holding one ``MetaDataItem`` per metadata key."""

    xsd = load_xsdElement(
        XTBL,
        ' <xs:element name="MetaData">'
        ' <xs:complexType>'
        ' <xs:sequence>'
        ' <xs:element ref="MetaDataItem" minOccurs="0" maxOccurs="unbounded" />'
        ' </xs:sequence>'
        ' </xs:complexType>'
        ' </xs:element> ')

    tag = "MetaData"

    def __init__(self, metadata):
        """Create the children from ``metadata``, ordered by sorted key."""
        self.attrib = {}
        self.children = [MetaDataItem(key, metadata[key])
                         for key in sorted(metadata.keys())]