def ast(pfas, check=True, name=None, randseed=None, doc=None, version=None, metadata=None, options=None, tryYaml=False, verbose=False):
    """Create a single PFA from a chained workflow, returning the result as an abstract syntax tree.

    Every step's type names, cells, pools and user functions are prefixed with
    ``step<N>_<engine name>`` so that they cannot collide; each step's action
    becomes a user function of the combined engine.

    :type pfas: list of titus.pfaast.EngineConfig, Pythonized JSON, or JSON strings
    :param pfas: PFA documents for which the output of document *i* is the input to document *i + 1*
    :type check: bool
    :param check: test the chained PFA for validity
    :type name: string or ``None``
    :param name: optional name for the chained PFA
    :type randseed: integer or ``None``
    :param randseed: optional random number seed for the chained PFA
    :type doc: string or ``None``
    :param doc: optional documentation string for the chained PFA
    :type version: integer or ``None``
    :param version: optional version number for the chained PFA
    :type metadata: dict of strings or ``None``
    :param metadata: metadata for the chained PFA (``None``, the default, means ``{}``)
    :type options: dict of Pythonized JSON or ``None``
    :param options: implementation options for the chained PFA (``None``, the default, means ``{}``)
    :type tryYaml: bool
    :param tryYaml: if ``True``, attempt to interpret ``pfas`` as YAML (assuming they fail as JSON)
    :type verbose: bool
    :param verbose: if ``True``, write status messages to standard error
    :rtype: titus.pfaast.EngineConfig
    :return: a PFA document representing the chained workflow
    :raises PFAChainError: if the output type of a step is not compatible with the input type of the next step
    :raises NotImplementedError: if any step is a fold-type engine
    :raises ValueError: if a step cannot be read as JSON and ``tryYaml`` is ``False``
    """

    # a fresh dict per call: a literal {} default would be shared between calls
    # and with every EngineConfig returned from here
    if metadata is None:
        metadata = {}
    if options is None:
        options = {}

    def log(message):
        # status messages go to stderr, prefixed with a timestamp
        if verbose:
            sys.stderr.write(time.asctime() + message)

    # normalize all input forms to ASTs
    log(" Converting all inputs to ASTs\n")
    asts = []
    for i, src in enumerate(pfas):
        log(" step {0}\n".format(i + 1))
        if isinstance(src, EngineConfig):
            engineAst = src
        elif isinstance(src, dict):
            engineAst = titus.reader.jsonToAst(src)
        else:
            try:
                engineAst = titus.reader.jsonToAst(src)
            except ValueError:
                if tryYaml:
                    engineAst = titus.reader.yamlToAst(src)
                else:
                    raise
        asts.append(engineAst)
    pfas = asts

    # helper functions for transforming names
    def split(t):
        # "a.b.C" -> ("a.b", "C"); "C" -> (None, "C")
        if "." in t:
            ns, _, n = t.rpartition(".")
            return ns, n
        else:
            return None, t

    def join(ns, n):
        if ns is None or ns == "":
            return n
        else:
            return ns + "." + n

    def prefixType(i, pfa, t):
        ns, n = split(t)
        return join(ns, "Step{0:d}_{1}_{2}".format(i + 1, pfa.name, n))

    def prefixAction(i, pfa):
        return "step{0:d}_{1}_action".format(i + 1, pfa.name)

    def prefixFcnRef(i, pfa, x):
        # only user functions ("u.*") are renamed; library function names pass through
        if x.startswith("u."):
            return "u.step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x[2:])
        else:
            return x

    def prefixFcnDef(i, pfa, x):
        return "step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x)

    def prefixCell(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    def prefixPool(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    # define new names for all types to avoid type name collisions
    log(" Changing type names to avoid collisions\n")
    originalNameToNewName = {}
    for i, pfa in enumerate(pfas):
        originalNameToNewName[i] = {}
        for typeName in pfa.inputPlaceholder.parser.names.names.keys():
            originalNameToNewName[i][typeName] = prefixType(i, pfa, typeName)

    # but any names in the input to the first and the output from the last should not be changed
    def trivialName(i, avroType, memo):
        if isinstance(avroType, AvroArray):
            trivialName(i, avroType.items, memo)
        elif isinstance(avroType, AvroMap):
            trivialName(i, avroType.values, memo)
        elif isinstance(avroType, AvroUnion):
            for t in avroType.types:
                trivialName(i, t, memo)
        elif isinstance(avroType, (AvroFixed, AvroEnum)):
            t = avroType.fullName
            originalNameToNewName[i][t] = t
        elif isinstance(avroType, AvroRecord):
            t = avroType.fullName
            # memo guards against infinite recursion on recursive records
            if t not in memo:
                memo.add(t)
                for f in avroType.fields:
                    trivialName(i, f.avroType, memo)
                originalNameToNewName[i][t] = t

    trivialName(0, pfas[0].input, set())
    trivialName(len(pfas) - 1, pfas[-1].output, set())

    # ensure that chained types match and will be given the same names
    log(" Verifying that input/output schemas match along the chain\n")

    unnamedPrimitives = (AvroNull, AvroBoolean, AvroInt, AvroLong, AvroFloat, AvroDouble, AvroBytes)

    def chainPair(i, first, second, memo):
        # Returns True if the output type `first` of step i can feed the input type
        # `second` of step i + 1; as a side effect, named types of step i + 1 are
        # mapped to the new name already chosen for the matching type of step i.
        if any(isinstance(first, p) and isinstance(second, p) for p in unnamedPrimitives):
            return True
        elif isinstance(first, AvroFixed) and isinstance(second, AvroFixed):
            if first.size == second.size:
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroString) and isinstance(second, AvroString):
            return True
        elif isinstance(first, AvroEnum) and isinstance(second, AvroEnum):
            if first.symbols == second.symbols:
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroArray) and isinstance(second, AvroArray):
            return chainPair(i, first.items, second.items, memo)
        elif isinstance(first, AvroMap) and isinstance(second, AvroMap):
            return chainPair(i, first.values, second.values, memo)
        elif isinstance(first, AvroRecord) and isinstance(second, AvroRecord):
            # memo guards against infinite recursion on recursive records
            if first.fullName not in memo:
                memo.add(first.fullName)
                if len(first.fields) != len(second.fields):
                    return False
                for f1, f2 in zip(first.fields, second.fields):
                    if f1.name != f2.name:
                        return False
                    elif not chainPair(i, f1.avroType, f2.avroType, memo):
                        return False
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
            return True
        elif isinstance(first, AvroUnion) and isinstance(second, AvroUnion):
            # every member of the input union must be matched by some member of the output union
            for yt in second.types:
                if not any(chainPair(i, xt, yt, memo) for xt in first.types):
                    return False
            return True
        else:
            return False

    for i in range(len(pfas) - 1):
        first = pfas[i].output
        second = pfas[i + 1].input
        if not chainPair(i, first, second, set()):
            raise PFAChainError("output of engine {0}: {1} not compatible with input of engine {2}: {3}".format(i + 1, ts(first), i + 2, ts(second)))

    def rename(i, avroType, memo):
        # Build the Pythonized-JSON schema of avroType with step i's named types renamed.
        if isinstance(avroType, AvroArray):
            return {"type": "array", "items": rename(i, avroType.items, memo)}
        elif isinstance(avroType, AvroMap):
            return {"type": "map", "values": rename(i, avroType.values, memo)}
        elif isinstance(avroType, AvroUnion):
            return [rename(i, t, memo) for t in avroType.types]
        elif isinstance(avroType, AvroFixed):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "fixed", "name": n, "size": avroType.size}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroEnum):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "enum", "name": n, "symbols": avroType.symbols}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroRecord):
            newName = originalNameToNewName[i][avroType.fullName]
            if newName in memo:
                # already being defined in this schema: refer to it by name
                return memo[newName]
            else:
                ns, n = split(newName)
                out = {"type": "record", "name": n, "fields": []}
                if ns is not None:
                    out["namespace"] = ns
                # register before descending so recursive references resolve to the name
                memo[newName] = join(ns, n)
                for f in avroType.fields:
                    newf = {"name": f.name, "type": rename(i, f.avroType, memo)}
                    if f.default is not None:
                        newf["default"] = f.default
                    if f.order is not None:
                        newf["order"] = f.order
                    out["fields"].append(newf)
                return out
        else:
            return jsonlib.loads(repr(avroType))

    avroTypeBuilder = AvroTypeBuilder()
    memo = {}

    def newPlaceholder(i, oldAvroType):
        newAvroType = rename(i, oldAvroType, {})
        return avroTypeBuilder.makePlaceholder(jsonlib.dumps(newAvroType), memo)

    def renameParams(i, params):
        # each parameter is a dict whose first key/value is (parameter name, type)
        return [{list(p.keys())[0]: newPlaceholder(i, list(p.values())[0])} for p in params]

    def makeConverter(source):
        # Bind the original cell/pool now; the converter itself runs only after
        # avroTypeBuilder.resolveTypes(), when loop variables have moved on.
        def converter(avroType):
            original = jsonDecoder(source.avroType, jsonlib.loads(source.init))
            return jsonlib.dumps(jsonEncoder(avroType, original))
        return converter

    # combined name, if not explicitly set
    if name is None:
        name = "Chain_" + "_".join(pfa.name for pfa in pfas)

    # combined method (fold not supported yet, but could be)
    method = Method.MAP
    for pfa in pfas:
        if pfa.method == Method.EMIT:
            method = Method.EMIT
        elif pfa.method == Method.FOLD:
            raise NotImplementedError("chaining of fold-type scoring engines has not been implemented yet")

    # no zero or merge until we support fold method
    zero = None
    merge = None

    # input/output types from first and last
    inputPlaceholder = newPlaceholder(0, pfas[0].input)
    outputPlaceholder = newPlaceholder(len(pfas) - 1, pfas[-1].output)

    log(" Adding [name, instance, metadata, actionsStarted, actionsFinished, version] as model parameters\n")

    cells = {"name": Cell(newPlaceholder(0, AvroString()), jsonlib.dumps(""), False, False, CellPoolSource.EMBEDDED),
             "instance": Cell(newPlaceholder(0, AvroInt()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
             "metadata": Cell(newPlaceholder(0, AvroMap(AvroString())), jsonlib.dumps({}), False, False, CellPoolSource.EMBEDDED),
             "actionsStarted": Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
             "actionsFinished": Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED)}
    if version is not None:
        # NOTE(review): unlike the cells above, init here is the int 0 rather than jsonlib.dumps(0) -- confirm this is intended
        cells["version"] = Cell(newPlaceholder(0, AvroInt()), 0, False, False, CellPoolSource.EMBEDDED)

    pools = {}

    log(" Converting scoring engine algorithm\n")

    # all code will go into user functions, including begin/action/end
    fcns = {}

    begin = [CellTo("name", [], Ref("name")),
             CellTo("instance", [], Ref("instance")),
             CellTo("metadata", [], Ref("metadata"))]
    if version is not None:
        begin.append(CellTo("version", [], Ref("version")))

    action = [CellTo("actionsStarted", [], Ref("actionsStarted")),
              CellTo("actionsFinished", [], Ref("actionsFinished"))]

    end = [CellTo("actionsStarted", [], Ref("actionsStarted")),
           CellTo("actionsFinished", [], Ref("actionsFinished"))]

    # expression classes that genericReplacer rewrites (NewArray included so that its branch is reachable)
    replacedExprs = (FcnDef, FcnRef, FcnRefFill, CallUserFcn, Call, Literal, NewObject, NewArray, CellGet, CellTo, PoolGet, PoolTo, CastCase, Upcast)

    for i, pfa in enumerate(pfas):
        log(" step {0}: {1}\n".format(i + 1, pfa.name))

        thisActionFcnName = prefixAction(i, pfa)
        if i + 1 < len(pfas):
            nextActionFcnName = prefixAction(i + 1, pfas[i + 1])
        else:
            nextActionFcnName = None

        # this is a closure; it must be defined in the loop to pick up free variables
        lazyFcnReplacer = None

        def genericReplacer(expr, replacer):
            if isinstance(expr, FcnDef):
                # this is the one place where we should pass down fcnReplacer rather than replacer
                return FcnDef(renameParams(i, expr.params),
                              newPlaceholder(i, expr.ret),
                              [x.replace(lazyFcnReplacer) for x in expr.body],
                              expr.pos)
            elif isinstance(expr, FcnRef):
                return FcnRef(prefixFcnRef(i, pfa, expr.name), expr.pos)
            elif isinstance(expr, FcnRefFill):
                return FcnRefFill(prefixFcnRef(i, pfa, expr.name),
                                  dict((k, v.replace(replacer)) for k, v in expr.fill.items()),
                                  expr.pos)
            elif isinstance(expr, CallUserFcn):
                # TODO: need to change the symbols of the corresponding enum
                return CallUserFcn(expr.name.replace(replacer),
                                   [x.replace(replacer) for x in expr.args],
                                   expr.pos)
            elif isinstance(expr, Call):
                if pfa.method == Method.EMIT and i + 1 < len(pfas) and expr.name == "emit":
                    # an intermediate emit-type engine "emits" by calling the next step
                    return Call("u." + nextActionFcnName,
                                [x.replace(replacer) for x in expr.args],
                                expr.pos)
                else:
                    return Call(prefixFcnRef(i, pfa, expr.name),
                                [x.replace(replacer) for x in expr.args],
                                expr.pos)
            elif isinstance(expr, Literal):
                return Literal(newPlaceholder(i, expr.avroType), expr.value, expr.pos)
            elif isinstance(expr, NewObject):
                return NewObject(dict((k, v.replace(replacer)) for k, v in expr.fields.items()),
                                 newPlaceholder(i, expr.avroType),
                                 expr.pos)
            elif isinstance(expr, NewArray):
                return NewArray([x.replace(replacer) for x in expr.items],
                                newPlaceholder(i, expr.avroType),
                                expr.pos)
            elif isinstance(expr, CellGet):
                return CellGet(prefixCell(i, pfa, expr.cell),
                               [x.replace(replacer) for x in expr.path],
                               expr.pos)
            elif isinstance(expr, CellTo):
                return CellTo(prefixCell(i, pfa, expr.cell),
                              [x.replace(replacer) for x in expr.path],
                              expr.to.replace(replacer),
                              expr.pos)
            elif isinstance(expr, PoolGet):
                return PoolGet(prefixPool(i, pfa, expr.pool),
                               [x.replace(replacer) for x in expr.path],
                               expr.pos)
            elif isinstance(expr, PoolTo):
                return PoolTo(prefixPool(i, pfa, expr.pool),
                              [x.replace(replacer) for x in expr.path],
                              expr.to.replace(replacer),
                              expr.init.replace(replacer),
                              expr.pos)
            elif isinstance(expr, CastCase):
                return CastCase(newPlaceholder(i, expr.avroType),
                                expr.named,
                                [x.replace(replacer) for x in expr.body],
                                expr.pos)
            elif isinstance(expr, Upcast):
                return Upcast(expr.expr.replace(replacer),
                              newPlaceholder(i, expr.avroType),
                              expr.pos)

        genericReplacer.isDefinedAt = lambda x: isinstance(x, replacedExprs)

        def fcnReplacer(expr):
            return genericReplacer(expr, fcnReplacer)
        fcnReplacer.isDefinedAt = genericReplacer.isDefinedAt
        lazyFcnReplacer = fcnReplacer

        def makeRefReplacer(builtinCells):
            # like fcnReplacer, but references to the engine's built-in symbols
            # become reads of the correspondingly named cells
            def refReplacer(expr):
                if isinstance(expr, Ref):
                    if expr.name in builtinCells or (version is not None and expr.name == "version"):
                        return CellGet(expr.name, [], expr.pos)
                    else:
                        return expr
                else:
                    return genericReplacer(expr, refReplacer)
            refReplacer.isDefinedAt = lambda x: isinstance(x, Ref) or genericReplacer.isDefinedAt(x)
            return refReplacer

        # add statements to begin
        beginReplacer = makeRefReplacer(("name", "instance", "metadata"))
        begin.extend([x.replace(beginReplacer) for x in pfa.begin])

        # add statements to end
        endReplacer = makeRefReplacer(("name", "instance", "metadata", "actionsStarted", "actionsFinished"))
        end.extend([x.replace(endReplacer) for x in pfa.end])

        # convert the action into a user function
        actionReplacer = makeRefReplacer(("name", "instance", "metadata", "actionsStarted", "actionsFinished"))
        body = [x.replace(actionReplacer) for x in pfa.action]

        if method == Method.MAP:
            # if the overall method is MAP, then we know that all of the individual engines are MAP
            # the overall action calls a nested chain of engines-as-functions and each engine-as-a-function just does its job and returns (body is unmodified)
            fcns[thisActionFcnName] = FcnDef([{"input": newPlaceholder(i, pfa.input)}],
                                             newPlaceholder(i, pfa.output),
                                             body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))
            else:
                action[-1] = Call("u." + thisActionFcnName, [action[-1]])

        elif method == Method.EMIT:
            # if the overall method is EMIT, then some individual engines might be MAP or might be EMIT
            # the overall action calls the first engine-as-a-function and the engines-as-functions call each other (body is modified)
            if pfa.method == Method.MAP and i + 1 < len(pfas):
                body = [Call("u." + nextActionFcnName, [Do(body)]), LiteralNull()]
            elif pfa.method == Method.MAP:
                body = [Call("emit", [Do(body)])]
            elif pfa.method == Method.EMIT:
                body.append(LiteralNull())

            fcns[thisActionFcnName] = FcnDef([{"input": newPlaceholder(i, pfa.input)}],
                                             newPlaceholder(i, AvroNull()),
                                             body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))

        # convert all of the user functions into user functions
        for fcnName, fcnDef in pfa.fcns.items():
            # note: some of these user-defined functions may call emit; if so, they'll call the right emit
            fcns[prefixFcnDef(i, pfa, fcnName)] = FcnDef(renameParams(i, fcnDef.paramsPlaceholder),
                                                         newPlaceholder(i, fcnDef.ret),
                                                         [x.replace(fcnReplacer) for x in fcnDef.body],
                                                         fcnDef.pos)

    log(" Create types for model parameters\n")

    for i, pfa in enumerate(pfas):
        if len(pfa.cells) > 0:
            log(" step {0}:\n".format(i + 1))
        for cellName, cell in pfa.cells.items():
            log(" cell {0}\n".format(cellName))
            newCell = Cell(newPlaceholder(i, cell.avroType), cell.init, cell.shared, cell.rollback, cell.source, cell.pos)
            cells[prefixCell(i, pfa, cellName)] = newCell
            if cell.source == "embedded":
                newCell.converter = makeConverter(cell)

    for i, pfa in enumerate(pfas):
        if len(pfa.pools) > 0:
            log(" step {0}:\n".format(i + 1))
        for poolName, pool in pfa.pools.items():
            log(" pool {0}\n".format(poolName))
            newPool = Pool(newPlaceholder(i, pool.avroType), pool.init, pool.shared, pool.rollback, pool.source, pool.pos)
            pools[prefixPool(i, pfa, poolName)] = newPool
            if pool.source == "embedded":
                newPool.converter = makeConverter(pool)

    # make sure all the types work together
    log(" Resolving all types\n")
    avroTypeBuilder.resolveTypes()

    log(" Converting the model parameters themselves\n")

    for i, pfa in enumerate(pfas):
        if len(pfa.cells) > 0:
            log(" step {0}:\n".format(i + 1))
        for cellName, cell in pfa.cells.items():
            log(" cell {0}\n".format(cellName))
            if cell.source == "embedded":
                newCell = cells[prefixCell(i, pfa, cellName)]
                newCell.init = newCell.converter(newCell.avroType)

    for i, pfa in enumerate(pfas):
        if len(pfa.pools) > 0:
            log(" step {0}:\n".format(i + 1))
        for poolName, pool in pfa.pools.items():
            log(" pool {0}\n".format(poolName))
            if pool.source == "embedded":
                newPool = pools[prefixPool(i, pfa, poolName)]
                newPool.init = newPool.converter(newPool.avroType)

    # randseed, doc, version, metadata, and options need to be explicitly set

    # return a (possibly checked) AST
    out = EngineConfig(name,
                       method,
                       inputPlaceholder,
                       outputPlaceholder,
                       begin,
                       action,
                       end,
                       fcns,
                       zero,
                       merge,
                       cells,
                       pools,
                       randseed,
                       doc,
                       version,
                       metadata,
                       options)

    if check:
        log(" Verifying PFA validity\n")
        # the result is discarded; presumably this raises if the chained AST is invalid
        PFAEngine.fromAst(out)

    log(" Done\n")
    return out
def ast(pfas, check=True, name=None, randseed=None, doc=None, version=None, metadata=None, options=None, tryYaml=False, verbose=False):
    """Create a single PFA from a chained workflow, returning the result as an abstract syntax tree.

    Every step's type names, cells, pools and user functions are prefixed with
    ``step<N>_<engine name>`` so that they cannot collide; each step's action
    becomes a user function of the combined engine.

    :type pfas: list of titus.pfaast.EngineConfig, Pythonized JSON, or JSON strings
    :param pfas: PFA documents for which the output of document *i* is the input to document *i + 1*
    :type check: bool
    :param check: test the chained PFA for validity
    :type name: string or ``None``
    :param name: optional name for the chained PFA
    :type randseed: integer or ``None``
    :param randseed: optional random number seed for the chained PFA
    :type doc: string or ``None``
    :param doc: optional documentation string for the chained PFA
    :type version: integer or ``None``
    :param version: optional version number for the chained PFA
    :type metadata: dict of strings or ``None``
    :param metadata: metadata for the chained PFA (``None``, the default, means ``{}``)
    :type options: dict of Pythonized JSON or ``None``
    :param options: implementation options for the chained PFA (``None``, the default, means ``{}``)
    :type tryYaml: bool
    :param tryYaml: if ``True``, attempt to interpret ``pfas`` as YAML (assuming they fail as JSON)
    :type verbose: bool
    :param verbose: if ``True``, write status messages to standard error
    :rtype: titus.pfaast.EngineConfig
    :return: a PFA document representing the chained workflow
    :raises PFAChainError: if the output type of a step is not compatible with the input type of the next step
    :raises NotImplementedError: if any step is a fold-type engine
    :raises ValueError: if a step cannot be read as JSON and ``tryYaml`` is ``False``
    """

    # a fresh dict per call: a literal {} default would be shared between calls
    # and with every EngineConfig returned from here
    if metadata is None:
        metadata = {}
    if options is None:
        options = {}

    def log(message):
        # status messages go to stderr, prefixed with a timestamp
        if verbose:
            sys.stderr.write(time.asctime() + message)

    # normalize all input forms to ASTs
    log(" Converting all inputs to ASTs\n")
    asts = []
    for i, src in enumerate(pfas):
        log(" step {0}\n".format(i + 1))
        if isinstance(src, EngineConfig):
            engineAst = src
        elif isinstance(src, dict):
            engineAst = titus.reader.jsonToAst(src)
        else:
            try:
                engineAst = titus.reader.jsonToAst(src)
            except ValueError:
                if tryYaml:
                    engineAst = titus.reader.yamlToAst(src)
                else:
                    raise
        asts.append(engineAst)
    pfas = asts

    # helper functions for transforming names
    def split(t):
        # "a.b.C" -> ("a.b", "C"); "C" -> (None, "C")
        if "." in t:
            ns, _, n = t.rpartition(".")
            return ns, n
        else:
            return None, t

    def join(ns, n):
        if ns is None or ns == "":
            return n
        else:
            return ns + "." + n

    def prefixType(i, pfa, t):
        ns, n = split(t)
        return join(ns, "Step{0:d}_{1}_{2}".format(i + 1, pfa.name, n))

    def prefixAction(i, pfa):
        return "step{0:d}_{1}_action".format(i + 1, pfa.name)

    def prefixFcnRef(i, pfa, x):
        # only user functions ("u.*") are renamed; library function names pass through
        if x.startswith("u."):
            return "u.step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x[2:])
        else:
            return x

    def prefixFcnDef(i, pfa, x):
        return "step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x)

    def prefixCell(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    def prefixPool(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    # define new names for all types to avoid type name collisions
    log(" Changing type names to avoid collisions\n")
    originalNameToNewName = {}
    for i, pfa in enumerate(pfas):
        originalNameToNewName[i] = {}
        for typeName in pfa.inputPlaceholder.parser.names.names:
            # the parser may report names with a leading "."; the table is keyed without it
            if typeName.startswith("."):
                keyTypeName = typeName[1:]
            else:
                keyTypeName = typeName
            originalNameToNewName[i][keyTypeName] = prefixType(i, pfa, typeName)

    # but any names in the input to the first and the output from the last should not be changed
    def trivialName(i, avroType, memo):
        if isinstance(avroType, AvroArray):
            trivialName(i, avroType.items, memo)
        elif isinstance(avroType, AvroMap):
            trivialName(i, avroType.values, memo)
        elif isinstance(avroType, AvroUnion):
            for t in avroType.types:
                trivialName(i, t, memo)
        elif isinstance(avroType, (AvroFixed, AvroEnum)):
            t = avroType.fullName
            originalNameToNewName[i][t] = t
        elif isinstance(avroType, AvroRecord):
            t = avroType.fullName
            # memo guards against infinite recursion on recursive records
            if t not in memo:
                memo.add(t)
                for f in avroType.fields:
                    trivialName(i, f.avroType, memo)
                originalNameToNewName[i][t] = t

    trivialName(0, pfas[0].input, set())
    trivialName(len(pfas) - 1, pfas[-1].output, set())

    # ensure that chained types match and will be given the same names
    log(" Verifying that input/output schemas match along the chain\n")

    unnamedPrimitives = (AvroNull, AvroBoolean, AvroInt, AvroLong, AvroFloat, AvroDouble, AvroBytes)

    def chainPair(i, first, second, memo):
        # Returns True if the output type `first` of step i can feed the input type
        # `second` of step i + 1; as a side effect, named types of step i + 1 are
        # mapped to the new name already chosen for the matching type of step i.
        if any(isinstance(first, p) and isinstance(second, p) for p in unnamedPrimitives):
            return True
        elif isinstance(first, AvroFixed) and isinstance(second, AvroFixed):
            if first.size == second.size:
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroString) and isinstance(second, AvroString):
            return True
        elif isinstance(first, AvroEnum) and isinstance(second, AvroEnum):
            if first.symbols == second.symbols:
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroArray) and isinstance(second, AvroArray):
            return chainPair(i, first.items, second.items, memo)
        elif isinstance(first, AvroMap) and isinstance(second, AvroMap):
            return chainPair(i, first.values, second.values, memo)
        elif isinstance(first, AvroRecord) and isinstance(second, AvroRecord):
            # memo guards against infinite recursion on recursive records
            if first.fullName not in memo:
                memo.add(first.fullName)
                if len(first.fields) != len(second.fields):
                    return False
                for f1, f2 in zip(first.fields, second.fields):
                    if f1.name != f2.name:
                        return False
                    elif not chainPair(i, f1.avroType, f2.avroType, memo):
                        return False
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
            return True
        elif isinstance(first, AvroUnion) and isinstance(second, AvroUnion):
            # every member of the input union must be matched by some member of the output union
            for yt in second.types:
                if not any(chainPair(i, xt, yt, memo) for xt in first.types):
                    return False
            return True
        else:
            return False

    for i in range(len(pfas) - 1):
        first = pfas[i].output
        second = pfas[i + 1].input
        if not chainPair(i, first, second, set()):
            raise PFAChainError("output of engine {0}: {1} not compatible with input of engine {2}: {3}".format(i + 1, ts(first), i + 2, ts(second)))

    def rename(i, avroType, memo):
        # Build the Pythonized-JSON schema of avroType with step i's named types renamed.
        if isinstance(avroType, AvroArray):
            return {"type": "array", "items": rename(i, avroType.items, memo)}
        elif isinstance(avroType, AvroMap):
            return {"type": "map", "values": rename(i, avroType.values, memo)}
        elif isinstance(avroType, AvroUnion):
            return [rename(i, t, memo) for t in avroType.types]
        elif isinstance(avroType, AvroFixed):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "fixed", "name": n, "size": avroType.size}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroEnum):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "enum", "name": n, "symbols": avroType.symbols}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroRecord):
            newName = originalNameToNewName[i][avroType.fullName]
            if newName in memo:
                # already being defined in this schema: refer to it by name
                return memo[newName]
            else:
                ns, n = split(newName)
                out = {"type": "record", "name": n, "fields": []}
                if ns is not None:
                    out["namespace"] = ns
                # register before descending so recursive references resolve to the name
                memo[newName] = join(ns, n)
                for f in avroType.fields:
                    newf = {"name": f.name, "type": rename(i, f.avroType, memo)}
                    if f.default is not None:
                        newf["default"] = f.default
                    if f.order is not None:
                        newf["order"] = f.order
                    out["fields"].append(newf)
                return out
        else:
            return jsonlib.loads(repr(avroType))

    avroTypeBuilder = AvroTypeBuilder()
    memo = {}

    def newPlaceholder(i, oldAvroType):
        newAvroType = rename(i, oldAvroType, {})
        return avroTypeBuilder.makePlaceholder(jsonlib.dumps(newAvroType), memo)

    def renameParams(i, params):
        # each parameter is a dict whose first key/value is (parameter name, type)
        return [{list(p.keys())[0]: newPlaceholder(i, list(p.values())[0])} for p in params]

    def makeConverter(source):
        # Bind the original cell/pool now; the converter itself runs only after
        # avroTypeBuilder.resolveTypes(), when loop variables have moved on.
        def converter(avroType):
            original = jsonDecoder(source.avroType, jsonlib.loads(source.init))
            return jsonlib.dumps(jsonEncoder(avroType, original))
        return converter

    # combined name, if not explicitly set
    if name is None:
        name = "Chain_" + "_".join(pfa.name for pfa in pfas)

    # combined method (fold not supported yet, but could be)
    method = Method.MAP
    for pfa in pfas:
        if pfa.method == Method.EMIT:
            method = Method.EMIT
        elif pfa.method == Method.FOLD:
            raise NotImplementedError("chaining of fold-type scoring engines has not been implemented yet")

    # no zero or merge until we support fold method
    zero = None
    merge = None

    # input/output types from first and last
    inputPlaceholder = newPlaceholder(0, pfas[0].input)
    outputPlaceholder = newPlaceholder(len(pfas) - 1, pfas[-1].output)

    log(" Adding [name, instance, metadata, actionsStarted, actionsFinished, version] as model parameters\n")

    cells = {
        "name": Cell(newPlaceholder(0, AvroString()), jsonlib.dumps(""), False, False, CellPoolSource.EMBEDDED),
        "instance": Cell(newPlaceholder(0, AvroInt()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
        "metadata": Cell(newPlaceholder(0, AvroMap(AvroString())), jsonlib.dumps({}), False, False, CellPoolSource.EMBEDDED),
        "actionsStarted": Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
        "actionsFinished": Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
    }
    if version is not None:
        # NOTE(review): unlike the cells above, init here is the int 0 rather than jsonlib.dumps(0) -- confirm this is intended
        cells["version"] = Cell(newPlaceholder(0, AvroInt()), 0, False, False, CellPoolSource.EMBEDDED)

    pools = {}

    log(" Converting scoring engine algorithm\n")

    # all code will go into user functions, including begin/action/end
    fcns = {}

    begin = [
        CellTo("name", [], Ref("name")),
        CellTo("instance", [], Ref("instance")),
        CellTo("metadata", [], Ref("metadata")),
    ]
    if version is not None:
        begin.append(CellTo("version", [], Ref("version")))

    action = [
        CellTo("actionsStarted", [], Ref("actionsStarted")),
        CellTo("actionsFinished", [], Ref("actionsFinished")),
    ]

    end = [
        CellTo("actionsStarted", [], Ref("actionsStarted")),
        CellTo("actionsFinished", [], Ref("actionsFinished")),
    ]

    # expression classes that genericReplacer rewrites (NewArray included so that its branch is reachable)
    replacedExprs = (FcnDef, FcnRef, FcnRefFill, CallUserFcn, Call, Literal, NewObject, NewArray, CellGet, CellTo, PoolGet, PoolTo, CastCase, Upcast)

    for i, pfa in enumerate(pfas):
        log(" step {0}: {1}\n".format(i + 1, pfa.name))

        thisActionFcnName = prefixAction(i, pfa)
        if i + 1 < len(pfas):
            nextActionFcnName = prefixAction(i + 1, pfas[i + 1])
        else:
            nextActionFcnName = None

        # this is a closure; it must be defined in the loop to pick up free variables
        lazyFcnReplacer = None

        def genericReplacer(expr, replacer):
            if isinstance(expr, FcnDef):
                # this is the one place where we should pass down fcnReplacer rather than replacer
                return FcnDef(renameParams(i, expr.params),
                              newPlaceholder(i, expr.ret),
                              [x.replace(lazyFcnReplacer) for x in expr.body],
                              expr.pos)
            elif isinstance(expr, FcnRef):
                return FcnRef(prefixFcnRef(i, pfa, expr.name), expr.pos)
            elif isinstance(expr, FcnRefFill):
                return FcnRefFill(prefixFcnRef(i, pfa, expr.name),
                                  dict((k, v.replace(replacer)) for k, v in expr.fill.items()),
                                  expr.pos)
            elif isinstance(expr, CallUserFcn):
                # TODO: need to change the symbols of the corresponding enum
                return CallUserFcn(expr.name.replace(replacer),
                                   [x.replace(replacer) for x in expr.args],
                                   expr.pos)
            elif isinstance(expr, Call):
                if pfa.method == Method.EMIT and i + 1 < len(pfas) and expr.name == "emit":
                    # an intermediate emit-type engine "emits" by calling the next step
                    return Call("u." + nextActionFcnName,
                                [x.replace(replacer) for x in expr.args],
                                expr.pos)
                else:
                    return Call(prefixFcnRef(i, pfa, expr.name),
                                [x.replace(replacer) for x in expr.args],
                                expr.pos)
            elif isinstance(expr, Literal):
                return Literal(newPlaceholder(i, expr.avroType), expr.value, expr.pos)
            elif isinstance(expr, NewObject):
                return NewObject(dict((k, v.replace(replacer)) for k, v in expr.fields.items()),
                                 newPlaceholder(i, expr.avroType),
                                 expr.pos)
            elif isinstance(expr, NewArray):
                return NewArray([x.replace(replacer) for x in expr.items],
                                newPlaceholder(i, expr.avroType),
                                expr.pos)
            elif isinstance(expr, CellGet):
                return CellGet(prefixCell(i, pfa, expr.cell),
                               [x.replace(replacer) for x in expr.path],
                               expr.pos)
            elif isinstance(expr, CellTo):
                return CellTo(prefixCell(i, pfa, expr.cell),
                              [x.replace(replacer) for x in expr.path],
                              expr.to.replace(replacer),
                              expr.pos)
            elif isinstance(expr, PoolGet):
                return PoolGet(prefixPool(i, pfa, expr.pool),
                               [x.replace(replacer) for x in expr.path],
                               expr.pos)
            elif isinstance(expr, PoolTo):
                return PoolTo(prefixPool(i, pfa, expr.pool),
                              [x.replace(replacer) for x in expr.path],
                              expr.to.replace(replacer),
                              expr.init.replace(replacer),
                              expr.pos)
            elif isinstance(expr, CastCase):
                return CastCase(newPlaceholder(i, expr.avroType),
                                expr.named,
                                [x.replace(replacer) for x in expr.body],
                                expr.pos)
            elif isinstance(expr, Upcast):
                return Upcast(expr.expr.replace(replacer),
                              newPlaceholder(i, expr.avroType),
                              expr.pos)

        genericReplacer.isDefinedAt = lambda x: isinstance(x, replacedExprs)

        def fcnReplacer(expr):
            return genericReplacer(expr, fcnReplacer)
        fcnReplacer.isDefinedAt = genericReplacer.isDefinedAt
        lazyFcnReplacer = fcnReplacer

        def makeRefReplacer(builtinCells):
            # like fcnReplacer, but references to the engine's built-in symbols
            # become reads of the correspondingly named cells
            def refReplacer(expr):
                if isinstance(expr, Ref):
                    if expr.name in builtinCells or (version is not None and expr.name == "version"):
                        return CellGet(expr.name, [], expr.pos)
                    else:
                        return expr
                else:
                    return genericReplacer(expr, refReplacer)
            refReplacer.isDefinedAt = lambda x: isinstance(x, Ref) or genericReplacer.isDefinedAt(x)
            return refReplacer

        # add statements to begin
        beginReplacer = makeRefReplacer(("name", "instance", "metadata"))
        begin.extend([x.replace(beginReplacer) for x in pfa.begin])

        # add statements to end
        endReplacer = makeRefReplacer(("name", "instance", "metadata", "actionsStarted", "actionsFinished"))
        end.extend([x.replace(endReplacer) for x in pfa.end])

        # convert the action into a user function
        actionReplacer = makeRefReplacer(("name", "instance", "metadata", "actionsStarted", "actionsFinished"))
        body = [x.replace(actionReplacer) for x in pfa.action]

        if method == Method.MAP:
            # if the overall method is MAP, then we know that all of the individual engines are MAP
            # the overall action calls a nested chain of engines-as-functions and each engine-as-a-function just does its job and returns (body is unmodified)
            fcns[thisActionFcnName] = FcnDef([{"input": newPlaceholder(i, pfa.input)}],
                                             newPlaceholder(i, pfa.output),
                                             body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))
            else:
                action[-1] = Call("u." + thisActionFcnName, [action[-1]])

        elif method == Method.EMIT:
            # if the overall method is EMIT, then some individual engines might be MAP or might be EMIT
            # the overall action calls the first engine-as-a-function and the engines-as-functions call each other (body is modified)
            if pfa.method == Method.MAP and i + 1 < len(pfas):
                body = [Call("u." + nextActionFcnName, [Do(body)]), LiteralNull()]
            elif pfa.method == Method.MAP:
                body = [Call("emit", [Do(body)])]
            elif pfa.method == Method.EMIT:
                body.append(LiteralNull())

            fcns[thisActionFcnName] = FcnDef([{"input": newPlaceholder(i, pfa.input)}],
                                             newPlaceholder(i, AvroNull()),
                                             body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))

        # convert all of the user functions into user functions
        for fcnName, fcnDef in pfa.fcns.items():
            # note: some of these user-defined functions may call emit; if so, they'll call the right emit
            fcns[prefixFcnDef(i, pfa, fcnName)] = FcnDef(renameParams(i, fcnDef.paramsPlaceholder),
                                                         newPlaceholder(i, fcnDef.ret),
                                                         [x.replace(fcnReplacer) for x in fcnDef.body],
                                                         fcnDef.pos)

    log(" Create types for model parameters\n")

    for i, pfa in enumerate(pfas):
        if len(pfa.cells) > 0:
            log(" step {0}:\n".format(i + 1))
        for cellName, cell in pfa.cells.items():
            log(" cell {0}\n".format(cellName))
            newCell = Cell(newPlaceholder(i, cell.avroType), cell.init, cell.shared, cell.rollback, cell.source, cell.pos)
            cells[prefixCell(i, pfa, cellName)] = newCell
            if cell.source == "embedded":
                newCell.converter = makeConverter(cell)

    for i, pfa in enumerate(pfas):
        if len(pfa.pools) > 0:
            log(" step {0}:\n".format(i + 1))
        for poolName, pool in pfa.pools.items():
            log(" pool {0}\n".format(poolName))
            newPool = Pool(newPlaceholder(i, pool.avroType), pool.init, pool.shared, pool.rollback, pool.source, pool.pos)
            pools[prefixPool(i, pfa, poolName)] = newPool
            if pool.source == "embedded":
                newPool.converter = makeConverter(pool)

    # make sure all the types work together
    log(" Resolving all types\n")
    avroTypeBuilder.resolveTypes()

    log(" Converting the model parameters themselves\n")

    for i, pfa in enumerate(pfas):
        if len(pfa.cells) > 0:
            log(" step {0}:\n".format(i + 1))
        for cellName, cell in pfa.cells.items():
            log(" cell {0}\n".format(cellName))
            if cell.source == "embedded":
                newCell = cells[prefixCell(i, pfa, cellName)]
                newCell.init = newCell.converter(newCell.avroType)

    for i, pfa in enumerate(pfas):
        if len(pfa.pools) > 0:
            log(" step {0}:\n".format(i + 1))
        for poolName, pool in pfa.pools.items():
            log(" pool {0}\n".format(poolName))
            if pool.source == "embedded":
                newPool = pools[prefixPool(i, pfa, poolName)]
                newPool.init = newPool.converter(newPool.avroType)

    # randseed, doc, version, metadata, and options need to be explicitly set

    # return a (possibly checked) AST
    out = EngineConfig(name,
                       method,
                       inputPlaceholder,
                       outputPlaceholder,
                       begin,
                       action,
                       end,
                       fcns,
                       zero,
                       merge,
                       cells,
                       pools,
                       randseed,
                       doc,
                       version,
                       metadata,
                       options)

    if check:
        log(" Verifying PFA validity\n")
        # the result is discarded; presumably this raises if the chained AST is invalid
        PFAEngine.fromAst(out)

    log(" Done\n")
    return out
def jsonDecoder(avroType, value):
    """Decode a JSON object as a given titus.datatype.AvroType.

    Numeric types are coerced with ``int``, ``long`` or ``float``, so any
    input those constructors accept (for example a numeric string) is
    converted rather than rejected.  Missing record fields are filled from
    the field's default when it is not ``None``, or with ``None`` when the
    field's type is null.  Union values must be ``{tag: value}`` objects whose
    tag is the ``name`` of one of the union's members, or ``None`` when the
    union has a null member.

    :type avroType: titus.datatype.AvroType
    :param avroType: how we want to interpret this JSON
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: the JSON object in Python encoding
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: an object ready for PFAEngine.action
    :raises titus.errors.AvroException: if ``value`` cannot be interpreted as ``avroType``
    :raises Exception: if ``avroType`` is not one of the AvroType classes handled here
    """
    # A failed numeric coercion must fall through to the AvroException at the
    # bottom of the function.  ValueError covers unparsable strings and NaN,
    # TypeError covers None and containers, and OverflowError covers
    # infinities and integers too large for a float.
    coercionErrors = (ValueError, TypeError, OverflowError)

    if isinstance(avroType, AvroNull):
        if value is None:
            return value
    elif isinstance(avroType, AvroBoolean):
        if value is True or value is False:
            return value
    elif isinstance(avroType, AvroInt):
        try:
            return int(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroLong):
        try:
            return long(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroFloat):
        try:
            return float(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroDouble):
        try:
            return float(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroBytes):
        if isinstance(value, basestring):
            return bytes(value)
    elif isinstance(avroType, AvroFixed):
        if isinstance(value, basestring):
            out = bytes(value)
            # a fixed value is only valid at exactly the declared size
            if len(out) == avroType.size:
                return out
    elif isinstance(avroType, AvroString):
        if isinstance(value, basestring):
            return value
    elif isinstance(avroType, AvroEnum):
        if isinstance(value, basestring) and value in avroType.symbols:
            return value
    elif isinstance(avroType, AvroArray):
        if isinstance(value, (list, tuple)):
            return [jsonDecoder(avroType.items, x) for x in value]
    elif isinstance(avroType, AvroMap):
        if isinstance(value, dict):
            return dict((k, jsonDecoder(avroType.values, v)) for k, v in value.items())
    elif isinstance(avroType, AvroRecord):
        if isinstance(value, dict):
            out = {}
            for field in avroType.fields:
                if field.name in value:
                    out[field.name] = jsonDecoder(field.avroType, value[field.name])
                elif field.default is not None:
                    out[field.name] = jsonDecoder(field.avroType, field.default)
                elif isinstance(field.avroType, AvroNull):
                    out[field.name] = None
                else:
                    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
            return out
    elif isinstance(avroType, AvroUnion):
        if isinstance(value, dict) and len(value) == 1:
            tag, = value.keys()
            val, = value.values()
            types = dict((x.name, x) for x in avroType.types)
            if tag in types:
                return {tag: jsonDecoder(types[tag], val)}
        elif value is None and "null" in [x.name for x in avroType.types]:
            return None
    else:
        # avroType is none of the classes handled above
        raise Exception

    # every branch that recognized the type but rejected the value ends up here
    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
def jsonEncoder(avroType, value, tagged=True):
    """Encode an object as JSON, given titus.datatype.AvroType.

    For a union, a ``None`` value is returned as ``None`` when the union has a
    null member.  A one-entry dict is first read as an explicit
    ``{tag: value}``: the member whose ``name`` equals the tag is tried first,
    then the remaining members in declaration order.  If that fails, the
    object itself is tried against every member as an untagged value.

    :type avroType: titus.datatype.AvroType
    :param avroType: type of this object
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: the object returned from PFAEngine.action
    :type tagged: bool
    :param tagged: if True, represent unions as ``{tag: value}``; if False, represent them simply as ``value``.
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the JSON object in Python encoding
    :raises titus.errors.AvroException: if ``value`` does not match ``avroType``
    """
    if isinstance(avroType, AvroNull) and value is None:
        return value
    elif isinstance(avroType, AvroBoolean) and (value is True or value is False):
        return value
    # bool is a subclass of int, hence the explicit exclusion of True/False
    elif isinstance(avroType, AvroInt) and isinstance(value, (int, long)) and value is not True and value is not False:
        return value
    elif isinstance(avroType, AvroLong) and isinstance(value, (int, long)) and value is not True and value is not False:
        return value
    elif isinstance(avroType, AvroFloat) and isinstance(value, (int, long, float)) and value is not True and value is not False:
        return float(value)
    elif isinstance(avroType, AvroDouble) and isinstance(value, (int, long, float)) and value is not True and value is not False:
        return float(value)
    elif isinstance(avroType, AvroBytes) and isinstance(value, basestring):
        return value
    elif isinstance(avroType, AvroFixed) and isinstance(value, basestring):
        out = bytes(value)
        # wrong-sized values fall through to the AvroException below
        if len(out) == avroType.size:
            return out
    elif isinstance(avroType, AvroString) and isinstance(value, basestring):
        return value
    elif isinstance(avroType, AvroEnum) and isinstance(value, basestring) and value in avroType.symbols:
        return value
    elif isinstance(avroType, AvroArray) and isinstance(value, (list, tuple)):
        return [jsonEncoder(avroType.items, x, tagged) for x in value]
    elif isinstance(avroType, AvroMap) and isinstance(value, dict):
        return dict((k, jsonEncoder(avroType.values, v, tagged)) for k, v in value.items())
    elif isinstance(avroType, AvroRecord) and isinstance(value, dict):
        out = {}
        for field in avroType.fields:
            if field.name in value:
                out[field.name] = jsonEncoder(field.avroType, value[field.name], tagged)
            elif field.default is not None:
                # a missing field that has a default is simply left out
                pass
            else:
                raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
        return out
    elif isinstance(avroType, AvroUnion) and any(isinstance(t, AvroNull) for t in avroType.types) and value is None:
        return None
    elif isinstance(avroType, AvroUnion):
        def wrap(member, encoded):
            # apply the requested union representation
            if tagged:
                return {member.name: encoded}
            else:
                return encoded

        if isinstance(value, dict) and len(value) == 1:
            (tag, val), = value.items()
            # Honor an explicit tag: the member it names gets the first try, so
            # {"double": 3} is not re-tagged as an earlier member that also
            # happens to accept 3.  The other members keep their old order.
            named = [t for t in avroType.types if t.name == tag]
            others = [t for t in avroType.types if t.name != tag]
            for t in named + others:
                try:
                    out = jsonEncoder(t, val, tagged)
                except titus.errors.AvroException:
                    pass
                else:
                    return wrap(t, out)

        # untagged value (or a one-entry dict that is itself the value)
        for t in avroType.types:
            try:
                out = jsonEncoder(t, value, tagged)
            except titus.errors.AvroException:
                pass
            else:
                return wrap(t, out)

    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
def removeDuplicateNames(self, x, memo):
    """Return a copy of JSON schema node ``x`` in which repeated named-type declarations are replaced by their names.

    A dict counts as a named-type declaration when it has both ``"name"`` and
    ``"type"`` keys and its ``"type"`` is ``"enum"``, ``"fixed"`` or
    ``"record"``.  The first declaration of each full name (``namespace.name``
    when a ``"namespace"`` key is present, otherwise just ``name``) is stored
    in ``memo`` and copied recursively; later equal declarations are replaced
    by the full name string.

    :type x: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param x: schema node to process
    :type memo: dict
    :param memo: full name -> first declaration seen; updated in place
    :return: the rewritten node (lists and tuples both come back as lists)
    :raises titus.errors.AvroException: if a name is declared again with a different definition
    """
    if isinstance(x, (list, tuple)):
        return [self.removeDuplicateNames(item, memo) for item in x]

    if not isinstance(x, dict):
        # scalars pass through untouched
        return x

    declaresNamedType = "name" in x and "type" in x and x["type"] in ("enum", "fixed", "record")
    if declaresNamedType:
        fullName = x["name"]
        if "namespace" in x:
            fullName = x["namespace"] + "." + fullName

        if fullName in memo:
            previous = memo[fullName]
            if previous != x:
                raise titus.errors.AvroException("type name \"{0}\" previously defined as\n{1}\nnow defined as\n{2}".format(fullName, ts(jsonNodeToAvroType(previous)), ts(jsonNodeToAvroType(x))))
            return fullName

        # register before descending so nested repeats of this type become references
        memo[fullName] = x

    return dict((key, self.removeDuplicateNames(item, memo)) for key, item in x.items())
def checkData(data, avroType):
    """Validate ``data`` against ``avroType`` and return it in normalized form.

    The return value is the (possibly converted) data, not a boolean:

    * null accepts ``None`` or the string ``"null"`` and yields ``None``;
    * boolean accepts ``"true"``/``"false"``, instances of ``booleanTypes``, ``True`` and ``False``;
    * int and long accept strings parsable by ``int``, instances of ``integerTypes``, and Python integers;
    * float and double accept strings parsable by ``float``, instances of ``floatTypes``, and Python numbers;
    * bytes and fixed yield ``str`` (``unicode`` is UTF-8 encoded);
    * string and enum yield ``unicode`` (``str`` is UTF-8 decoded);
    * array, map and record are rebuilt as a new list or dict with checked contents;
    * a union value comes back as ``{tag: value}``, or as the bare value for the null member.

    If ``avroType`` is none of these types, ``data`` is returned unchanged.

    :type data: any Python object
    :param data: the datum to check
    :type avroType: titus.datatype.AvroType
    :param avroType: the type ``data`` must satisfy
    :return: the normalized datum
    :raises TypeError: if ``data`` does not satisfy ``avroType``
    """
    def mismatch():
        # the generic failure message shared by most branches
        return TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    if isinstance(avroType, AvroNull):
        if data == "null" or data is None:
            return None
        raise mismatch()

    elif isinstance(avroType, AvroBoolean):
        if data == "true":
            return True
        if data == "false":
            return False
        if isinstance(data, booleanTypes):
            return bool(data)
        if data is True or data is False:
            return data
        raise mismatch()

    elif isinstance(avroType, (AvroInt, AvroLong)):
        if isinstance(data, basestring):
            try:
                return int(data)
            except ValueError:
                raise mismatch()
        if isinstance(data, integerTypes):
            return int(data)
        if isinstance(data, (int, long)):
            return data
        raise mismatch()

    elif isinstance(avroType, (AvroFloat, AvroDouble)):
        if isinstance(data, basestring):
            try:
                return float(data)
            except ValueError:
                raise mismatch()
        if isinstance(data, floatTypes) or isinstance(data, (int, long)):
            return float(data)
        if isinstance(data, float):
            return data
        raise mismatch()

    elif isinstance(avroType, (AvroBytes, AvroFixed)):
        if isinstance(data, unicode):
            return data.encode("utf-8", "replace")
        if isinstance(data, str):
            return data
        raise mismatch()

    elif isinstance(avroType, (AvroString, AvroEnum)):
        if isinstance(data, str):
            return data.decode("utf-8", "replace")
        if isinstance(data, unicode):
            return data
        raise mismatch()

    elif isinstance(avroType, AvroArray):
        if not hasattr(data, "__iter__"):
            raise mismatch()
        return [checkData(item, avroType.items) for item in data]

    elif isinstance(avroType, AvroMap):
        if not (hasattr(data, "__iter__") and hasattr(data, "__getitem__")):
            raise mismatch()
        checked = {}
        for key in data:
            # the value is checked before the key
            item = checkData(data[key], avroType.values)
            if isinstance(key, str):
                checked[key.decode("utf-8", "replace")] = item
            elif isinstance(key, unicode):
                checked[key] = item
            else:
                raise TypeError("expecting {0}, found key {1}".format(ts(avroType), key))
        return checked

    elif isinstance(avroType, AvroRecord):
        if not (hasattr(data, "__iter__") and hasattr(data, "__getitem__")):
            raise mismatch()
        checked = {}
        for field in avroType.fields:
            try:
                item = data[field.name]
            except KeyError:
                raise TypeError("expecting {0}, couldn't find key {1}".format(ts(avroType), field.name))
            checked[field.name] = checkData(item, field.avroType)
        return checked

    elif isinstance(avroType, AvroUnion):
        if isinstance(data, dict) and len(data) == 1:
            # a one-entry dict is always read as {tag: value}
            tag, = data.keys()
            inner, = data.values()
            for member in avroType.types:
                if member.name == tag:
                    if tag == "null":
                        return checkData(inner, member)
                    return {tag: checkData(inner, member)}
            raise mismatch()

        # untagged: the first member that accepts the data wins
        for member in avroType.types:
            try:
                checked = checkData(data, member)
            except TypeError:
                continue
            if member.name == "null":
                return checked
            return {member.name: checked}
        raise mismatch()

    return data
def compare(avroType, x, y):
    """Returns -1, 0, or 1 depending on whether x is less than, equal to, or greater than y, according to the schema.

    Assumes that x and y are valid examples of the schema.

    Ordering rules implemented here:

    * float and double: NaN equals NaN and is greater than every other value;
    * enum: by position in the symbol list;
    * array: element by element, then by length;
    * record: field by field in declaration order, skipping fields whose order
      is ``"ignore"`` and inverting fields whose order is ``"descending"``;
    * union: by position of the member type, then by value within the same member.

    :type avroType: titus.datatype.AvroType
    :param avroType: type of this object
    :type x: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param x: Avro object in Python form
    :type y: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param y: Avro object in Python form
    :rtype: int
    :return: -1, 0, or 1
    :raises NotImplementedError: if ``avroType`` is a map
    :raises titus.errors.AvroException: if x or y does not match the schema
    """
    def isNumber(v, types):
        # bool is a subclass of int, so True/False are excluded explicitly
        return isinstance(v, types) and v is not True and v is not False

    def resolveMember(datum):
        # Find which member of the union ``datum`` belongs to.
        # Returns (member index, member type, datum without its tag).
        if isinstance(datum, dict) and len(datum) == 1:
            (tag, inner), = datum.items()
            for ti, t in enumerate(avroType.types):
                if t.name == tag:
                    return ti, t, inner
            # No member has that name, so the one-entry dict is itself the
            # value; fall through and treat it as untagged.
        found = None
        for ti, t in enumerate(avroType.types):
            try:
                jsonEncoder(t, datum)
            except titus.errors.AvroException:
                pass
            else:
                # deliberately no break: the last accepting member wins
                found = (ti, t)
        if found is None:
            raise titus.errors.AvroException()
        return found[0], found[1], datum

    if isinstance(avroType, AvroNull) and x is None and y is None:
        return 0

    elif isinstance(avroType, AvroBoolean) and (x is True or x is False) and (y is True or y is False):
        return cmp(x, y)   # agrees with Java

    elif isinstance(avroType, (AvroInt, AvroLong)) and isNumber(x, (int, long)) and isNumber(y, (int, long)):
        return cmp(x, y)

    elif isinstance(avroType, (AvroFloat, AvroDouble)) and isNumber(x, (int, long, float)) and isNumber(y, (int, long, float)):
        # float now shares the NaN-aware ordering used for double
        if math.isnan(x):
            return 0 if math.isnan(y) else 1
        if math.isnan(y):
            return -1
        return cmp(x, y)

    elif isinstance(avroType, (AvroBytes, AvroFixed, AvroString)) and isinstance(x, basestring) and isinstance(y, basestring):
        return cmp(x, y)

    elif isinstance(avroType, AvroEnum) and isinstance(x, basestring) and x in avroType.symbols and isinstance(y, basestring) and y in avroType.symbols:
        return cmp(avroType.symbols.index(x), avroType.symbols.index(y))

    elif isinstance(avroType, AvroArray) and isinstance(x, (list, tuple)) and isinstance(y, (list, tuple)):
        for xi, yi in zip(x, y):
            comparison = compare(avroType.items, xi, yi)
            if comparison != 0:
                return comparison
        # equal over the common prefix: the shorter array sorts first
        return cmp(len(x), len(y))

    elif isinstance(avroType, AvroMap) and isinstance(x, dict) and isinstance(y, dict):
        raise NotImplementedError("Avro has no order defined for maps???")

    elif isinstance(avroType, AvroRecord) and isinstance(x, dict) and isinstance(y, dict):
        for field in avroType.fields:
            if field.order == "ignore":
                continue
            comparison = compare(field.avroType, x[field.name], y[field.name])
            if comparison != 0:
                if field.order == "descending":
                    return -comparison
                else:
                    return comparison
        return 0

    elif isinstance(avroType, AvroUnion):
        xtypei, xtype, xval = resolveMember(x)
        ytypei, ytype, yval = resolveMember(y)
        if xtypei == ytypei:
            return compare(xtype, xval, yval)
        return cmp(xtypei, ytypei)

    else:
        raise titus.errors.AvroException("{0} or {1} does not match schema {2}".format(json.dumps(x), json.dumps(y), ts(avroType)))
def jsonEncoder(avroType, value, tagged=True):
    """Encode an object as JSON, given an AvroType.

    For a union, a ``None`` value is returned as ``None`` when the union has a
    null member.  A one-entry dict is first read as an explicit
    ``{tag: value}``: the member whose ``name`` equals the tag is tried first,
    then the remaining members in declaration order.  If that fails, the
    object itself is tried against every member as an untagged value.

    :type avroType: titus.datatype.AvroType
    :param avroType: type of this object
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: the object to encode
    :type tagged: bool
    :param tagged: if True (the default), represent unions as ``{tag: value}``; if False, represent them simply as ``value``.
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the JSON object in Python encoding
    :raises titus.errors.AvroException: if ``value`` does not match ``avroType``
    """
    if isinstance(avroType, AvroNull) and value is None:
        return value
    elif isinstance(avroType, AvroBoolean) and (value is True or value is False):
        return value
    # bool is a subclass of int, hence the explicit exclusion of True/False
    elif isinstance(avroType, AvroInt) and isinstance(value, (int, long)) and value is not True and value is not False:
        return value
    elif isinstance(avroType, AvroLong) and isinstance(value, (int, long)) and value is not True and value is not False:
        return value
    elif isinstance(avroType, AvroFloat) and isinstance(value, (int, long, float)) and value is not True and value is not False:
        return float(value)
    elif isinstance(avroType, AvroDouble) and isinstance(value, (int, long, float)) and value is not True and value is not False:
        return float(value)
    elif isinstance(avroType, AvroBytes) and isinstance(value, basestring):
        return value
    elif isinstance(avroType, AvroFixed) and isinstance(value, basestring):
        out = bytes(value)
        # wrong-sized values fall through to the AvroException below
        if len(out) == avroType.size:
            return out
    elif isinstance(avroType, AvroString) and isinstance(value, basestring):
        return value
    elif isinstance(avroType, AvroEnum) and isinstance(value, basestring) and value in avroType.symbols:
        return value
    elif isinstance(avroType, AvroArray) and isinstance(value, (list, tuple)):
        return [jsonEncoder(avroType.items, x, tagged) for x in value]
    elif isinstance(avroType, AvroMap) and isinstance(value, dict):
        return dict((k, jsonEncoder(avroType.values, v, tagged)) for k, v in value.items())
    elif isinstance(avroType, AvroRecord) and isinstance(value, dict):
        out = {}
        for field in avroType.fields:
            if field.name in value:
                out[field.name] = jsonEncoder(field.avroType, value[field.name], tagged)
            elif field.default is not None:
                # a missing field that has a default is simply left out
                pass
            else:
                raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
        return out
    elif isinstance(avroType, AvroUnion) and value is None and any(isinstance(t, AvroNull) for t in avroType.types):
        # a null union value is a bare null, never {"null": None}
        return None
    elif isinstance(avroType, AvroUnion):
        def wrap(member, encoded):
            # apply the requested union representation
            if tagged:
                return {member.name: encoded}
            else:
                return encoded

        if isinstance(value, dict) and len(value) == 1:
            (tag, val), = value.items()
            # Honor an explicit tag: the member it names gets the first try;
            # the other members keep their declaration order.
            named = [t for t in avroType.types if t.name == tag]
            others = [t for t in avroType.types if t.name != tag]
            for t in named + others:
                try:
                    out = jsonEncoder(t, val, tagged)
                except titus.errors.AvroException:
                    pass
                else:
                    return wrap(t, out)

        # untagged value (or a one-entry dict that is itself the value)
        for t in avroType.types:
            try:
                out = jsonEncoder(t, value, tagged)
            except titus.errors.AvroException:
                pass
            else:
                return wrap(t, out)

    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
def jsonDecoder(avroType, value):
    """Decode a JSON object as a given AvroType.

    Numeric types are coerced with ``int``, ``long`` or ``float``.  Missing
    record fields are filled from the field's default when it is not ``None``,
    or with ``None`` when the field's type is null.  Union values must be
    ``{tag: value}`` objects whose tag is the ``name`` of one of the union's
    members, or ``None`` when the union has a null member.

    :type avroType: titus.datatype.AvroType
    :param avroType: how we want to interpret this JSON
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: the JSON object in Python encoding
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the decoded object
    :raises titus.errors.AvroException: if ``value`` cannot be interpreted as ``avroType``
    :raises Exception: if ``avroType`` is not one of the AvroType classes handled here
    """
    # A failed numeric coercion must end in the AvroException at the bottom,
    # whatever the constructor raised.  TypeError arises for None and
    # containers, OverflowError for infinities and oversized integers.
    coercionErrors = (ValueError, TypeError, OverflowError)

    if isinstance(avroType, AvroNull):
        if value is None:
            return value
    elif isinstance(avroType, AvroBoolean):
        if value is True or value is False:
            return value
    elif isinstance(avroType, AvroInt):
        try:
            return int(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroLong):
        try:
            return long(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroFloat):
        try:
            return float(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroDouble):
        try:
            return float(value)
        except coercionErrors:
            pass
    elif isinstance(avroType, AvroBytes):
        if isinstance(value, basestring):
            return bytes(value)
    elif isinstance(avroType, AvroFixed):
        if isinstance(value, basestring):
            out = bytes(value)
            # a fixed value is only valid at exactly the declared size
            if len(out) == avroType.size:
                return out
    elif isinstance(avroType, AvroString):
        if isinstance(value, basestring):
            return value
    elif isinstance(avroType, AvroEnum):
        if isinstance(value, basestring) and value in avroType.symbols:
            return value
    elif isinstance(avroType, AvroArray):
        if isinstance(value, (list, tuple)):
            return [jsonDecoder(avroType.items, x) for x in value]
    elif isinstance(avroType, AvroMap):
        if isinstance(value, dict):
            return dict((k, jsonDecoder(avroType.values, v)) for k, v in value.items())
    elif isinstance(avroType, AvroRecord):
        if isinstance(value, dict):
            out = {}
            for field in avroType.fields:
                if field.name in value:
                    out[field.name] = jsonDecoder(field.avroType, value[field.name])
                elif field.default is not None:
                    out[field.name] = jsonDecoder(field.avroType, field.default)
                elif isinstance(field.avroType, AvroNull):
                    out[field.name] = None
                else:
                    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
            return out
    elif isinstance(avroType, AvroUnion):
        if isinstance(value, dict) and len(value) == 1:
            tag, = value.keys()
            val, = value.values()
            types = dict((x.name, x) for x in avroType.types)
            if tag in types:
                return {tag: jsonDecoder(types[tag], val)}
        elif value is None and "null" in [x.name for x in avroType.types]:
            return None
    else:
        # avroType is none of the classes handled above
        raise Exception

    # every branch that recognized the type but rejected the value ends up here
    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))