Exemple #1
0
def ast(pfas, check=True, name=None, randseed=None, doc=None, version=None, metadata={}, options={}, tryYaml=False, verbose=False):
    """Create a single PFA from a chained workflow, returning the result as an abstract syntax tree.

    Every engine in the chain is turned into a user-defined function of the combined engine;
    type, cell, pool, and function names are prefixed with the step number and engine name so
    that names coming from different engines cannot collide.

    :type pfas: list of titus.pfaast.EngineConfig, Pythonized JSON, or JSON strings
    :param pfas: PFA documents for which the output of document *i* is the input to document *i + 1*
    :type check: bool
    :param check: test the chained PFA for validity
    :type name: string or ``None``
    :param name: optional name for the chained PFA
    :type randseed: integer or ``None``
    :param randseed: optional random number seed for the chained PFA
    :type doc: string or ``None``
    :param doc: optional documentation string for the chained PFA
    :type version: integer or ``None``
    :param version: optional version number for the chained PFA
    :type metadata: dict of strings
    :param metadata: metadata for the chained PFA (default is ``{}``); passed through without being modified here
    :type options: dict of Pythonized JSON
    :param options: implementation options for the chained PFA (default is ``{}``); passed through without being modified here
    :type tryYaml: bool
    :param tryYaml: if ``True``, attempt to interpret ``pfas`` as YAML (assuming they fail as JSON)
    :type verbose: bool
    :param verbose: if ``True``, write status messages to standard error
    :rtype: titus.pfaast.EngineConfig
    :return: a PFA document representing the chained workflow
    :raises PFAChainError: if the output type of one engine is not compatible with the input type of the next
    :raises NotImplementedError: if any engine in the chain uses the fold method
    :raises ValueError: if a source cannot be read as JSON and ``tryYaml`` is ``False``
    """

    # normalize all input forms to ASTs
    if verbose: sys.stderr.write(time.asctime() + " Converting all inputs to ASTs\n")
    asts = []
    for i, src in enumerate(pfas):
        if verbose: sys.stderr.write(time.asctime() + "     step {0}\n".format(i + 1))
        if isinstance(src, EngineConfig):
            parsed = src
        elif isinstance(src, dict):
            parsed = titus.reader.jsonToAst(src)
        else:
            try:
                parsed = titus.reader.jsonToAst(src)
            except ValueError:
                if tryYaml:
                    parsed = titus.reader.yamlToAst(src)
                else:
                    raise
        asts.append(parsed)
    pfas = asts

    # helper functions for transforming names
    def split(t):
        # "a.b.C" -> ("a.b", "C"); a name without a namespace -> (None, name)
        if "." in t:
            return t[:t.rindex(".")], t[t.rindex(".") + 1:]
        else:
            return None, t

    def join(ns, n):
        if ns is None or ns == "":
            return n
        else:
            return ns + "." + n

    def prefixType(i, pfa, t):
        ns, n = split(t)
        return join(ns, "Step{0:d}_{1}_{2}".format(i + 1, pfa.name, n))

    def prefixAction(i, pfa):
        return "step{0:d}_{1}_action".format(i + 1, pfa.name)

    def prefixFcnRef(i, pfa, x):
        # only user-defined functions ("u." prefix) are renamed; other names pass through
        if x.startswith("u."):
            return "u.step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x[2:])
        else:
            return x

    def prefixFcnDef(i, pfa, x):
        return "step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x)

    def prefixCell(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    def prefixPool(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    # define new names for all types to avoid type name collisions
    if verbose: sys.stderr.write(time.asctime() + " Changing type names to avoid collisions\n")
    originalNameToNewName = {}
    for i, pfa in enumerate(pfas):
        originalNameToNewName[i] = {}
        for typeName in pfa.inputPlaceholder.parser.names.names.keys():
            originalNameToNewName[i][typeName] = prefixType(i, pfa, typeName)

    # but any names in the input to the first and the output from the last should not be changed
    def trivialName(i, avroType, memo):
        if isinstance(avroType, AvroArray):
            trivialName(i, avroType.items, memo)
        elif isinstance(avroType, AvroMap):
            trivialName(i, avroType.values, memo)
        elif isinstance(avroType, AvroUnion):
            for t in avroType.types:
                trivialName(i, t, memo)
        elif isinstance(avroType, (AvroFixed, AvroEnum)):
            t = avroType.fullName
            originalNameToNewName[i][t] = t
        elif isinstance(avroType, AvroRecord):
            t = avroType.fullName
            if t not in memo:
                # memo stops the walk from looping forever on recursive records
                memo.add(t)
                for f in avroType.fields:
                    trivialName(i, f.avroType, memo)
                originalNameToNewName[i][t] = t
    trivialName(0, pfas[0].input, set())
    trivialName(len(pfas) - 1, pfas[-1].output, set())

    # ensure that chained types match and will be given the same names
    if verbose: sys.stderr.write(time.asctime() + " Verifying that input/output schemas match along the chain\n")
    def chainPair(i, first, second, memo):
        # Returns True if "first" (output of engine i) can feed "second" (input of engine i + 1).
        # As a side effect, matching named types in engine i + 1 are mapped to the new name
        # already chosen for engine i, so both sides end up with the same type name.
        # memo maps (first record name, second record name) -> compatibility result.
        if isinstance(first, AvroNull) and isinstance(second, AvroNull):
            return True
        elif isinstance(first, AvroBoolean) and isinstance(second, AvroBoolean):
            return True
        elif isinstance(first, AvroInt) and isinstance(second, AvroInt):
            return True
        elif isinstance(first, AvroLong) and isinstance(second, AvroLong):
            return True
        elif isinstance(first, AvroFloat) and isinstance(second, AvroFloat):
            return True
        elif isinstance(first, AvroDouble) and isinstance(second, AvroDouble):
            return True
        elif isinstance(first, AvroBytes) and isinstance(second, AvroBytes):
            return True
        elif isinstance(first, AvroFixed) and isinstance(second, AvroFixed):
            if first.size == second.size:
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroString) and isinstance(second, AvroString):
            return True
        elif isinstance(first, AvroEnum) and isinstance(second, AvroEnum):
            if first.symbols == second.symbols:
                originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroArray) and isinstance(second, AvroArray):
            return chainPair(i, first.items, second.items, memo)
        elif isinstance(first, AvroMap) and isinstance(second, AvroMap):
            return chainPair(i, first.values, second.values, memo)
        elif isinstance(first, AvroRecord) and isinstance(second, AvroRecord):
            pair = (first.fullName, second.fullName)
            if pair in memo:
                # either already decided, or still being checked further up the stack
                # (a recursive record); in the latter case assume compatibility so that
                # the recursion terminates
                return memo[pair]
            memo[pair] = True
            if len(first.fields) != len(second.fields):
                memo[pair] = False
                return False
            for f1, f2 in zip(first.fields, second.fields):
                if f1.name != f2.name or not chainPair(i, f1.avroType, f2.avroType, memo):
                    memo[pair] = False
                    return False
            originalNameToNewName[i + 1][second.fullName] = originalNameToNewName[i][first.fullName]
            return True
        elif isinstance(first, AvroUnion) and isinstance(second, AvroUnion):
            # every member of the second union must be matched by some member of the first
            for yt in second.types:
                if not any(chainPair(i, xt, yt, memo) for xt in first.types):
                    return False
            return True
        else:
            return False
    for i in range(len(pfas) - 1):
        first = pfas[i].output
        second = pfas[i + 1].input
        if not chainPair(i, first, second, {}):
            raise PFAChainError("output of engine {0}: {1} not compatible with input of engine {2}: {3}".format(i + 1, ts(first), i + 2, ts(second)))

    def rename(i, avroType, memo):
        # Build a Pythonized-JSON schema for avroType with named types replaced by their new names.
        # memo maps new record names that have already been emitted to their full name, so later
        # occurrences (including recursive ones) are written as a reference by name.
        if isinstance(avroType, AvroArray):
            return {"type": "array", "items": rename(i, avroType.items, memo)}
        elif isinstance(avroType, AvroMap):
            return {"type": "map", "values": rename(i, avroType.values, memo)}
        elif isinstance(avroType, AvroUnion):
            return [rename(i, t, memo) for t in avroType.types]
        elif isinstance(avroType, AvroFixed):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "fixed", "name": n, "size": avroType.size}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroEnum):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "enum", "name": n, "symbols": avroType.symbols}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroRecord):
            newName = originalNameToNewName[i][avroType.fullName]
            if newName in memo:
                return memo[newName]
            else:
                ns, n = split(newName)
                out = {"type": "record", "name": n, "fields": []}
                if ns is not None:
                    out["namespace"] = ns
                # register before descending into the fields so recursive references resolve by name
                memo[newName] = join(ns, n)
                for f in avroType.fields:
                    newf = {"name": f.name, "type": rename(i, f.avroType, memo)}
                    if f.default is not None:
                        newf["default"] = f.default
                    if f.order is not None:
                        newf["order"] = f.order
                    out["fields"].append(newf)
                return out
        else:
            # anything else is round-tripped through its repr, which is parsed as JSON
            return jsonlib.loads(repr(avroType))

    avroTypeBuilder = AvroTypeBuilder()
    memo = {}
    def newPlaceholder(i, oldAvroType):
        newAvroType = rename(i, oldAvroType, {})
        return avroTypeBuilder.makePlaceholder(jsonlib.dumps(newAvroType), memo)

    # combined name, if not explicitly set
    if name is None:
        name = "Chain_" + "_".join(pfa.name for pfa in pfas)

    # combined method (fold not supported yet, but could be)
    method = Method.MAP
    for pfa in pfas:
        if pfa.method == Method.EMIT:
            method = Method.EMIT
        elif pfa.method == Method.FOLD:
            raise NotImplementedError("chaining of fold-type scoring engines has not been implemented yet")

    # no zero or merge until we support fold method
    zero = None
    merge = None

    # input/output types from first and last
    inputPlaceholder = newPlaceholder(0, pfas[0].input)
    outputPlaceholder = newPlaceholder(len(pfas) - 1, pfas[-1].output)

    if verbose: sys.stderr.write(time.asctime() + " Adding [name, instance, metadata, actionsStarted, actionsFinished, version] as model parameters\n")

    cells = {"name": Cell(newPlaceholder(0, AvroString()), jsonlib.dumps(""), False, False, CellPoolSource.EMBEDDED),
             "instance": Cell(newPlaceholder(0, AvroInt()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
             "metadata": Cell(newPlaceholder(0, AvroMap(AvroString())), jsonlib.dumps({}), False, False, CellPoolSource.EMBEDDED),
             "actionsStarted": Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED),
             "actionsFinished": Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED)}
    if version is not None:
        # the init is stored as a JSON string, like the other synthesized cells above
        cells["version"] = Cell(newPlaceholder(0, AvroInt()), jsonlib.dumps(0), False, False, CellPoolSource.EMBEDDED)
    pools = {}

    if verbose: sys.stderr.write(time.asctime() + " Converting scoring engine algorithm\n")

    # all code will go into user functions, including begin/action/end
    fcns = {}

    # the combined engine first copies its own predefined symbols into cells, so that
    # code from the individual engines (rewritten below) can read them with CellGet
    begin = [CellTo("name", [], Ref("name")),
             CellTo("instance", [], Ref("instance")),
             CellTo("metadata", [], Ref("metadata"))]
    if version is not None:
        begin.append(CellTo("version", [], Ref("version")))

    action = [CellTo("actionsStarted", [], Ref("actionsStarted")),
              CellTo("actionsFinished", [], Ref("actionsFinished"))]

    end = [CellTo("actionsStarted", [], Ref("actionsStarted")),
           CellTo("actionsFinished", [], Ref("actionsFinished"))]

    for i, pfa in enumerate(pfas):
        if verbose: sys.stderr.write(time.asctime() + "     step {0}: {1}\n".format(i + 1, pfa.name))

        thisActionFcnName = prefixAction(i, pfa)
        if i + 1 < len(pfas):
            nextActionFcnName = prefixAction(i + 1, pfas[i + 1])
        else:
            nextActionFcnName = None

        # this is a closure; it must be defined in the loop to pick up free variables
        lazyFcnReplacer = None
        def genericReplacer(expr, self):
            if isinstance(expr, FcnDef):
                return FcnDef([{list(t.keys())[0]: newPlaceholder(i, list(t.values())[0])} for t in expr.params],
                              newPlaceholder(i, expr.ret),
                              [x.replace(lazyFcnReplacer) for x in expr.body],     # this is the one place where we should pass down fcnReplacer rather than self
                              expr.pos)
            elif isinstance(expr, FcnRef):
                return FcnRef(prefixFcnRef(i, pfa, expr.name), expr.pos)
            elif isinstance(expr, FcnRefFill):
                return FcnRefFill(prefixFcnRef(i, pfa, expr.name),
                                  dict((k, v.replace(self)) for k, v in expr.fill.items()),
                                  expr.pos)
            elif isinstance(expr, CallUserFcn):   # TODO: need to change the symbols of the corresponding enum
                return CallUserFcn(expr.name.replace(self),
                                   [x.replace(self) for x in expr.args],
                                   expr.pos)
            elif isinstance(expr, Call):
                if pfa.method == Method.EMIT and i + 1 < len(pfas) and expr.name == "emit":
                    # an intermediate emit-type engine hands its output to the next engine
                    return Call("u." + nextActionFcnName,
                                [x.replace(self) for x in expr.args],
                                expr.pos)
                else:
                    return Call(prefixFcnRef(i, pfa, expr.name),
                                [x.replace(self) for x in expr.args],
                                expr.pos)
            elif isinstance(expr, Literal):
                return Literal(newPlaceholder(i, expr.avroType),
                               expr.value,
                               expr.pos)
            elif isinstance(expr, NewObject):
                return NewObject(dict((k, v.replace(self)) for k, v in expr.fields.items()),
                                 newPlaceholder(i, expr.avroType),
                                 expr.pos)
            elif isinstance(expr, NewArray):
                return NewArray([x.replace(self) for x in expr.items],
                                newPlaceholder(i, expr.avroType),
                                expr.pos)
            elif isinstance(expr, CellGet):
                return CellGet(prefixCell(i, pfa, expr.cell),
                               [x.replace(self) for x in expr.path],
                               expr.pos)
            elif isinstance(expr, CellTo):
                return CellTo(prefixCell(i, pfa, expr.cell),
                              [x.replace(self) for x in expr.path],
                              expr.to.replace(self),
                              expr.pos)
            elif isinstance(expr, PoolGet):
                return PoolGet(prefixPool(i, pfa, expr.pool),
                               [x.replace(self) for x in expr.path],
                               expr.pos)
            elif isinstance(expr, PoolTo):
                return PoolTo(prefixPool(i, pfa, expr.pool),
                              [x.replace(self) for x in expr.path],
                              expr.to.replace(self),
                              expr.init.replace(self),
                              expr.pos)
            elif isinstance(expr, CastCase):
                return CastCase(newPlaceholder(i, expr.avroType),
                                expr.named,
                                [x.replace(self) for x in expr.body],
                                expr.pos)
            elif isinstance(expr, Upcast):
                return Upcast(expr.expr.replace(self),
                              newPlaceholder(i, expr.avroType),
                              expr.pos)
        # must list every node class handled above (NewArray included), otherwise that branch is never reached
        genericReplacer.isDefinedAt = lambda x: isinstance(x, (FcnDef, FcnRef, FcnRefFill, CallUserFcn, Call, Literal, NewObject, NewArray, CellGet, CellTo, PoolGet, PoolTo, CastCase, Upcast))

        def fcnReplacer(expr):
            return genericReplacer(expr, fcnReplacer)
        fcnReplacer.isDefinedAt = genericReplacer.isDefinedAt

        lazyFcnReplacer = fcnReplacer

        # add statements to begin
        def beginReplacer(expr):
            if isinstance(expr, Ref):
                if expr.name in ("name", "instance", "metadata") or (version is not None and expr.name == "version"):
                    return CellGet(expr.name, [], expr.pos)
                else:
                    return expr
            else:
                return genericReplacer(expr, beginReplacer)
        beginReplacer.isDefinedAt = lambda x: isinstance(x, Ref) or genericReplacer.isDefinedAt(x)
        begin.extend([x.replace(beginReplacer) for x in pfa.begin])

        # add statements to end
        def endReplacer(expr):
            if isinstance(expr, Ref):
                if expr.name in ("name", "instance", "metadata", "actionsStarted", "actionsFinished") or (version is not None and expr.name == "version"):
                    return CellGet(expr.name, [], expr.pos)
                else:
                    return expr
            else:
                return genericReplacer(expr, endReplacer)
        endReplacer.isDefinedAt = lambda x: isinstance(x, Ref) or genericReplacer.isDefinedAt(x)
        end.extend([x.replace(endReplacer) for x in pfa.end])

        # convert the action into a user function
        def actionReplacer(expr):
            if isinstance(expr, Ref):
                if expr.name in ("name", "instance", "metadata", "actionsStarted", "actionsFinished") or (version is not None and expr.name == "version"):
                    return CellGet(expr.name, [], expr.pos)
                else:
                    return expr
            else:
                return genericReplacer(expr, actionReplacer)
        actionReplacer.isDefinedAt = lambda x: isinstance(x, Ref) or genericReplacer.isDefinedAt(x)

        body = [x.replace(actionReplacer) for x in pfa.action]

        if method == Method.MAP:
            # if the overall method is MAP, then we know that all of the individual engines are MAP
            # the overall action calls a nested chain of engines-as-functions and each engine-as-a-function just does its job and returns (body is unmodified)
            fcns[thisActionFcnName] = FcnDef([{"input": newPlaceholder(i, pfa.input)}], newPlaceholder(i, pfa.output), body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))
            else:
                # wrap the call chain built so far: stepN(...step2(step1(input)))
                action[-1] = Call("u." + thisActionFcnName, [action[-1]])

        elif method == Method.EMIT:
            # if the overall method is EMIT, then some individual engines might be MAP or might be EMIT
            # the overall action calls the first engine-as-a-function and the engines-as-functions call each other (body is modified)
            if pfa.method == Method.MAP and i + 1 < len(pfas):
                body = [Call("u." + nextActionFcnName, [Do(body)]), LiteralNull()]
            elif pfa.method == Method.MAP:
                body = [Call("emit", [Do(body)])]
            elif pfa.method == Method.EMIT:
                body.append(LiteralNull())

            fcns[thisActionFcnName] = FcnDef([{"input": newPlaceholder(i, pfa.input)}], newPlaceholder(i, AvroNull()), body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))

        # re-register this engine's user functions under their prefixed names
        for fcnName, fcnDef in pfa.fcns.items():
            # note: some of these user-defined functions may call emit; if so, they'll call the right emit
            fcns[prefixFcnDef(i, pfa, fcnName)] = FcnDef([{list(t.keys())[0]: newPlaceholder(i, list(t.values())[0])} for t in fcnDef.paramsPlaceholder],
                                                         newPlaceholder(i, fcnDef.ret),
                                                         [x.replace(fcnReplacer) for x in fcnDef.body],
                                                         fcnDef.pos)

    if verbose: sys.stderr.write(time.asctime() + " Create types for model parameters\n")

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.cells) > 0: sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for cellName, cell in pfa.cells.items():
            if verbose: sys.stderr.write(time.asctime() + "         cell {0}\n".format(cellName))
            newCell = Cell(newPlaceholder(i, cell.avroType), cell.init, cell.shared, cell.rollback, cell.source, cell.pos)
            cells[prefixCell(i, pfa, cellName)] = newCell
            if cell.source == "embedded":
                # bind the current cell as a default argument: the converter is only called
                # after type resolution, when the loop variable no longer refers to this cell
                def converter(avroType, cell=cell):
                    original = jsonDecoder(cell.avroType, jsonlib.loads(cell.init))
                    return jsonlib.dumps(jsonEncoder(avroType, original))
                newCell.converter = converter

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.pools) > 0: sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for poolName, pool in pfa.pools.items():
            if verbose: sys.stderr.write(time.asctime() + "         pool {0}\n".format(poolName))
            newPool = Pool(newPlaceholder(i, pool.avroType), pool.init, pool.shared, pool.rollback, pool.source, pool.pos)
            pools[prefixPool(i, pfa, poolName)] = newPool
            if pool.source == "embedded":
                # same early binding as for cells above
                def converter(avroType, pool=pool):
                    original = jsonDecoder(pool.avroType, jsonlib.loads(pool.init))
                    return jsonlib.dumps(jsonEncoder(avroType, original))
                newPool.converter = converter

    # make sure all the types work together
    if verbose: sys.stderr.write(time.asctime() + " Resolving all types\n")
    avroTypeBuilder.resolveTypes()

    if verbose: sys.stderr.write(time.asctime() + " Converting the model parameters themselves\n")

    # now that the renamed types are resolved, re-encode the embedded initial values against them
    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.cells) > 0: sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for cellName, cell in pfa.cells.items():
            if verbose: sys.stderr.write(time.asctime() + "         cell {0}\n".format(cellName))
            if cell.source == "embedded":
                newCell = cells[prefixCell(i, pfa, cellName)]
                newCell.init = newCell.converter(newCell.avroType)

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.pools) > 0: sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for poolName, pool in pfa.pools.items():
            if verbose: sys.stderr.write(time.asctime() + "         pool {0}\n".format(poolName))
            if pool.source == "embedded":
                newPool = pools[prefixPool(i, pfa, poolName)]
                newPool.init = newPool.converter(newPool.avroType)

    # randseed, doc, version, metadata, and options need to be explicitly set

    # return a (possibly checked) AST
    out = EngineConfig(name,
                       method,
                       inputPlaceholder,
                       outputPlaceholder,
                       begin,
                       action,
                       end,
                       fcns,
                       zero,
                       merge,
                       cells,
                       pools,
                       randseed,
                       doc,
                       version,
                       metadata,
                       options)
    if check:
        if verbose: sys.stderr.write(time.asctime() + " Verifying PFA validity\n")
        # building an engine from the AST is what validates it; the engine itself is discarded
        PFAEngine.fromAst(out)

    if verbose: sys.stderr.write(time.asctime() + " Done\n")
    return out
Exemple #2
0
def ast(pfas,
        check=True,
        name=None,
        randseed=None,
        doc=None,
        version=None,
        metadata={},
        options={},
        tryYaml=False,
        verbose=False):
    """Create a single PFA from a chained workflow, returning the result as an abstract syntax tree.

    :type pfas: list of titus.pfaast.EngineConfig, Pythonized JSON, or JSON strings
    :param pfas: PFA documents for which the output of document *i* is the input to document *i + 1*
    :type check: bool
    :param check: test the chained PFA for validity
    :type name: string or ``None``
    :param name: optional name for the chained PFA
    :type randseed: integer or ``None``
    :param randseed: optional random number seed for the chained PFA
    :type doc: string or ``None``
    :param doc: optional documentation string for the chained PFA
    :type version: integer or ``None``
    :param version: optional version number for the chained PFA
    :type metadata: dict of strings
    :param metadata: metadata for the chained PFA (default is ``{}``)
    :type options: dict of Pythonized JSON
    :param options: implementation options for the chained PFA (default is ``{}``)
    :type tryYaml: bool
    :param tryYaml: if ``True``, attempt to interpret ``pfas`` as YAML (assuming they fail as JSON)
    :type verbose: bool
    :param verbose: if ``True``, write status messages to standard output
    :rtype: titus.pfaast.EngineConfig
    :return: a PFA document representing the chained workflow
    """

    # normalize all input forms to ASTs
    if verbose:
        sys.stderr.write(time.asctime() + " Converting all inputs to ASTs\n")
    asts = []
    for i, src in enumerate(pfas):
        if verbose:
            sys.stderr.write(time.asctime() + "     step {0}\n".format(i + 1))
        if isinstance(src, EngineConfig):
            ast = src
        elif isinstance(src, dict):
            ast = titus.reader.jsonToAst(src)
        else:
            try:
                ast = titus.reader.jsonToAst(src)
            except ValueError:
                if tryYaml:
                    ast = titus.reader.yamlToAst(src)
                else:
                    raise
        asts.append(ast)
    pfas = asts

    # helper functions for transforming names
    def split(t):
        if "." in t:
            return t[:t.rindex(".")], t[t.rindex(".") + 1:]
        else:
            return None, t

    def join(ns, n):
        if ns is None or ns == "":
            return n
        else:
            return ns + "." + n

    def prefixType(i, pfa, t):
        ns, n = split(t)
        return join(ns, "Step{0:d}_{1}_{2}".format(i + 1, pfa.name, n))

    def prefixAction(i, pfa):
        return "step{0:d}_{1}_action".format(i + 1, pfa.name)

    def prefixFcnRef(i, pfa, x):
        if x.startswith("u."):
            return "u.step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x[2:])
        else:
            return x

    def prefixFcnDef(i, pfa, x):
        return "step{0:d}_{1}_fcn_{2}".format(i + 1, pfa.name, x)

    def prefixCell(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    def prefixPool(i, pfa, x):
        return "step{0:d}_{1}_{2}".format(i + 1, pfa.name, x)

    # define new names for all types to avoid type name collisions
    if verbose:
        sys.stderr.write(time.asctime() +
                         " Changing type names to avoid collisions\n")
    originalNameToNewName = {}
    for i, pfa in enumerate(pfas):
        originalNameToNewName[i] = {}
        for typeName in list(pfa.inputPlaceholder.parser.names.names.keys()):
            keyTypeName = typeName
            if (typeName[0] == "."):
                keyTypeName = keyTypeName[1:]
            originalNameToNewName[i][keyTypeName] = prefixType(
                i, pfa, typeName)

    # but any names in the input to the first and the output from the last should not be changed
    def trivialName(i, avroType, memo):
        if isinstance(avroType, AvroArray):
            trivialName(i, avroType.items, memo)
        elif isinstance(avroType, AvroMap):
            trivialName(i, avroType.values, memo)
        elif isinstance(avroType, AvroUnion):
            for t in avroType.types:
                trivialName(i, t, memo)
        elif isinstance(avroType, (AvroFixed, AvroEnum)):
            t = avroType.fullName
            originalNameToNewName[i][t] = t
        elif isinstance(avroType, AvroRecord):
            t = avroType.fullName
            if t not in memo:
                memo.add(t)
                for f in avroType.fields:
                    trivialName(i, f.avroType, memo)
                originalNameToNewName[i][t] = t

    trivialName(0, pfas[0].input, set())
    trivialName(len(pfas) - 1, pfas[-1].output, set())

    # ensure that chained types match and will be given the same names
    if verbose:
        sys.stderr.write(
            time.asctime() +
            " Verifying that input/output schemas match along the chain\n")

    def chainPair(i, first, second, memo):
        if isinstance(first, AvroNull) and isinstance(second, AvroNull):
            return True
        elif isinstance(first, AvroBoolean) and isinstance(
                second, AvroBoolean):
            return True
        elif isinstance(first, AvroInt) and isinstance(second, AvroInt):
            return True
        elif isinstance(first, AvroLong) and isinstance(second, AvroLong):
            return True
        elif isinstance(first, AvroFloat) and isinstance(second, AvroFloat):
            return True
        elif isinstance(first, AvroDouble) and isinstance(second, AvroDouble):
            return True
        elif isinstance(first, AvroBytes) and isinstance(second, AvroBytes):
            return True
        elif isinstance(first, AvroFixed) and isinstance(second, AvroFixed):
            if first.size == second.size:
                originalNameToNewName[i + 1][
                    second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroString) and isinstance(second, AvroString):
            return True
        elif isinstance(first, AvroEnum) and isinstance(second, AvroEnum):
            if first.symbols == second.symbols:
                originalNameToNewName[i + 1][
                    second.fullName] = originalNameToNewName[i][first.fullName]
                return True
            else:
                return False
        elif isinstance(first, AvroArray) and isinstance(second, AvroArray):
            return chainPair(i, first.items, second.items, memo)
        elif isinstance(first, AvroMap) and isinstance(second, AvroMap):
            return chainPair(i, first.values, second.values, memo)
        elif isinstance(first, AvroRecord) and isinstance(second, AvroRecord):
            if first.fullName not in memo:
                memo.add(first.fullName)
                if len(first.fields) != len(second.fields):
                    return False
                for f1, f2 in zip(first.fields, second.fields):
                    if f1.name != f2.name:
                        return False
                    elif not chainPair(i, f1.avroType, f2.avroType, memo):
                        return False
                originalNameToNewName[i + 1][
                    second.fullName] = originalNameToNewName[i][first.fullName]
                return True
        elif isinstance(first, AvroUnion) and isinstance(second, AvroUnion):
            for yt in second.types:
                if not any(chainPair(i, xt, yt, memo) for xt in first.types):
                    return False
            return True
        else:
            return False

    for i in range(len(pfas) - 1):
        first = pfas[i].output
        second = pfas[i + 1].input
        if not chainPair(i, first, second, set()):
            raise PFAChainError(
                "output of engine {0}: {1} not compatible with input of engine {2}: {3}"
                .format(i + 1, ts(first), i + 2, ts(second)))

    def rename(i, avroType, memo):
        """Rebuild ``avroType`` as Pythonized JSON, substituting the new names recorded for engine ``i``.

        :type i: integer
        :param i: index used to select the name table ``originalNameToNewName[i]``
        :type avroType: titus.datatype.AvroType
        :param avroType: the type to re-express
        :type memo: dict
        :param memo: new record names already emitted in this traversal, mapped to the reference returned for repeats
        :rtype: Pythonized JSON
        :return: a schema for ``avroType`` in which fixed, enum, and record names are replaced by their new names
        """
        # containers: recurse into the contained type(s)
        if isinstance(avroType, AvroArray):
            return {"type": "array", "items": rename(i, avroType.items, memo)}
        elif isinstance(avroType, AvroMap):
            return {"type": "map", "values": rename(i, avroType.values, memo)}
        elif isinstance(avroType, AvroUnion):
            return [rename(i, t, memo) for t in avroType.types]
        elif isinstance(avroType, AvroFixed):
            # NOTE(review): split is defined outside this view; presumably it separates
            # a full name into (namespace or None, simple name) -- confirm
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "fixed", "name": n, "size": avroType.size}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroEnum):
            ns, n = split(originalNameToNewName[i][avroType.fullName])
            out = {"type": "enum", "name": n, "symbols": avroType.symbols}
            if ns is not None:
                out["namespace"] = ns
            return out
        elif isinstance(avroType, AvroRecord):
            newName = originalNameToNewName[i][avroType.fullName]
            if newName in memo:
                # this record was already written out during this traversal:
                # return the stored reference instead of a second definition
                return memo[newName]
            else:
                ns, n = split(newName)
                out = {"type": "record", "name": n, "fields": []}
                if ns is not None:
                    out["namespace"] = ns
                # register before descending into the fields so that a field
                # referring back to this record hits the memo branch above
                memo[newName] = join(ns, n)
                for f in avroType.fields:
                    newf = {
                        "name": f.name,
                        "type": rename(i, f.avroType, memo)
                    }
                    # default and order are copied only when they are set
                    if f.default is not None:
                        newf["default"] = f.default
                    if f.order is not None:
                        newf["order"] = f.order
                    out["fields"].append(newf)
                return out
        else:
            # every other type: parse its repr() as JSON
            # (assumes repr() of these types is their JSON schema -- TODO confirm)
            return jsonlib.loads(repr(avroType))

    avroTypeBuilder = AvroTypeBuilder()
    memo = {}

    def newPlaceholder(i, oldAvroType):
        newAvroType = rename(i, oldAvroType, {})
        return avroTypeBuilder.makePlaceholder(jsonlib.dumps(newAvroType),
                                               memo)

    # combined name, if not explicitly set
    if name is None:
        name = "Chain_" + "_".join(pfa.name for pfa in pfas)

    # combined method (fold not supported yet, but could be)
    method = Method.MAP
    for pfa in pfas:
        if pfa.method == Method.EMIT:
            method = Method.EMIT
        elif pfa.method == Method.FOLD:
            raise NotImplementedError(
                "chaining of fold-type scoring engines has not been implemented yet"
            )

    # no zero or merge until we support fold method
    zero = None
    merge = None

    # input/output types from first and last
    inputPlaceholder = newPlaceholder(0, pfas[0].input)
    outputPlaceholder = newPlaceholder(len(pfas) - 1, pfas[-1].output)

    if verbose:
        sys.stderr.write(
            time.asctime() +
            " Adding [name, instance, metadata, actionsStarted, actionsFinished, version] as model parameters\n"
        )

    cells = {
        "name":
        Cell(newPlaceholder(0, AvroString()), jsonlib.dumps(""), False, False,
             CellPoolSource.EMBEDDED),
        "instance":
        Cell(newPlaceholder(0, AvroInt()), jsonlib.dumps(0), False, False,
             CellPoolSource.EMBEDDED),
        "metadata":
        Cell(newPlaceholder(0, AvroMap(AvroString())), jsonlib.dumps({}),
             False, False, CellPoolSource.EMBEDDED),
        "actionsStarted":
        Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False,
             CellPoolSource.EMBEDDED),
        "actionsFinished":
        Cell(newPlaceholder(0, AvroLong()), jsonlib.dumps(0), False, False,
             CellPoolSource.EMBEDDED)
    }
    if version is not None:
        cells["version"] = Cell(newPlaceholder(0, AvroInt()), 0, False, False,
                                CellPoolSource.EMBEDDED)
    pools = {}

    if verbose:
        sys.stderr.write(time.asctime() +
                         " Converting scoring engine algorithm\n")

    # all code will go into user functions, including begin/action/end
    fcns = {}

    begin = [
        CellTo("name", [], Ref("name")),
        CellTo("instance", [], Ref("instance")),
        CellTo("metadata", [], Ref("metadata"))
    ]
    if version is not None:
        begin.append(CellTo("version", [], Ref("version")))

    action = [
        CellTo("actionsStarted", [], Ref("actionsStarted")),
        CellTo("actionsFinished", [], Ref("actionsFinished"))
    ]

    end = [
        CellTo("actionsStarted", [], Ref("actionsStarted")),
        CellTo("actionsFinished", [], Ref("actionsFinished"))
    ]

    for i, pfa in enumerate(pfas):
        if verbose:
            sys.stderr.write(time.asctime() +
                             "     step {0}: {1}\n".format(i + 1, pfa.name))

        thisActionFcnName = prefixAction(i, pfa)
        if i + 1 < len(pfas):
            nextActionFcnName = prefixAction(i + 1, pfas[i + 1])
        else:
            nextActionFcnName = None

        # this is a closure; it must be defined in the loop to pick up free variables
        lazyFcnReplacer = None

        def genericReplacer(expr, self):
            if isinstance(expr, FcnDef):
                return FcnDef(
                    [{
                        list(t.keys())[0]: newPlaceholder(
                            i,
                            list(t.values())[0])
                    } for t in expr.params],
                    newPlaceholder(i, expr.ret),
                    [
                        x.replace(lazyFcnReplacer) for x in expr.body
                    ],  # this is the one place where we should pass down fcnReplacer rather than self
                    expr.pos)
            elif isinstance(expr, FcnRef):
                return FcnRef(prefixFcnRef(i, pfa, expr.name), expr.pos)
            elif isinstance(expr, FcnRefFill):
                return FcnRefFill(
                    prefixFcnRef(i, pfa, expr.name),
                    dict((k, v.replace(self))
                         for k, v in list(expr.fill.items())), expr.pos)
            elif isinstance(
                    expr, CallUserFcn
            ):  # TODO: need to change the symbols of the corresponding enum
                return CallUserFcn(expr.name.replace(self),
                                   [x.replace(self) for x in expr.args],
                                   expr.pos)
            elif isinstance(expr, Call):
                if pfa.method == Method.EMIT and i + 1 < len(
                        pfas) and expr.name == "emit":
                    return Call("u." + nextActionFcnName,
                                [x.replace(self) for x in expr.args], expr.pos)
                else:
                    return Call(prefixFcnRef(i, pfa, expr.name),
                                [x.replace(self) for x in expr.args], expr.pos)
            elif isinstance(expr, Literal):
                return Literal(newPlaceholder(i, expr.avroType), expr.value,
                               expr.pos)
            elif isinstance(expr, NewObject):
                return NewObject(
                    dict((k, v.replace(self))
                         for k, v in list(expr.fields.items())),
                    newPlaceholder(i, expr.avroType), expr.pos)
            elif isinstance(expr, NewArray):
                return NewArray([x.replace(self) for x in expr.items],
                                newPlaceholder(i, expr.avroType), expr.pos)
            elif isinstance(expr, CellGet):
                return CellGet(prefixCell(i, pfa, expr.cell),
                               [x.replace(self) for x in expr.path], expr.pos)
            elif isinstance(expr, CellTo):
                return CellTo(prefixCell(i, pfa, expr.cell),
                              [x.replace(self) for x in expr.path],
                              expr.to.replace(self), expr.pos)
            elif isinstance(expr, PoolGet):
                return PoolGet(prefixPool(i, pfa, expr.pool),
                               [x.replace(self) for x in expr.path], expr.pos)
            elif isinstance(expr, PoolTo):
                return PoolTo(prefixPool(i, pfa, expr.pool),
                              [x.replace(self) for x in expr.path],
                              expr.to.replace(self), expr.init.replace(self),
                              expr.pos)
            elif isinstance(expr, CastCase):
                return CastCase(newPlaceholder(i, expr.avroType), expr.named,
                                [x.replace(self) for x in expr.body], expr.pos)
            elif isinstance(expr, Upcast):
                return Upcast(expr.expr.replace(self),
                              newPlaceholder(i, expr.avroType), expr.pos)

        genericReplacer.isDefinedAt = lambda x: isinstance(
            x, (FcnDef, FcnRef, FcnRefFill, CallUserFcn, Call, Literal,
                NewObject, CellGet, CellTo, PoolGet, PoolTo, CastCase, Upcast))

        def fcnReplacer(expr):
            return genericReplacer(expr, fcnReplacer)

        fcnReplacer.isDefinedAt = genericReplacer.isDefinedAt

        lazyFcnReplacer = fcnReplacer

        # add statements to begin
        def beginReplacer(expr):
            if isinstance(expr, Ref):
                if expr.name in ("name", "instance",
                                 "metadata") or (version is not None
                                                 and expr.name == "version"):
                    return CellGet(expr.name, [], expr.pos)
                else:
                    return expr
            else:
                return genericReplacer(expr, beginReplacer)

        beginReplacer.isDefinedAt = lambda x: isinstance(
            x, Ref) or genericReplacer.isDefinedAt(x)
        begin.extend([x.replace(beginReplacer) for x in pfa.begin])

        # add statements to end
        def endReplacer(expr):
            if isinstance(expr, Ref):
                if expr.name in ("name", "instance", "metadata",
                                 "actionsStarted", "actionsFinished") or (
                                     version is not None
                                     and expr.name == "version"):
                    return CellGet(expr.name, [], expr.pos)
                else:
                    return expr
            else:
                return genericReplacer(expr, endReplacer)

        endReplacer.isDefinedAt = lambda x: isinstance(
            x, Ref) or genericReplacer.isDefinedAt(x)
        end.extend([x.replace(endReplacer) for x in pfa.end])

        # convert the action into a user function
        def actionReplacer(expr):
            if isinstance(expr, Ref):
                if expr.name in ("name", "instance", "metadata",
                                 "actionsStarted", "actionsFinished") or (
                                     version is not None
                                     and expr.name == "version"):
                    return CellGet(expr.name, [], expr.pos)
                else:
                    return expr
            else:
                return genericReplacer(expr, actionReplacer)

        actionReplacer.isDefinedAt = lambda x: isinstance(
            x, Ref) or genericReplacer.isDefinedAt(x)

        body = [x.replace(actionReplacer) for x in pfa.action]

        if method == Method.MAP:
            # if the overall method is MAP, then we know that all of the individual engines are MAP
            # the overall action calls a nested chain of engines-as-functions and each engine-as-a-function just does its job and returns (body is unmodified)
            fcns[thisActionFcnName] = FcnDef(
                [{
                    "input": newPlaceholder(i, pfa.input)
                }], newPlaceholder(i, pfa.output), body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))
            else:
                action[-1] = Call("u." + thisActionFcnName, [action[-1]])

        elif method == Method.EMIT:
            # if the overall method is EMIT, then some individual engines might be MAP or might be EMIT
            # the overall action calls the first engine-as-a-function and the engines-as-functions call each other (body is modified)
            if pfa.method == Method.MAP and i + 1 < len(pfas):
                body = [
                    Call("u." + nextActionFcnName, [Do(body)]),
                    LiteralNull()
                ]
            elif pfa.method == Method.MAP:
                body = [Call("emit", [Do(body)])]
            elif pfa.method == Method.EMIT:
                body.append(LiteralNull())

            fcns[thisActionFcnName] = FcnDef(
                [{
                    "input": newPlaceholder(i, pfa.input)
                }], newPlaceholder(i, AvroNull()), body)
            if i == 0:
                action.append(Call("u." + thisActionFcnName, [Ref("input")]))

        # convert all of the user functions into user functions
        for fcnName, fcnDef in list(pfa.fcns.items()):
            # note: some of these user-defined functions may call emit; if so, they'll call the right emit
            fcns[prefixFcnDef(i, pfa, fcnName)] = FcnDef(
                [{
                    list(t.keys())[0]: newPlaceholder(i,
                                                      list(t.values())[0])
                } for t in fcnDef.paramsPlaceholder],
                newPlaceholder(i, fcnDef.ret),
                [x.replace(fcnReplacer) for x in fcnDef.body], fcnDef.pos)

    if verbose:
        sys.stderr.write(time.asctime() +
                         " Create types for model parameters\n")

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.cells) > 0:
            sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for cellName, cell in list(pfa.cells.items()):
            if verbose:
                sys.stderr.write(time.asctime() +
                                 "         cell {0}\n".format(cellName))
            newCell = Cell(newPlaceholder(i, cell.avroType), cell.init,
                           cell.shared, cell.rollback, cell.source, cell.pos)
            cells[prefixCell(i, pfa, cellName)] = newCell
            if cell.source == "embedded":

                def converter(avroType):
                    original = jsonDecoder(cell.avroType,
                                           jsonlib.loads(cell.init))
                    return jsonlib.dumps(jsonEncoder(avroType, original))

                newCell.converter = converter

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.pools) > 0:
            sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for poolName, pool in list(pfa.pools.items()):
            if verbose:
                sys.stderr.write(time.asctime() +
                                 "         pool {0}\n".format(poolName))
            newPool = Pool(newPlaceholder(i, pool.avroType), pool.init,
                           pool.shared, pool.rollback, pool.source, pool.pos)
            pools[prefixPool(i, pfa, poolName)] = newPool
            if pool.source == "embedded":

                def converter(avroType):
                    original = jsonDecoder(pool.avroType,
                                           jsonlib.loads(pool.init))
                    return jsonlib.dumps(jsonEncoder(avroType, original))

                newPool.converter = converter

    # make sure all the types work together
    if verbose: sys.stderr.write(time.asctime() + " Resolving all types\n")
    avroTypeBuilder.resolveTypes()

    if verbose:
        sys.stderr.write(time.asctime() +
                         " Converting the model parameters themselves\n")

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.cells) > 0:
            sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for cellName, cell in list(pfa.cells.items()):
            if verbose:
                sys.stderr.write(time.asctime() +
                                 "         cell {0}\n".format(cellName))
            if cell.source == "embedded":
                newCell = cells[prefixCell(i, pfa, cellName)]
                newCell.init = newCell.converter(newCell.avroType)

    for i, pfa in enumerate(pfas):
        if verbose and len(pfa.pools) > 0:
            sys.stderr.write(time.asctime() + "     step {0}:\n".format(i + 1))
        for poolName, pool in list(pfa.pools.items()):
            if verbose:
                sys.stderr.write(time.asctime() +
                                 "         pool {0}\n".format(poolName))
            if pool.source == "embedded":
                newPool = pools[prefixPool(i, pfa, poolName)]
                newPool.init = newPool.converter(newPool.avroType)

    # randseed, doc, version, metadata, and options need to be explicitly set

    # return a (possibly checked) AST
    out = EngineConfig(name, method, inputPlaceholder, outputPlaceholder,
                       begin, action, end, fcns, zero, merge, cells, pools,
                       randseed, doc, version, metadata, options)
    if check:
        if verbose:
            sys.stderr.write(time.asctime() + " Verifying PFA validity\n")
        PFAEngine.fromAst(out)

    if verbose: sys.stderr.write(time.asctime() + " Done\n")
    return out
Exemple #3
0
def jsonDecoder(avroType, value):
    """Interpret a Pythonized JSON value according to a titus.datatype.AvroType.

    Each schema type is handled by its own guard clause, which either returns
    the decoded value or reports a mismatch.

    :type avroType: titus.datatype.AvroType
    :param avroType: schema that ``value`` is expected to follow
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: the JSON object in Python encoding
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: an object ready for PFAEngine.action
    :raises titus.errors.AvroException: if ``value`` does not fit ``avroType``
    :raises Exception: if ``avroType`` is none of the recognized type classes
    """

    def mismatch():
        # built on demand so json.dumps and ts only run when a failure is reported
        return titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))

    if isinstance(avroType, AvroNull):
        if value is None:
            return None
        raise mismatch()

    if isinstance(avroType, AvroBoolean):
        if value is True or value is False:
            return value
        raise mismatch()

    if isinstance(avroType, AvroInt):
        try:
            return int(value)
        except (ValueError, TypeError):
            pass
        raise mismatch()

    if isinstance(avroType, AvroLong):
        try:
            return long(value)
        except (ValueError, TypeError):
            pass
        raise mismatch()

    if isinstance(avroType, AvroFloat):
        try:
            return float(value)
        except (ValueError, TypeError):
            pass
        raise mismatch()

    if isinstance(avroType, AvroDouble):
        try:
            return float(value)
        except (ValueError, TypeError):
            pass
        raise mismatch()

    if isinstance(avroType, AvroBytes):
        if isinstance(value, basestring):
            return bytes(value)
        raise mismatch()

    if isinstance(avroType, AvroFixed):
        if isinstance(value, basestring):
            asBytes = bytes(value)
            # a fixed must have exactly the declared number of bytes
            if len(asBytes) == avroType.size:
                return asBytes
        raise mismatch()

    if isinstance(avroType, AvroString):
        if isinstance(value, basestring):
            return value
        raise mismatch()

    if isinstance(avroType, AvroEnum):
        if isinstance(value, basestring) and value in avroType.symbols:
            return value
        raise mismatch()

    if isinstance(avroType, AvroArray):
        if isinstance(value, (list, tuple)):
            return [jsonDecoder(avroType.items, item) for item in value]
        raise mismatch()

    if isinstance(avroType, AvroMap):
        if isinstance(value, dict):
            return dict((key, jsonDecoder(avroType.values, item)) for key, item in value.items())
        raise mismatch()

    if isinstance(avroType, AvroRecord):
        if isinstance(value, dict):
            decoded = {}
            for field in avroType.fields:
                fieldName = field.name
                if fieldName in value:
                    decoded[fieldName] = jsonDecoder(field.avroType, value[fieldName])
                elif field.default is not None:
                    # missing field: decode the schema's default instead
                    decoded[fieldName] = jsonDecoder(field.avroType, field.default)
                elif isinstance(field.avroType, AvroNull):
                    # a missing null-typed field is filled with None
                    decoded[fieldName] = None
                else:
                    raise mismatch()
            return decoded
        raise mismatch()

    if isinstance(avroType, AvroUnion):
        if isinstance(value, dict) and len(value) == 1:
            # tagged form: {typeName: content}
            (tag, content), = value.items()
            byName = dict((t.name, t) for t in avroType.types)
            if tag in byName:
                return {tag: jsonDecoder(byName[tag], content)}
        elif value is None and any(t.name == "null" for t in avroType.types):
            return None
        raise mismatch()

    # avroType is not one of the type classes handled above
    raise Exception
Exemple #4
0
def jsonEncoder(avroType, value, tagged=True):
    """Turn an engine result into Pythonized JSON, guided by its titus.datatype.AvroType.

    :type avroType: titus.datatype.AvroType
    :param avroType: type of this object
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: the object returned from PFAEngine.action
    :type tagged: bool
    :param tagged: if True, represent unions as ``{tag: value}``; if False, represent them simply as ``value``.
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the JSON object in Python encoding
    :raises titus.errors.AvroException: if ``value`` does not fit ``avroType``
    """

    def mismatch():
        # built on demand so json.dumps and ts only run when a failure is reported
        return titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))

    # value predicates are functions so they are evaluated only after the
    # schema-type test in the same condition has succeeded
    def isBool():
        return value is True or value is False

    def isNumber(kinds):
        # booleans are ints in Python, so they are excluded explicitly
        return isinstance(value, kinds) and not isBool()

    def isText():
        return isinstance(value, basestring)

    if isinstance(avroType, AvroNull) and value is None:
        return value

    if isinstance(avroType, AvroBoolean) and isBool():
        return value

    if isinstance(avroType, (AvroInt, AvroLong)) and isNumber((int, long)):
        return value

    if isinstance(avroType, (AvroFloat, AvroDouble)) and isNumber((int, long, float)):
        return float(value)

    if isinstance(avroType, AvroBytes) and isText():
        return value

    if isinstance(avroType, AvroFixed) and isText():
        asBytes = bytes(value)
        if len(asBytes) == avroType.size:
            return asBytes
        # wrong length: no other branch is tried for this value
        raise mismatch()

    if isinstance(avroType, AvroString) and isText():
        return value

    if isinstance(avroType, AvroEnum) and isText() and value in avroType.symbols:
        return value

    if isinstance(avroType, AvroArray) and isinstance(value, (list, tuple)):
        return [jsonEncoder(avroType.items, item, tagged) for item in value]

    if isinstance(avroType, AvroMap) and isinstance(value, dict):
        return dict((key, jsonEncoder(avroType.values, item, tagged)) for key, item in value.items())

    if isinstance(avroType, AvroRecord) and isinstance(value, dict):
        encoded = {}
        for field in avroType.fields:
            if field.name in value:
                encoded[field.name] = jsonEncoder(field.avroType, value[field.name], tagged)
            elif field.default is None:
                raise mismatch()
            # a missing field that has a default is simply left out of the output
        return encoded

    if isinstance(avroType, AvroUnion) and any(isinstance(t, AvroNull) for t in avroType.types) and value is None:
        return None

    if isinstance(avroType, AvroUnion):

        def firstMatch(candidate):
            # try the member types in declaration order; the first that accepts wins
            for t in avroType.types:
                try:
                    encoded = jsonEncoder(t, candidate, tagged)
                except titus.errors.AvroException:
                    continue
                if tagged:
                    return True, {t.name: encoded}
                return True, encoded
            return False, None

        # a one-entry dict may be an already tagged value: try its content first
        if isinstance(value, dict) and len(value) == 1:
            content, = value.values()
            found, result = firstMatch(content)
            if found:
                return result

        # otherwise (or if that failed) treat the value itself as untagged
        found, result = firstMatch(value)
        if found:
            return result

    raise mismatch()
Exemple #5
0
    def removeDuplicateNames(self, x, memo):
        """Replace repeated named-type definitions in a JSON schema with references to their names.

        Walks ``x`` recursively. The first time an enum, fixed, or record
        definition is seen it is stored in ``memo`` under its (namespace-qualified)
        name and kept in the output; any later, identical definition is replaced
        by that name.

        :type x: Pythonized JSON
        :param x: schema fragment to process
        :type memo: dict
        :param memo: definitions seen so far, keyed by name; updated in place
        :rtype: Pythonized JSON
        :return: a rebuilt copy of ``x`` (tuples become lists) with duplicates collapsed
        :raises titus.errors.AvroException: if a name is reused for a different definition
        """
        # sequences are rebuilt as lists, element by element
        if isinstance(x, (list, tuple)):
            return [self.removeDuplicateNames(item, memo) for item in x]

        # scalars pass through untouched
        if not isinstance(x, dict):
            return x

        # keep the tuple membership test: x["type"] may be an unhashable list or dict
        definesNamedType = "name" in x and "type" in x and x["type"] in ("enum", "fixed", "record")

        if definesNamedType:
            fullName = x["name"]
            if "namespace" in x:
                fullName = x["namespace"] + "." + fullName

            if fullName in memo:
                previous = memo[fullName]
                if previous != x:
                    raise titus.errors.AvroException("type name \"{0}\" previously defined as\n{1}\nnow defined as\n{2}".format(fullName, ts(jsonNodeToAvroType(previous)), ts(jsonNodeToAvroType(x))))
                return fullName

            # record the definition before descending so nested repeats collapse
            memo[fullName] = x

        return dict((key, self.removeDuplicateNames(val, memo)) for key, val in x.items())
Exemple #6
0
def checkData(data, avroType):
    """Validate ``data`` against ``avroType`` and return it in normalized form.

    Despite the name, this does not return a boolean: it returns the (possibly
    converted) data and raises ``TypeError`` if the data cannot be interpreted
    as the given type.

    :type data: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param data: the object to check
    :type avroType: titus.datatype.AvroType
    :param avroType: the type that ``data`` must satisfy
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the normalized data; values matched against a non-null union member are wrapped as ``{typeName: value}``
    :raises TypeError: if ``data`` does not satisfy ``avroType``
    """

    # NOTE: branches that reassign ``data`` without returning fall through to
    # the final ``return data`` at the bottom of the function.

    if isinstance(avroType, AvroNull):
        # the string "null" is accepted as a spelling of None
        if data == "null":
            data = None
        elif data is None:
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroBoolean):
        # the strings "true" and "false" are accepted as booleans
        if data == "true":
            return True
        elif data == "false":
            return False
        # NOTE(review): booleanTypes is defined outside this view; presumably
        # non-builtin boolean scalar types -- confirm
        elif isinstance(data, booleanTypes):
            return bool(data)
        elif data is True or data is False:
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroInt):
        if isinstance(data, basestring):
            # numeric strings are parsed; unparsable ones become a TypeError
            try:
                data = int(data)
            except ValueError:
                raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))
        # NOTE(review): integerTypes is defined outside this view -- confirm what it covers
        elif isinstance(data, integerTypes):
            data = int(data)
        elif isinstance(data, (int, long)):
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroLong):
        if isinstance(data, basestring):
            try:
                data = int(data)
            except ValueError:
                raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))
        elif isinstance(data, integerTypes):
            data = int(data)
        elif isinstance(data, (int, long)):
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroFloat):
        if isinstance(data, basestring):
            try:
                data = float(data)
            except ValueError:
                raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))
        # NOTE(review): floatTypes is defined outside this view -- confirm what it covers
        elif isinstance(data, floatTypes):
            data = float(data)
        elif isinstance(data, (int, long)):
            # integers are promoted to float
            data = float(data)
        elif isinstance(data, float):
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroDouble):
        if isinstance(data, basestring):
            try:
                data = float(data)
            except ValueError:
                raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))
        elif isinstance(data, floatTypes):
            return float(data)
        elif isinstance(data, (int, long)):
            return float(data)
        elif isinstance(data, float):
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, (AvroBytes, AvroFixed)):
        # unicode text is encoded as UTF-8; the size of a fixed is not checked here
        if isinstance(data, unicode):
            return data.encode("utf-8", "replace")
        elif isinstance(data, str):
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, (AvroString, AvroEnum)):
        # str input is decoded as UTF-8; enum symbols are not checked here
        if isinstance(data, str):
            return data.decode("utf-8", "replace")
        elif isinstance(data, unicode):
            return data
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroArray):
        # anything exposing __iter__ is accepted and rebuilt as a list
        if hasattr(data, "__iter__"):
            return [checkData(x, avroType.items) for x in data]
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroMap):
        # anything iterable and indexable is accepted and rebuilt as a dict
        if hasattr(data, "__iter__") and hasattr(data, "__getitem__"):
            newData = {}
            for key in data:
                value = checkData(data[key], avroType.values)
                # keys are normalized the same way as string values
                if isinstance(key, str):
                    newData[key.decode("utf-8", "replace")] = value
                elif isinstance(key, unicode):
                    newData[key] = value
                else:
                    raise TypeError("expecting {0}, found key {1}".format(ts(avroType), key))
            return newData
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroRecord):
        if hasattr(data, "__iter__") and hasattr(data, "__getitem__"):
            newData = {}
            # only fields declared in the schema are copied; every one must be present
            for field in avroType.fields:
                try:
                    value = data[field.name]
                except KeyError:
                    raise TypeError("expecting {0}, couldn't find key {1}".format(ts(avroType), field.name))
                newData[field.name] = checkData(value, field.avroType)
            return newData
        else:
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    elif isinstance(avroType, AvroUnion):
        # a one-entry dict is treated as an explicitly tagged value {typeName: value}
        if isinstance(data, dict) and len(data) == 1:
            tag, = data.keys()
            value, = data.values()
            for tpe in avroType.types:
                if tpe.name == tag:
                    # null is returned bare; every other member keeps its tag
                    if tag == "null":
                        return checkData(value, tpe)
                    else:
                        return {tag: checkData(value, tpe)}
            # a one-entry dict whose key matches no member is rejected here,
            # without trying the untagged interpretation below
            raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

        # untagged value: the first member type (in declaration order) that accepts it wins
        for tpe in avroType.types:
            try:
                newData = checkData(data, tpe)
            except TypeError:
                pass
            else:
                if tpe.name == "null":
                    return newData
                else:
                    return {tpe.name: newData}
        raise TypeError("expecting {0}, found {1}".format(ts(avroType), data))

    # reached by branches that converted ``data`` in place, and by any
    # avroType that matched none of the cases above
    return data
Exemple #7
0
def _isInteger(datum):
    """Tell whether ``datum`` is an ``int`` or ``long`` that is not ``True`` or ``False``."""
    return isinstance(datum, (int, long)) and datum is not True and datum is not False


def _isReal(datum):
    """Tell whether ``datum`` is an ``int``, ``long``, or ``float`` that is not ``True`` or ``False``."""
    return isinstance(datum, (int, long, float)) and datum is not True and datum is not False


def _compareReals(x, y):
    """Compare two numbers, treating NaN as equal to NaN and greater than every other number."""
    xIsNaN = math.isnan(x)
    yIsNaN = math.isnan(y)
    if xIsNaN and yIsNaN:
        return 0
    elif xIsNaN:
        return 1
    elif yIsNaN:
        return -1
    else:
        return cmp(x, y)


def _unionBranch(avroType, datum):
    """Work out which member of a union a datum belongs to.

    :type avroType: titus.datatype.AvroUnion
    :param avroType: the union type
    :type datum: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param datum: tagged (``{typeName: value}``) or untagged value
    :rtype: (int, titus.datatype.AvroType, object)
    :return: position of the member type in the union, the member type, and the datum with any tag removed
    """
    if isinstance(datum, dict) and len(datum) == 1:
        (tag, inner), = datum.items()
        for index, member in enumerate(avroType.types):
            if member.name == tag:
                return index, member, inner
        # No member is named by the single key, so this is not a tagged value
        # (it may be an untagged one-entry map or record); match it structurally.

    found = None
    for index, member in enumerate(avroType.types):
        try:
            jsonEncoder(member, datum)
        except titus.errors.AvroException:
            continue
        # The scan deliberately does not stop at the first success: when
        # several members accept the datum, the last one is used.
        found = (index, member, datum)
    if found is None:
        raise titus.errors.AvroException()
    return found


def compare(avroType, x, y):
    """Returns -1, 0, or 1 depending on whether x is less than, equal to, or greater than y, according to the schema.

    Assumes that x and y are valid examples of the schema.

    Floats and doubles are ordered with NaN equal to NaN and greater than any
    other number. Enum symbols are ordered by their position in the symbol
    list. Arrays are compared item by item, the shorter array sorting first
    when one is a prefix of the other. Records are compared field by field,
    skipping fields whose order is ``"ignore"`` and reversing fields whose
    order is ``"descending"``. Union values (tagged or untagged) are ordered
    first by the position of their member type in the union, then by value.

    :type avroType: titus.datatype.AvroType
    :param avroType: type of this object
    :type x: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param x: Avro object in Python form
    :type y: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param y: Avro object in Python form
    :rtype: int
    :return: -1, 0, or 1
    :raises NotImplementedError: if ``avroType`` is a map and both values are dicts
    :raises titus.errors.AvroException: if x or y does not match ``avroType``
    """

    if isinstance(avroType, AvroNull) and x is None and y is None:
        return 0

    elif isinstance(avroType, AvroBoolean) and (x is True or x is False) and (y is True or y is False):
        return cmp(x, y)    # agrees with Java

    elif isinstance(avroType, (AvroInt, AvroLong)) and _isInteger(x) and _isInteger(y):
        return cmp(x, y)

    elif isinstance(avroType, (AvroFloat, AvroDouble)) and _isReal(x) and _isReal(y):
        # Floats previously returned cmp(x, y) before reaching the NaN handling,
        # so they ordered NaN differently from doubles; both now share one rule.
        return _compareReals(x, y)

    elif isinstance(avroType, (AvroBytes, AvroFixed, AvroString)) and isinstance(x, basestring) and isinstance(y, basestring):
        return cmp(x, y)

    elif isinstance(avroType, AvroEnum) and isinstance(x, basestring) and x in avroType.symbols and isinstance(y, basestring) and y in avroType.symbols:
        return cmp(avroType.symbols.index(x), avroType.symbols.index(y))

    elif isinstance(avroType, AvroArray) and isinstance(x, (list, tuple)) and isinstance(y, (list, tuple)):
        for xi, yi in zip(x, y):
            comparison = compare(avroType.items, xi, yi)
            if comparison != 0:
                return comparison
        # All shared positions are equal, so the lengths decide.
        return cmp(len(x), len(y))

    elif isinstance(avroType, AvroMap) and isinstance(x, dict) and isinstance(y, dict):
        raise NotImplementedError("Avro has no order defined for maps???")

    elif isinstance(avroType, AvroRecord) and isinstance(x, dict) and isinstance(y, dict):
        for field in avroType.fields:
            if field.order == "ignore":
                continue
            comparison = compare(field.avroType, x[field.name], y[field.name])
            if comparison != 0:
                if field.order == "descending":
                    return -comparison
                else:
                    return comparison
        return 0

    elif isinstance(avroType, AvroUnion):
        xIndex, xType, x = _unionBranch(avroType, x)
        yIndex, yType, y = _unionBranch(avroType, y)
        if xIndex == yIndex:
            return compare(xType, x, y)
        else:
            return cmp(xIndex, yIndex)

    else:
        raise titus.errors.AvroException("{0} or {1} does not match schema {2}".format(json.dumps(x), json.dumps(y), ts(avroType)))
Exemple #8
0
def jsonEncoder(avroType, value):
    """Check a Python value against an Avro type and return it in the form used for JSON output.

    Most values are returned unchanged; floats and doubles are passed through
    ``float``, fixed values through ``bytes`` (and must have the declared
    size), arrays/maps/records are rebuilt with their contents encoded
    recursively, and union values are wrapped as ``{typeName: value}``.

    Record fields that are missing from ``value`` are omitted from the output
    if the field has a non-``None`` default; otherwise they are an error.

    :type avroType: titus.datatype.AvroType
    :param avroType: type that ``value`` must conform to
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: Avro object in Python form; union members may be tagged or untagged
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the encoded value
    :raises titus.errors.AvroException: if ``value`` does not match ``avroType``
    """
    if isinstance(avroType, AvroNull) and value is None:
        return value
    elif isinstance(avroType, AvroBoolean) and (value is True or value is False):
        return value
    elif isinstance(avroType, AvroInt) and isinstance(value, (int, long)) and value is not True and value is not False:
        return value
    elif isinstance(avroType, AvroLong) and isinstance(value, (int, long)) and value is not True and value is not False:
        return value
    elif isinstance(avroType, AvroFloat) and isinstance(value, (int, long, float)) and value is not True and value is not False:
        return float(value)
    elif isinstance(avroType, AvroDouble) and isinstance(value, (int, long, float)) and value is not True and value is not False:
        return float(value)
    elif isinstance(avroType, AvroBytes) and isinstance(value, basestring):
        return value
    elif isinstance(avroType, AvroFixed) and isinstance(value, basestring):
        out = bytes(value)
        if len(out) == avroType.size:
            return out
        # wrong size: fall out of the chain to the AvroException below
    elif isinstance(avroType, AvroString) and isinstance(value, basestring):
        return value
    elif isinstance(avroType, AvroEnum) and isinstance(value, basestring) and value in avroType.symbols:
        return value
    elif isinstance(avroType, AvroArray) and isinstance(value, (list, tuple)):
        return [jsonEncoder(avroType.items, x) for x in value]
    elif isinstance(avroType, AvroMap) and isinstance(value, dict):
        return dict((k, jsonEncoder(avroType.values, v)) for k, v in value.items())
    elif isinstance(avroType, AvroRecord) and isinstance(value, dict):
        out = {}
        for field in avroType.fields:
            if field.name in value:
                out[field.name] = jsonEncoder(field.avroType, value[field.name])
            elif field.default is not None:
                pass
            else:
                raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
        return out

    elif isinstance(avroType, AvroUnion):
        looksTagged = isinstance(value, dict) and len(value) == 1

        # 1. A tagged value: the tag decides the member type. Previously the
        #    tag was ignored and the first member accepting the inner value
        #    won, so {"int": 3} in a [double, int] union became {"double": 3.0}.
        if looksTagged:
            (tag, val), = value.items()
            for t in avroType.types:
                if t.name == tag:
                    try:
                        return {t.name: jsonEncoder(t, val)}
                    except titus.errors.AvroException:
                        # the key only looked like a tag; try the other interpretations
                        break

        # 2. An untagged value: wrap it with the first member type it fits.
        #    This runs before the lenient fallback so that a one-entry map or
        #    record is encoded as such rather than by its inner value.
        for t in avroType.types:
            try:
                return {t.name: jsonEncoder(t, value)}
            except titus.errors.AvroException:
                pass

        # 3. Lenient fallback kept from the earlier behavior: a single-key dict
        #    with an unrecognized key is encoded by its inner value alone, so
        #    anything that was accepted before is still accepted.
        if looksTagged:
            for t in avroType.types:
                try:
                    return {t.name: jsonEncoder(t, val)}
                except titus.errors.AvroException:
                    pass

    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
Exemple #9
0
def jsonDecoder(avroType, value):
    """Convert a value loaded from JSON into Python form according to an Avro type.

    Numeric types are coerced with ``int``, ``long``, or ``float``; bytes and
    fixed values are passed through ``bytes`` (fixed must have the declared
    size); arrays, maps, and records are rebuilt with their contents decoded
    recursively. Record fields missing from ``value`` are filled from the
    field's default if it is not ``None``, or with ``None`` if the field's
    type is null. Union values must be tagged as ``{typeName: value}`` (and
    are returned tagged), except that ``None`` is accepted as-is when the
    union has a null member.

    :type avroType: titus.datatype.AvroType
    :param avroType: type that ``value`` is expected to have
    :type value: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :param value: Pythonized JSON to decode
    :rtype: dicts, lists, strings, numbers, ``True``, ``False``, ``None``
    :return: the decoded value
    :raises titus.errors.AvroException: if ``value`` does not match ``avroType``
    :raises Exception: if ``avroType`` is not one of the recognized Avro type classes
    """
    # int()/long()/float() raise TypeError for None, lists, and dicts, and
    # OverflowError for infinities; those are schema mismatches just like
    # ValueError and must surface as AvroException, not leak to the caller.
    conversionErrors = (ValueError, TypeError, OverflowError)

    if isinstance(avroType, AvroNull):
        if value is None:
            return value
    elif isinstance(avroType, AvroBoolean):
        if value is True or value is False:
            return value
    elif isinstance(avroType, AvroInt):
        try:
            return int(value)
        except conversionErrors:
            pass
    elif isinstance(avroType, AvroLong):
        try:
            return long(value)
        except conversionErrors:
            pass
    elif isinstance(avroType, AvroFloat):
        try:
            return float(value)
        except conversionErrors:
            pass
    elif isinstance(avroType, AvroDouble):
        try:
            return float(value)
        except conversionErrors:
            pass
    elif isinstance(avroType, AvroBytes):
        if isinstance(value, basestring):
            return bytes(value)
    elif isinstance(avroType, AvroFixed):
        if isinstance(value, basestring):
            out = bytes(value)
            if len(out) == avroType.size:
                return out
    elif isinstance(avroType, AvroString):
        if isinstance(value, basestring):
            return value
    elif isinstance(avroType, AvroEnum):
        if isinstance(value, basestring) and value in avroType.symbols:
            return value
    elif isinstance(avroType, AvroArray):
        if isinstance(value, (list, tuple)):
            return [jsonDecoder(avroType.items, x) for x in value]
    elif isinstance(avroType, AvroMap):
        if isinstance(value, dict):
            return dict((k, jsonDecoder(avroType.values, v)) for k, v in value.items())
    elif isinstance(avroType, AvroRecord):
        if isinstance(value, dict):
            out = {}
            for field in avroType.fields:
                if field.name in value:
                    out[field.name] = jsonDecoder(field.avroType, value[field.name])
                elif field.default is not None:
                    out[field.name] = jsonDecoder(field.avroType, field.default)
                elif isinstance(field.avroType, AvroNull):
                    out[field.name] = None
                else:
                    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))
            return out
    elif isinstance(avroType, AvroUnion):
        if isinstance(value, dict) and len(value) == 1:
            (tag, val), = value.items()
            types = dict((x.name, x) for x in avroType.types)
            if tag in types:
                return {tag: jsonDecoder(types[tag], val)}
        elif value is None and "null" in [x.name for x in avroType.types]:
            return None
    else:
        # avroType is none of the classes handled above
        raise Exception

    # Every recognized type falls through to here when the value did not fit.
    raise titus.errors.AvroException("{0} does not match schema {1}".format(json.dumps(value), ts(avroType)))