Example #1
    def __init__(self, column_def, agg_func, name=None, fname=None, astype=None):
        """
        Args:
            column_def: A single column definition, or a list of column
                definitions, as accepted by Column.
            agg_func: A single row-wise aggregation function, or a list of
                them (passed to pandas' groupby.agg()).
            name: List of strings of names for the column definitions. Must be
                of the same length as column_def, or None, in which case the
                names default to the string representation of each column
                definition.
            fname: List of strings of names for the aggregation functions.
                Must be of the same length as agg_func, or None, in which case
                the names default to the string representation of each
                aggregation function.
            astype: List of pandas dtypes, passed to Column together with the
                column definitions. Must be of the same length as column_def,
                or None, in which case no casting is performed.

        Names for the resulting column reductions are of the format `columnname_functionname`.
        """

        # make list of column reductions from arguments --> each ColumnReduction needs a Column(definition) and an agg_func
        column_def = util.make_list(column_def)
        agg_func = util.make_list(agg_func)
        astype = util.make_list(astype)

        if len(astype) == 1:
            astype = astype*len(column_def)

        if len(astype) != len(column_def):
            raise ValueError("astype must be a datatype, or a list of datatypes of same length as column_def")

        column_reductions = [ColumnReduction(Column(defn,at), func) 
                             for (defn, at), func in product(zip(column_def, astype), agg_func)]

        # make list of names from the arguments --> same length as the list of column reductions above!
        name = [str(c) for c in column_def] if name is None else util.make_list(name)
        if len(name) != len(column_def):
            raise ValueError("name and column_def must be same length, or name must be None.")
        
        fname = [str(a) for a in agg_func] if fname is None else util.make_list(fname)
        if len(fname) != len(agg_func):
            raise ValueError("fname and agg_func must be same length, or fname must be None.")

        # if there's only one agg_func, you can override the function-naming
        # scheme by passing fname=False; then only the name is used
        if len(agg_func) == 1 and fname == [False]:
            column_names = name
        else:
            column_names = ['%s_%s'%(cn, fn) for cn, fn in product(name, fname)]

        ColumnFunction.__init__(self, column_reductions, column_names)
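The resulting column names come from the same cross product that pairs every column definition with every aggregation function. A minimal, self-contained sketch of that naming scheme, with made-up column and function names (illustration only, not drain code):

from itertools import product

names = ['income', 'age']   # stand-ins for the name list
fnames = ['mean', 'max']    # stand-ins for the fname list

# every column definition is paired with every aggregation function,
# producing columnname_functionname labels
column_names = ['%s_%s' % (cn, fn) for cn, fn in product(names, fnames)]
print(column_names)  # ['income_mean', 'income_max', 'age_mean', 'age_max']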
Example #2
    def map_inputs(self):
        kwargs = {}
        args = []

        if hasattr(self, 'inputs_mapping'):
            inputs_mapping = util.make_list(self.inputs_mapping)

            if len(self.inputs) < len(inputs_mapping):
                raise ValueError('Too many inputs_mappings')

            for input, mapping in zip(self.inputs, inputs_mapping):
                result = input.get_result()
                if isinstance(mapping, dict):
                    # pass through any missing keys, so {} is the identity
                    # do it first so that inputs_mapping overrides keys
                    for k in set(result.keys()).\
                            difference(set(mapping.keys())):
                        kwargs[k] = result[k]

                    for k in mapping:
                        if mapping[k] is not None:
                            kwargs[mapping[k]] = result[k]
                elif isinstance(mapping, list):
                    if len(mapping) > len(result):
                        raise ValueError("More keywords than results")
                    for kw, r in zip(mapping, result):
                        if kw is not None:
                            kwargs[kw] = r
                elif isinstance(mapping, string_types):
                    kwargs[mapping] = input.get_result()
                elif mapping is None:  # drop Nones
                    pass
                else:
                    raise ValueError(
                        'Input mapping is neither dict, list, nor str: %s'
                        % mapping)

            mapped_inputs = len(inputs_mapping)
        else:
            mapped_inputs = 0

        for i in range(mapped_inputs, len(self.inputs)):
            result = self.inputs[i].get_result()
            # without a mapping we handle two cases
            # when the result is a dict merge it with a global dict
            if isinstance(result, dict):
                # but do not override
                kwargs.update(
                    {k: v
                     for k, v in result.items() if k not in kwargs})
            # otherwise use it as a positional argument
            else:
                args.append(result)

        return args, kwargs
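The three mapping shapes are easiest to see side by side. Below is a minimal, self-contained sketch of the same dispatch logic operating on plain values instead of Step objects; the results and mappings are invented for illustration:

def apply_mapping(result, mapping, kwargs):
    # simplified version of the dispatch above
    if isinstance(mapping, dict):
        # pass through keys the mapping doesn't mention, then rename the rest
        for k in set(result) - set(mapping):
            kwargs[k] = result[k]
        for k, new in mapping.items():
            if new is not None:      # a None value drops the key
                kwargs[new] = result[k]
    elif isinstance(mapping, list):
        # bind elements of an iterable result to keywords, in order
        for kw, r in zip(mapping, result):
            if kw is not None:
                kwargs[kw] = r
    elif isinstance(mapping, str):
        kwargs[mapping] = result     # bind the whole result to one keyword

kwargs = {}
apply_mapping({'X': 1, 'y': 2}, {'X': 'features'}, kwargs)
print(kwargs)  # {'y': 2, 'features': 1}

kwargs = {}
apply_mapping([10, 20], ['a', None], kwargs)
print(kwargs)  # {'a': 10} -- the 20 is dropped by the None

kwargs = {}
apply_mapping('hello', 'greeting', kwargs)
print(kwargs)  # {'greeting': 'hello'}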
Example #3
def parse_workflows(workflows):
    steps = []
    for s in workflows:
        if s.endswith('.yaml'):
            # serialized workflows load directly from yaml
            steps += serialize.load(s)
        else:
            # otherwise s is a Python expression like 'pkg.module.fn(args)';
            # import the module named by everything before the last dot
            modulename = s[:s.split('(')[0].rfind('.')]
            exec('import %s' % modulename)
            # bare names get '()' appended so they are called with no arguments
            if s.find('(') < 0:
                s += '()'
            ss = util.make_list(eval(s))
            logging.info('Loaded %s with %s leaf steps' % (s, len(ss)))
            steps += ss
    return steps
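The string surgery is the subtle part: everything before the last dot of the call-free prefix names the module to import. A couple of standalone checks with hypothetical workflow strings:

# hypothetical workflow string of the form 'package.module.function(args)'
s = 'my_project.workflows.model_search(n_folds=5)'
modulename = s[:s.split('(')[0].rfind('.')]
assert modulename == 'my_project.workflows'

# a bare name gets '()' appended before eval, so it is called with no args
s = 'my_project.workflows.baseline'
if s.find('(') < 0:
    s += '()'
assert s == 'my_project.workflows.baseline()'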
Example #4
File: step.py Project: dean12/drain
    def map_inputs(self):
        kwargs = {}
        args = []

        if hasattr(self, 'inputs_mapping'):
            inputs_mapping = util.make_list(self.inputs_mapping)
            
            if len(self.inputs) < len(inputs_mapping):
                raise ValueError('Too many inputs_mappings')

            for input, mapping in zip(self.inputs, inputs_mapping):
                result = input.get_result()
                if isinstance(mapping, dict):
                    # pass through any missing keys, so {} is the identity
                    # do it first so that inputs_mapping overrides keys
                    for k in set(result.keys()).difference(set(mapping.keys())):
                        kwargs[k] = result[k]

                    for k in mapping:
                        if mapping[k] is not None:
                            kwargs[mapping[k]] = result[k]

                elif isinstance(mapping, str):
                    kwargs[mapping] = input.get_result()
                elif mapping is None: # drop Nones
                    pass
                else:
                    raise ValueError('Input mapping is neither dict nor str: %s' % mapping)

            mapped_inputs = len(inputs_mapping)
        else:
            mapped_inputs = 0

        for i in range(mapped_inputs, len(self.inputs)):
            result = self.inputs[i].get_result()
            # without a mapping we handle two cases
            # when the result is a dict merge it with a global dict
            if isinstance(result, dict):
                # but do not override
                kwargs.update({k: v for k, v in result.items() if k not in kwargs})
            # otherwise use it as a positional argument
            else:
                args.append(result)

        return args, kwargs
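The tail loop gives unmapped inputs a sensible default: dict results merge into kwargs without clobbering keys an explicit mapping already set, and anything else becomes a positional argument. A minimal sketch of that merge rule with invented results:

kwargs = {'X': 'mapped'}   # already set by an explicit mapping
args = []

for result in [{'X': 'unmapped', 'y': 2}, 42]:   # hypothetical leftover results
    if isinstance(result, dict):
        # merge, but never override an existing keyword
        kwargs.update({k: v for k, v in result.items() if k not in kwargs})
    else:
        args.append(result)

print(args, kwargs)  # [42] {'X': 'mapped', 'y': 2}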
Example #5
        drain.PATH = os.path.abspath(args.path)
    elif drain.PATH is None:
        raise ValueError(
            'Must pass path argument or set DRAINPATH environment variable')

    steps = []
    for s in args.steps.split(';'):
        if s.endswith('.yaml'):
            steps += serialize.load(s)
        else:
            modulename, fname = s.split('::')
            mod = importlib.import_module(modulename)
            s_attr = getattr(mod, fname)
            # if s is callable, it should return a collection of Steps
            # otherwise assume it is a collection of Steps
            ss = util.make_list(
                s_attr() if hasattr(s_attr, '__call__') else s_attr)
            logging.info('Loaded %s with %s leaf steps' % (s, len(ss)))
            steps += ss

    if args.Drakeinput is None and os.path.exists('Drakefile'):
        args.Drakeinput = 'Drakefile'
    drakeinput = os.path.abspath(args.Drakeinput) if args.Drakeinput else None

    workflow = drake.to_drakefile(steps,
                                  preview=args.preview,
                                  debug=args.debug,
                                  input_drakefile=drakeinput,
                                  bindir=os.path.dirname(__file__))

    if not args.preview:
        with open(args.drakeoutput, 'w') as drakefile:
            logging.info('Writing drake workflow %s' % args.drakeoutput)
            drakefile.write(workflow)
Example #6
    step.OUTPUTDIR = os.path.abspath(args.outputdir)
    drain.yaml.configure()

    steps = []
    for s in args.steps.split(';'):
        if s.endswith('.yaml'):
            steps += drain.yaml.load(s)
        else:
            print(s)
            modulename, fname = s.split('::')
            mod = importlib.import_module(modulename)
            s = getattr(mod, fname)
            # if s is callable, it should return a collection of Steps
            # otherwise assume it is a collection of Steps
            s = util.make_list(s() if hasattr(s, '__call__') else s)
            steps += s

    if args.Drakeinput is None and os.path.exists('Drakefile'):
        args.Drakeinput = 'Drakefile'
    drakeinput = os.path.abspath(args.Drakeinput) if args.Drakeinput else None

    workflow = drake.to_drakefile(steps, preview=args.preview, debug=args.debug, input_drakefile=drakeinput)

    if not args.preview:
        with open(args.drakeoutput, 'w') as drakefile:
            drakefile.write(workflow)
    else:
        sys.stdout.write(workflow)

    drake_args = list(drake_args) if drake_args is not None else []
Example #7
def dapply(self,
           fn,
           pairwise=False,
           symmetric=True,
           diagonal=False,
           block=None,
           **kwargs):
    """
    Apply function to each step object in the index

    Args:
        fn: function to apply. If a list is given, each function is applied.
        pairwise: whether to apply the function to pairs of steps
        symmetric, diagonal, block: passed to apply_pairwise when pairwise=True
        kwargs: keyword arguments to pass to each function. Arguments
            with list values are grid-searched using util.dict_product.

    Returns: a StepFrame or StepSeries
    """
    search_keys = [
        k for k, v in kwargs.items() if isinstance(v, list) and len(v) > 1
    ]
    functions = util.make_list(fn)
    search = list(product(functions, util.dict_product(kwargs)))

    results = []
    for fn, kw in search:
        if not pairwise:
            r = self.index.to_series().apply(lambda step: fn(step, **kw))
        else:
            r = apply_pairwise(self,
                               fn,
                               symmetric=symmetric,
                               diagonal=diagonal,
                               block=block,
                               **kw)

        name = [] if len(functions) == 1 else [fn.__name__]
        name += util.dict_subset(kw, search_keys).values()

        if isinstance(r, pd.DataFrame):
            columns = pd.MultiIndex.from_tuples(
                [tuple(name + util.make_list(c)) for c in r.columns])
            r.columns = columns
        else:
            r.name = tuple(name)
        results.append(r)

    if len(results) > 1:
        result = pd.concat(results, axis=1)
        # get subset of parameters that were searched over
        column_names = [] if len(functions) == 1 else [None]
        column_names += search_keys
        column_names += [None] * (len(result.columns.names) - len(column_names))
        result.columns.names = column_names

        return StepFrame(result)
    else:
        result = results[0]
        if isinstance(result, pd.DataFrame):
            return StepFrame(result)
        else:
            result.name = functions[0].__name__
            return StepSeries(result)
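The grid search comes from expanding list-valued keyword arguments into a cross product before the loop. util.dict_product isn't shown on this page; the sketch below reproduces its assumed behavior with itertools.product, using invented parameter names:

from itertools import product

def dict_product(d):
    # assumed behavior of util.dict_product: expand list values into
    # one dict per combination
    keys = list(d)
    value_lists = [v if isinstance(v, list) else [v] for v in d.values()]
    return [dict(zip(keys, combo)) for combo in product(*value_lists)]

# e.g. dapply(fn, threshold=[0.1, 0.5], normalize=True) would search:
print(dict_product({'threshold': [0.1, 0.5], 'normalize': True}))
# [{'threshold': 0.1, 'normalize': True}, {'threshold': 0.5, 'normalize': True}]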
Example #8
        args.preview = True

    step.OUTPUTDIR = os.path.abspath(args.outputdir)
    drain.yaml.configure()

    steps = []
    for s in args.steps.split(';'):
        if s.endswith('.yaml'):
            steps +=  drain.yaml.load(s)
        else:
            modulename, fname = s.split('::')
            mod = importlib.import_module(modulename)
            s_attr = getattr(mod, fname)
            # if s is callable, it should return a collection of Steps
            # otherwise assume it is a collection of Steps
            ss = util.make_list(s_attr() if hasattr(s_attr, '__call__') else s_attr)
            logging.info('Loaded %s with %s leaf steps' % (s, len(ss)))
            steps += ss

    if args.Drakeinput is None and os.path.exists('Drakefile'):
        args.Drakeinput = 'Drakefile'
    drakeinput = os.path.abspath(args.Drakeinput) if args.Drakeinput else None

    workflow = drake.to_drakefile(steps, preview=args.preview, debug=args.debug, input_drakefile=drakeinput)

    if not args.preview:
        with open(args.drakeoutput, 'w') as drakefile:
            logging.info('Writing drake workflow %s' % args.drakeoutput)
            drakefile.write(workflow)
    else:
        sys.stdout.write(workflow)
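The 'module::name' form resolves with nothing beyond the standard library. A self-contained sketch of the mechanics, exercised against stdlib attributes instead of drain steps:

import importlib

def resolve(spec):
    # resolve 'module::name'; call it if it is callable, like the code above
    modulename, fname = spec.split('::')
    attr = getattr(importlib.import_module(modulename), fname)
    return attr() if callable(attr) else attr

print(resolve('math::pi'))    # not callable: returned as-is
print(resolve('os::getcwd'))  # callable: invoked with no arguments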