def __init__(self, column_def, agg_func, name=None, fname=None, astype=None):
    """
    Args:
        column_def: List of (or a single) column definitions, as accepted
            by Column.
        agg_func: List of (or a single) row-wise aggregation functions
            (passed to pandas' groupby.agg()).
        name: List of strings of names for the column definitions. Must be
            of the same length as column_def, or be None, in which case the
            names default to the string representation of each column
            definition.
        fname: List of strings of names for the aggregation functions. Must
            be of the same length as agg_func, or be None, in which case the
            names default to the string representation of each aggregation
            function.
        astype: List of pandas dtypes, which get passed to Column together
            with the column definitions. Must be of the same length as
            column_def, or be None, in which case no casting is performed.

    Names for the resulting column reductions are of the format
    `columnname_functionname`.
    """
    # make a list of column reductions from the arguments
    # --> each ColumnReduction needs a Column(definition) and an agg_func
    column_def = util.make_list(column_def)
    agg_func = util.make_list(agg_func)
    astype = util.make_list(astype)

    if len(astype) == 1:
        astype = astype * len(column_def)
    if len(astype) != len(column_def):
        raise ValueError("astype must be a datatype, or a list of "
                         "datatypes of same length as column_def")

    column_reductions = [ColumnReduction(Column(defn, at), func)
                         for (defn, at), func
                         in product(zip(column_def, astype), agg_func)]

    # make a list of names from the arguments
    # --> same length as the list of column reductions above!
    name = [str(c) for c in column_def] if name is None else util.make_list(name)
    if len(name) != len(column_def):
        raise ValueError("name and column_def must be same length, "
                         "or name must be None.")

    fname = [str(a) for a in agg_func] if fname is None else util.make_list(fname)
    if len(fname) != len(agg_func):
        raise ValueError("fname and agg_func must be same length, "
                         "or fname must be None.")

    # if there's only one agg_func, you can override the function naming
    # scheme by passing fname=False; then only the name is used
    if len(agg_func) == 1 and fname == [False]:
        column_names = name
    else:
        column_names = ['%s_%s' % (cn, fn) for cn, fn in product(name, fname)]

    ColumnFunction.__init__(self, column_reductions, column_names)
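# A minimal sketch of the resulting naming scheme, using plain pandas only.
# The DataFrame, the 'group' column, and the argument values below are made
# up for illustration; they are not part of the class above.
def _example_column_naming():
    import pandas as pd
    from itertools import product

    df = pd.DataFrame({'group': ['a', 'a', 'b'], 'amount': [1.0, 2.0, 3.0]})
    column_def, agg_func = ['amount'], ['sum', 'mean']

    # one reduction per (column, function) pair, named columnname_functionname
    agg = df.groupby('group')['amount'].agg(agg_func)
    agg.columns = ['%s_%s' % (c, f) for c, f in product(column_def, agg_func)]
    return agg  # columns: amount_sum, amount_mean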
def map_inputs(self):
    kwargs = {}
    args = []

    if hasattr(self, 'inputs_mapping'):
        inputs_mapping = util.make_list(self.inputs_mapping)

        if len(self.inputs) < len(inputs_mapping):
            raise ValueError('Too many inputs_mappings')

        for input, mapping in zip(self.inputs, inputs_mapping):
            result = input.get_result()
            if isinstance(mapping, dict):
                # pass through any missing keys, so {} is the identity
                # do it first so that inputs_mapping overrides keys
                for k in set(result.keys()).difference(set(mapping.keys())):
                    kwargs[k] = result[k]
                for k in mapping:
                    if mapping[k] is not None:
                        kwargs[mapping[k]] = result[k]
            elif isinstance(mapping, list):
                if len(mapping) > len(result):
                    raise ValueError("More keywords than results")
                for kw, r in zip(mapping, result):
                    if kw is not None:
                        kwargs[kw] = r
            elif isinstance(mapping, string_types):
                kwargs[mapping] = input.get_result()
            elif mapping is None:
                # drop Nones
                pass
            else:
                raise ValueError(
                    'Input mapping is neither dict, list, nor str: %s' % mapping)

        mapped_inputs = len(inputs_mapping)
    else:
        mapped_inputs = 0

    for i in range(mapped_inputs, len(self.inputs)):
        result = self.inputs[i].get_result()
        # without a mapping we handle two cases:
        # when the result is a dict, merge it with the global kwargs dict
        if isinstance(result, dict):
            # but do not override
            kwargs.update(
                {k: v for k, v in result.items() if k not in kwargs})
        # otherwise use it as a positional argument
        else:
            args.append(result)

    return args, kwargs
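# A minimal sketch of the inputs_mapping semantics handled above, using plain
# dicts and tuples in place of Step results; the variable names and values
# are made up for illustration.
def _example_inputs_mapping():
    # dict mapping: unmapped keys pass through (so {} is the identity),
    # mapped keys are renamed, None drops a key
    result = {'X': [[1, 2]], 'y': [0, 1], 'aux': 'extra'}
    mapping = {'aux': None, 'X': 'features'}
    kwargs = {k: result[k] for k in set(result) - set(mapping)}
    kwargs.update({mapping[k]: result[k] for k in mapping
                   if mapping[k] is not None})
    # kwargs == {'y': [0, 1], 'features': [[1, 2]]}

    # list mapping: bind a tuple result positionally, None skips an element
    result = ([[1, 2]], [0, 1])
    mapping = ['features', None]
    kwargs = {kw: r for kw, r in zip(mapping, result) if kw is not None}
    # kwargs == {'features': [[1, 2]]}
    return kwargs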
def parse_workflows(workflows):
    steps = []
    for s in workflows:
        if s.endswith('.yaml'):
            steps += serialize.load(s)
        else:
            # import the module containing the workflow function
            modulename = s[:s.split('(')[0].rfind('.')]
            exec('import %s' % modulename)
            # call the function with no arguments unless a call is spelled out
            if s.find('(') < 0:
                s += '()'
            ss = util.make_list(eval('%s' % s))
            logging.info('Loaded %s with %s leaf steps' % (s, len(ss)))
            steps += ss
    return steps
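# Hypothetical workflow specifications accepted by parse_workflows; the
# module and function names below are made up for illustration.
# workflows = [
#     'saved_workflow.yaml',                    # loaded with serialize.load
#     'myproject.workflows.models',             # no '(' --> called as models()
#     'myproject.workflows.models(n_folds=5)',  # evaluated as written
# ]
# steps = parse_workflows(workflows)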
def map_inputs(self):
    kwargs = {}
    args = []

    if hasattr(self, 'inputs_mapping'):
        inputs_mapping = util.make_list(self.inputs_mapping)

        if len(self.inputs) < len(inputs_mapping):
            raise ValueError('Too many inputs_mappings')

        for input, mapping in zip(self.inputs, inputs_mapping):
            result = input.get_result()
            if isinstance(mapping, dict):
                # pass through any missing keys, so {} is the identity
                # do it first so that inputs_mapping overrides keys
                for k in set(result.keys()).difference(set(mapping.keys())):
                    kwargs[k] = result[k]
                for k in mapping:
                    if mapping[k] is not None:
                        kwargs[mapping[k]] = result[k]
            elif isinstance(mapping, basestring):
                kwargs[mapping] = input.get_result()
            elif mapping is None:
                # drop Nones
                pass
            else:
                raise ValueError(
                    'Input mapping is neither dict nor str: %s' % mapping)

        mapped_inputs = len(inputs_mapping)
    else:
        mapped_inputs = 0

    for i in range(mapped_inputs, len(self.inputs)):
        result = self.inputs[i].get_result()
        # without a mapping we handle two cases
        # when the result is a dict merge it with a global dict
        if isinstance(result, dict):
            # but do not override
            kwargs.update({k: v for k, v in result.iteritems()
                           if k not in kwargs})
        # otherwise use it as a positional argument
        else:
            args.append(result)

    return args, kwargs
# the path argument overrides drain.PATH, which can otherwise be set via
# the DRAINPATH environment variable
if args.path:
    drain.PATH = os.path.abspath(args.path)
elif drain.PATH is None:
    raise ValueError(
        'Must pass path argument or set DRAINPATH environment variable')

steps = []
for s in args.steps.split(';'):
    if s.endswith('.yaml'):
        steps += serialize.load(s)
    else:
        modulename, fname = s.split('::')
        mod = importlib.import_module(modulename)
        s_attr = getattr(mod, fname)
        # if s is callable, it should return a collection of Steps
        # otherwise assume it is a collection of Steps
        ss = util.make_list(
            s_attr() if hasattr(s_attr, '__call__') else s_attr)
        logging.info('Loaded %s with %s leaf steps' % (s, len(ss)))
        steps += ss

if args.Drakeinput is None and os.path.exists('Drakefile'):
    args.Drakeinput = 'Drakefile'
drakeinput = os.path.abspath(args.Drakeinput) if args.Drakeinput else None

workflow = drake.to_drakefile(steps,
                              preview=args.preview,
                              debug=args.debug,
                              input_drakefile=drakeinput,
                              bindir=os.path.dirname(__file__))

if not args.preview:
    with open(args.drakeoutput, 'w') as drakefile:
        drakefile.write(workflow)
step.OUTPUTDIR = os.path.abspath(args.outputdir)
drain.yaml.configure()

steps = []
for s in args.steps.split(';'):
    if s.endswith('.yaml'):
        steps += drain.yaml.load(s)
    else:
        print(s)
        modulename, fname = s.split('::')
        mod = importlib.import_module(modulename)
        s = getattr(mod, fname)
        # if s is callable, it should return a collection of Steps
        # otherwise assume it is a collection of Steps
        s = util.make_list(s() if hasattr(s, '__call__') else s)
        steps += s

if args.Drakeinput is None and os.path.exists('Drakefile'):
    args.Drakeinput = 'Drakefile'
drakeinput = os.path.abspath(args.Drakeinput) if args.Drakeinput else None

workflow = drake.to_drakefile(steps,
                              preview=args.preview,
                              debug=args.debug,
                              input_drakefile=drakeinput)

if not args.preview:
    with open(args.drakeoutput, 'w') as drakefile:
        drakefile.write(workflow)
else:
    sys.stdout.write(workflow)

drake_args = list(drake_args) if drake_args is not None else []
def dapply(self, fn, pairwise=False, symmetric=True, diagonal=False,
           block=None, **kwargs):
    """
    Apply a function to each step object in the index

    Args:
        fn: function to apply. If a list, each function is applied.
        pairwise: whether to apply the function to pairs of steps
        symmetric, diagonal, block: passed to apply_pairwise when
            pairwise=True
        kwargs: keyword arguments to pass to each function. Arguments with
            a list value are grid searched using util.dict_product.

    Returns:
        a StepFrame or StepSeries
    """
    search_keys = [k for k, v in kwargs.items()
                   if isinstance(v, list) and len(v) > 1]
    functions = util.make_list(fn)
    search = list(product(functions, util.dict_product(kwargs)))

    results = []
    for fn, kw in search:
        if not pairwise:
            r = self.index.to_series().apply(lambda step: fn(step, **kw))
        else:
            r = apply_pairwise(self, fn,
                               symmetric=symmetric,
                               diagonal=diagonal,
                               block=block, **kw)

        name = [] if len(functions) == 1 else [fn.__name__]
        name += util.dict_subset(kw, search_keys).values()

        if isinstance(r, pd.DataFrame):
            columns = pd.MultiIndex.from_tuples(
                [tuple(name + util.make_list(c)) for c in r.columns])
            r.columns = columns
        else:
            r.name = tuple(name)
        results.append(r)

    if len(results) > 1:
        result = pd.concat(results, axis=1)
        # get subset of parameters that were searched over
        column_names = [] if len(functions) == 1 else [None]
        column_names += search_keys
        column_names += [None] * (len(result.columns.names) - len(column_names))
        result.columns.names = column_names
        return StepFrame(result)
    else:
        result = results[0]
        if isinstance(result, pd.DataFrame):
            return StepFrame(result)
        else:
            result.name = functions[0].__name__
            return StepSeries(result)
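# A hypothetical dapply call, followed by a standalone sketch of the grid
# search it performs over list-valued keyword arguments. The StepFrame `df`,
# the metric function `recall_at`, and the parameter `k` are made up for
# illustration.
#
#   result = df.dapply(recall_at, k=[10, 100])
#   # `k` is list-valued, so it is searched: one result column per k value,
#   # with a column index level named 'k'
def _example_grid_search():
    from itertools import product
    kwargs = {'k': [10, 100], 'weighted': [True]}
    keys = list(kwargs)
    # expands into [{'k': 10, 'weighted': True}, {'k': 100, 'weighted': True}],
    # analogous to what util.dict_product presumably does above
    return [dict(zip(keys, values))
            for values in product(*(kwargs[k] for k in keys))]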
args.preview = True

step.OUTPUTDIR = os.path.abspath(args.outputdir)
drain.yaml.configure()

steps = []
for s in args.steps.split(';'):
    if s.endswith('.yaml'):
        steps += drain.yaml.load(s)
    else:
        modulename, fname = s.split('::')
        mod = importlib.import_module(modulename)
        s_attr = getattr(mod, fname)
        # if s is callable, it should return a collection of Steps
        # otherwise assume it is a collection of Steps
        ss = util.make_list(
            s_attr() if hasattr(s_attr, '__call__') else s_attr)
        logging.info('Loaded %s with %s leaf steps' % (s, len(ss)))
        steps += ss

if args.Drakeinput is None and os.path.exists('Drakefile'):
    args.Drakeinput = 'Drakefile'
drakeinput = os.path.abspath(args.Drakeinput) if args.Drakeinput else None

workflow = drake.to_drakefile(steps,
                              preview=args.preview,
                              debug=args.debug,
                              input_drakefile=drakeinput)

if not args.preview:
    with open(args.drakeoutput, 'w') as drakefile:
        logging.info('Writing drake workflow %s' % args.drakeoutput)
        drakefile.write(workflow)
else:
    sys.stdout.write(workflow)