def get_aggregates(self, date, delta):
    """
    Build the list of Aggregate specifications for this aggregation.

    The ``date`` and ``delta`` arguments are accepted but not used by this
    implementation; the same specifications are returned for every call.
    The meaning of each Aggregate argument is defined by the Aggregate
    class, which lives outside this block.

    Returns:
        a list of Aggregate objects, in a fixed order.
    """
    # boolean flag reduced with 'any'
    risk = [
        Aggregate(lambda rows: rows.med_risk_f == 'Y', 'any',
                  'medical_risk', fname=False),
    ]

    # coded columns collected into sets after list_filter_none is applied
    codes = [
        Aggregate('emplymnt_c',
                  lambda employment: set(list_filter_none(employment)),
                  'employment_status', fname=False),
        Aggregate('occptn_c',
                  lambda occupation: set(list_filter_none(occupation)),
                  'occupation', fname=False),
    ]

    # numeric household columns reduced with 'median'
    household = [
        Aggregate(['hsehld_n', 'hse_inc_a'], 'median',
                  ['household_size', 'household_income']),
    ]

    # columns whose cells are themselves iterables: union of all cells
    collections = [
        Aggregate('language',
                  lambda cells: union(set(cell) for cell in cells),
                  fname=False),
        Aggregate('assistance',
                  lambda cells: union(set(cell) for cell in cells),
                  fname=False),
    ]

    clinics = [
        Aggregate('clinicid_i', lambda ids: set(ids), 'clinic', fname=False),
    ]

    return risk + codes + household + collections + clinics
def binarize_set(df, column, values=None):
    """
    Replace a column of collections with one indicator column per value.

    For every value a new column named ``<column>_<name>`` is added to
    ``df`` (spaces in the name are replaced by underscores). A row is True
    when the value is contained in that row's collection, False when it is
    not, and 0 when the row's entry in ``column`` is null. Finally the
    original ``column`` is dropped. ``df`` is modified in place.

    Args:
        df: the DataFrame to modify.
        column: name of a column whose non-null entries support ``in``.
        values: the values to create indicators for. If a dict, its keys
            are the values and its items are the names used in the new
            column names. If None, ``util.union`` of the non-null entries
            is used (presumably the union of all the collections -- confirm
            against util.union).

    Returns:
        None
    """
    present = df[column].dropna()  # avoid nulls: `in` would fail on them

    if values is None:
        values = util.union(present)

    # isinstance (not an exact type check) so dict subclasses such as
    # OrderedDict are also treated as value -> name mappings
    has_labels = isinstance(values, dict)

    for value in values:
        name = str(values[value]) if has_labels else str(value)
        column_name = column + '_' + name.replace(' ', '_')

        # Assignment aligns on the index, so rows dropped above become NaN.
        df[column_name] = present.apply(lambda cell: value in cell)

        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[column_name]: under pandas
        # Copy-on-Write that chained call operates on a temporary and
        # leaves the NaNs in df.
        df[column_name] = df[column_name].fillna(0)

    df.drop(column, axis=1, inplace=True)
def binarize_set(df, column, values=None):
    """
    Replace a column of collections with one indicator column per value.

    For every value a new column named ``<column>_<name>`` is added to
    ``df`` (spaces in the name are replaced by underscores). A row is True
    when the value is contained in that row's collection, False when it is
    not, and 0 when the row's entry in ``column`` is null. Finally the
    original ``column`` is dropped. ``df`` is modified in place.

    Args:
        df: the DataFrame to modify.
        column: name of a column whose non-null entries support ``in``.
        values: the values to create indicators for. If a dict, its keys
            are the values and its items are the names used in the new
            column names. If None, ``util.union`` of the non-null entries
            is used (presumably the union of all the collections -- confirm
            against util.union).

    Returns:
        None
    """
    non_null = df[column].dropna()  # avoid nulls: `in` would fail on them

    if values is None:
        values = util.union(non_null)

    # isinstance (not an exact type check) so dict subclasses such as
    # OrderedDict are also treated as value -> name mappings
    labelled = isinstance(values, dict)

    for value in values:
        name = str(values[value]) if labelled else str(value)
        column_name = column + '_' + name.replace(' ', '_')

        # Assignment aligns on the index, so rows dropped above become NaN.
        df[column_name] = non_null.apply(lambda cell: value in cell)

        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[column_name]: under pandas
        # Copy-on-Write that chained call operates on a temporary and
        # leaves the NaNs in df.
        df[column_name] = df[column_name].fillna(0)

    df.drop(column, axis=1, inplace=True)
def get_aggregates(self, date, delta):
    """
    Build the list of Aggregate specifications for this aggregation.

    The ``date`` and ``delta`` arguments are accepted but not used by this
    implementation; the same specifications are returned for every call.
    The meaning of each Aggregate argument is defined by the Aggregate
    class, which lives outside this block.

    Returns:
        a list of Aggregate objects, in a fixed order.
    """
    # measurements reduced with 'max'
    maxima = [
        Aggregate('length', 'max', fname=False),
        Aggregate('weight', 'max', fname=False),
        Aggregate('head_circumference', 'max', fname=False),
        Aggregate('apgar', 'max', 'apgar_score', fname=False),
    ]

    # coded columns collected into sets
    codes = [
        Aggregate('brth_typ_c', lambda types: set(types),
                  'place_type', fname=False),
        Aggregate('inf_disp_c', lambda dispositions: set(dispositions),
                  'disposition', fname=False),
    ]

    # column whose cells are themselves iterables: union of all cells
    collections = [
        Aggregate('complication',
                  lambda cells: union(set(cell) for cell in cells),
                  fname=False),
    ]

    # 'Y' flags reduced with 'any'
    flags = [
        Aggregate(lambda births: births.apors_f == 'Y', 'any',
                  'apors', fname=False),
        Aggregate(lambda births: births.icu_f == 'Y', 'any',
                  'icu', fname=False),
    ]

    clinics = [
        Aggregate('clinicid_i', lambda ids: set(ids), 'clinic', fname=False),
    ]

    return maxima + codes + collections + flags + clinics
def get_aggregates(self, date, delta):
    """
    Build the list of Aggregate specifications for this aggregation.

    The ``date`` and ``delta`` arguments are accepted but not used by this
    implementation; the same specifications are returned for every call.
    The meaning of each Aggregate argument is defined by the Aggregate
    class, which lives outside this block.

    Returns:
        a list of Aggregate objects, in a fixed order.
    """
    # measurements reduced with 'max'
    maxima = [
        Aggregate('length', 'max', fname=False),
        Aggregate('weight', 'max', fname=False),
        Aggregate('head_circumference', 'max', fname=False),
        Aggregate('apgar', 'max', 'apgar_score', fname=False),
    ]

    # coded columns collected into sets
    codes = [
        Aggregate('brth_typ_c', lambda types: set(types),
                  'place_type', fname=False),
        Aggregate('inf_disp_c', lambda dispositions: set(dispositions),
                  'disposition', fname=False),
    ]

    # column whose cells are themselves iterables: union of all cells
    collections = [
        Aggregate('complication',
                  lambda cells: union(set(cell) for cell in cells),
                  fname=False),
    ]

    # 'Y' flags reduced with 'any'
    flags = [
        Aggregate(lambda births: births.apors_f == 'Y', 'any',
                  'apors', fname=False),
        Aggregate(lambda births: births.icu_f == 'Y', 'any',
                  'icu', fname=False),
    ]

    clinics = [
        Aggregate('clinicid_i', lambda ids: set(ids), 'clinic', fname=False),
    ]

    return maxima + codes + collections + flags + clinics
def argument_names(self):
    """
    Return the distinct names found across ``self.arguments`` as a list.

    Each element of ``self.arguments`` is converted to a set and the sets
    are combined with ``util.union``; the order of the resulting list is
    whatever iteration order that combined result has.
    """
    name_sets = (set(argument) for argument in self.arguments)
    combined = util.union(name_sets)
    return list(combined)
def expand(self, prefix=False, index=True, diff=True, existence=True):
    """
    This function is a member of StepFrame and StepSeries. It is used to
    expand the kwargs of the steps either into the index (index=True) or
    as columns (index=False). By default (diff=True) only the kwargs which
    differ among steps are expanded.

    Note that index objects in pandas must be hashable so any unhashable
    argument values are converted to string representations (using pprint)
    when index=True.

    If "inputs" is an argument those steps' kwargs are also expanded (and
    their inputs recursively). If there are multiple steps with the same
    argument names they are prefixed by their names or if those are not set
    then by their class names. To enable prefixing for all args set
    prefix=True.

    Sometimes the difference between pipelines is that a step exists or it
    doesn't. When diff=True and existence=True, instead of expanding all
    the kwargs for that step, we expand a single column whose name is the
    step name and whose value is a boolean indicating whether the step
    exists in the given tree.

    Args:
        prefix: whether to always use step name prefix for kwarg name.
            Default False, which uses prefixes when necessary, i.e. for
            keywords that are shared by multiple step names.
        index: If True expand args into index. Otherwise expand into
            columns.
        diff: whether to only expand keywords whose values are
            non-constant.
        existence: whether to check for existence of a step in the tree
            instead of a full diff. Only applicable when diff=True. See
            note above.

    Returns: a DataFrame with the arguments of the steps expanded.
    """
    # collect kwargs resulting in a list of {name: kwargs} dicts
    dicts = [step._collect_kwargs(s) for s in self.index]
    # if any of the kwargs are themselves dicts, expand them
    dicts = [{k: util.dict_expand(v) for k, v in s.items()} for s in dicts]

    if diff:
        diff_dicts = [{} for d in dicts]  # the desired list of dicts

        # all names among these steps
        names = util.union([set(d.keys()) for d in dicts])
        for name in names:
            if existence:
                # all dicts for this name (only the steps that have it)
                ndicts = [d[name] for d in dicts if name in d.keys()]
            else:
                # one entry per step; a missing name counts as no kwargs
                ndicts = [d[name] if name in d.keys() else {} for d in dicts]

            ndiffs = util.dict_diff(ndicts)  # diffs for this name

            if sum(map(len, ndiffs)) == 0:  # if they're all the same
                # but not all had the key and existence=True
                if existence and len(ndicts) < len(self):
                    for m, d in zip(diff_dicts, dicts):
                        m[name] = {tuple(): name in d.keys()}
            else:  # if there was a diff
                # ndiffs has one entry per element of ndicts, so advance
                # the iterator only for the steps that contributed one
                diff_iter = iter(ndiffs)
                for m, d in zip(diff_dicts, dicts):
                    if name in d.keys() or not existence:
                        # builtin next() works on Python 2.6+ and 3;
                        # the iterator method .next() is Python 2 only
                        m[name] = next(diff_iter)

        dicts = diff_dicts

    # restructure so name is in the key
    merged_dicts = []
    for dd in dicts:
        merged_dicts.append(
            util.dict_merge(*({
                tuple([name] + list(util.make_tuple(k))): v
                for k, v in d.items()
            } for name, d in dd.items())))

    # prefix_keys are the keys that will keep their prefix
    keys = [list((k[1:] for k in d.keys())) for d in merged_dicts]
    if not prefix:
        # only keys that occur more than once within a single row
        key_count = [Counter(kk) for kk in keys]
        prefix_keys = util.union({k for k in c if c[k] > 1} for c in key_count)
    else:
        prefix_keys = util.union((set(kk) for kk in keys))

    # flatten each tuple key into a '_'-joined column name, dropping the
    # leading step name unless the remainder is in prefix_keys
    merged_dicts = [{
        str.join('_', map(str, k if k[1:] in prefix_keys else k[1:])): v
        for k, v in d.items()
    } for d in merged_dicts]

    expanded = pd.DataFrame(merged_dicts, index=self.index)

    if index:
        columns = list(expanded.columns)
        try:
            expanded.set_index(columns, inplace=True)
        except TypeError:
            # presumably rewrites unhashable values in place so that the
            # retry succeeds (see docstring) -- confirm in _print_unhashable
            _print_unhashable(expanded, columns)
            expanded.set_index(columns, inplace=True)

        # copy self using the first base class of its own class
        df = self.__class__.__bases__[0](self, copy=True)
        df.index = expanded.index
    else:
        df = pd.concat((expanded, self), axis=1)
        # When index=False, the index is still a Step collection
        # NOTE(review): the concat result above is discarded by the next
        # assignment, so only the expanded columns are returned.
        # StepFrame(df) may have been intended -- confirm before changing,
        # since callers may rely on the current result.
        df = StepFrame(expanded)

    return df