Example #1
    def get_aggregates(self, date, delta):

        aggregates = [
            Aggregate(lambda e: e.med_risk_f == 'Y',
                      'any',
                      'medical_risk',
                      fname=False),
            Aggregate('emplymnt_c',
                      lambda e: set(list_filter_none(e)),
                      'employment_status',
                      fname=False),
            Aggregate('occptn_c',
                      lambda o: set(list_filter_none(o)),
                      'occupation',
                      fname=False),
            Aggregate(['hsehld_n', 'hse_inc_a'], 'median',
                      ['household_size', 'household_income']),
            Aggregate('language',
                      lambda ls: union(set(l) for l in ls),
                      fname=False),
            Aggregate('assistance',
                      lambda ls: union(set(l) for l in ls),
                      fname=False),
            Aggregate('clinicid_i', lambda c: set(c), 'clinic', fname=False)
        ]

        return aggregates
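
Several of these snippets call a union helper (bare union here, util.union in the other examples). Its implementation is not shown on this page; the following is a minimal sketch consistent with how it is called, assuming it simply folds an iterable of sets into one set:

def union(sets):
    # minimal sketch, not the project's actual util.union:
    # fold an iterable of sets into a single set
    result = set()
    for s in sets:
        result |= s
    return result

union(set(l) for l in [['en', 'es'], ['en']])  # -> {'en', 'es'}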
Example #2
def binarize_set(df, column, values=None):
    d = df[column].dropna()  # avoid nulls
    if values is None:
        values = util.union(d)
    for value in values:
        name = values[value] if type(values) is dict else str(value)
        column_name = column + '_' + name.replace(' ', '_')
        df[column_name] = d.apply(lambda c: value in c)
        df[column_name].fillna(0, inplace=True)
    df.drop(column, axis=1, inplace=True)
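
A hypothetical call, assuming pandas is imported and the binarize_set above is in scope; the 'language' column and its values are invented for illustration:

import pandas as pd

df = pd.DataFrame({'language': [{'en', 'es'}, {'en'}, None]})
binarize_set(df, 'language', values=['en', 'es'])
# df now has columns language_en and language_es (the null row is intended to
# be filled with 0) and the original set-valued 'language' column is dropped;
# with values=None the distinct values would be discovered via util.union.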
Example #3
    def get_aggregates(self, date, delta):
        
        aggregates = [
            Aggregate('length', 'max', fname=False),
            Aggregate('weight', 'max', fname=False),
            Aggregate('head_circumference', 'max', fname=False),
            Aggregate('apgar', 'max', 'apgar_score', fname=False),
            Aggregate('brth_typ_c', lambda b: set(b), 'place_type', fname=False),
            Aggregate('inf_disp_c', lambda i: set(i), 'disposition', fname=False),
            Aggregate('complication', lambda cs: union(set(c) for c in cs), fname=False),
            Aggregate(lambda b: b.apors_f == 'Y', 'any', 'apors', fname=False),
            Aggregate(lambda b: b.icu_f == 'Y', 'any', 'icu', fname=False),
            Aggregate('clinicid_i', lambda c: set(c), 'clinic', fname=False)
        ]

        return aggregates
Example #4
    def argument_names(self):
        return list(util.union(map(set, self.arguments)))
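
Judging from this call, util.union behaves like a set union over the per-call keyword sets; a standalone equivalent (the arguments data below is made up):

arguments = [{'alpha': 1, 'beta': 2}, {'beta': 3, 'gamma': 4}]
names = list(set().union(*map(set, arguments)))
# names is e.g. ['alpha', 'beta', 'gamma']; set order is not guaranteed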
Example #5
def expand(self, prefix=False, index=True, diff=True, existence=True):
    """
    This function is a member of StepFrame and StepSeries. It is used to
    expand the kwargs of the steps either into the index (index=True) or
    as columns (index=False). By default (diff=True) only the kwargs which
    differ among steps are expanded.

    Note that index objects in pandas must be hashable so any unhashable
    argument values are converted to string representations (using pprint)
    when index=True.

    If "inputs" is an argument those steps' kwargs are also expanded (and
    their inputs recursively). If there are multiple steps with the same
    argument names they are prefixed by their names or if those are not set
    then by their class names. To enable prefixing for all args set
    prefix=True.

    Sometimes the difference between pipelines is that a step exists or it
    doesn't. When diff=True and existence=True, instead of expanding all
    the kwargs for that step, we expand a single column whose name is the
    step name and whose value is a boolean indicating whether the step exists
    in the given tree.

    Args:
        prefix: whether to always use the step name prefix for kwarg names.
            Default False, which uses prefixes only when necessary, i.e. for
            keywords that are shared by multiple steps.
        index: if True, expand args into the index; otherwise expand them
            into columns.
        diff: whether to only expand keywords whose values are non-constant.
        existence: whether to check for existence of a step in the tree
            instead of a full diff. Only applicable when diff=True. See
            note above.

    Returns: a DataFrame with the arguments of the steps expanded.
    """
    # collect kwargs resulting in a list of {name: kwargs} dicts
    dicts = [step._collect_kwargs(s) for s in self.index]
    # if any of the kwargs are themselves dicts, expand them
    dicts = [{k: util.dict_expand(v) for k, v in s.items()} for s in dicts]

    if diff:
        diff_dicts = [{} for d in dicts]  # the desired list of dicts

        names = util.union([set(d.keys())
                            for d in dicts])  # all names among these steps
        for name in names:
            if existence:
                ndicts = [d[name] for d in dicts
                          if name in d.keys()]  # all dicts for this name
            else:
                ndicts = [d[name] if name in d.keys() else {} for d in dicts]

            ndiffs = util.dict_diff(ndicts)  # diffs for this name

            if sum(map(len, ndiffs)) == 0:  # if they're all the same
                # but not all had the key and existence=True
                if existence and len(ndicts) < len(self):
                    for m, d in zip(diff_dicts, dicts):
                        m[name] = {tuple(): name in d.keys()}
            else:  # if there was a diff
                diff_iter = iter(ndiffs)
                for m, d in zip(diff_dicts, dicts):
                    if name in d.keys() or not existence:
                        m[name] = next(diff_iter)  # get the corresponding diff

        dicts = diff_dicts

    # restructure so name is in the key
    merged_dicts = []
    for dd in dicts:
        merged_dicts.append(
            util.dict_merge(*({
                tuple([name] + list(util.make_tuple(k))): v
                for k, v in d.items()
            } for name, d in dd.items())))

    # prefix_keys are the keys that will keep their prefix
    keys = [list((k[1:] for k in d.keys())) for d in merged_dicts]
    if not prefix:
        key_count = [Counter(kk) for kk in keys]
        prefix_keys = util.union({k for k in c if c[k] > 1} for c in key_count)
    else:
        prefix_keys = util.union((set(kk) for kk in keys))

    merged_dicts = [{
        str.join('_', map(str, k if k[1:] in prefix_keys else k[1:])): v
        for k, v in d.items()
    } for d in merged_dicts]

    expanded = pd.DataFrame(merged_dicts, index=self.index)

    if index:
        columns = list(expanded.columns)
        try:
            expanded.set_index(columns, inplace=True)
        except TypeError:
            _print_unhashable(expanded, columns)
            expanded.set_index(columns, inplace=True)

        df = self.__class__.__bases__[0](self, copy=True)
        df.index = expanded.index

    else:
        # when index=False the index is still a Step collection, so keep the
        # StepFrame subclass while appending the expanded kwargs as columns
        df = StepFrame(pd.concat((expanded, self), axis=1))

    return df
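
The method above depends on the project's Step machinery, but the core diff idea from the docstring (keep only the kwargs that vary across steps and lay them out as columns) can be sketched standalone. The name expand_kwargs and the sample dicts are illustrative and not part of the original API:

import pandas as pd

def expand_kwargs(kwargs_per_step):
    # union of all keyword names across the steps
    names = set().union(*(set(d) for d in kwargs_per_step))
    # keep a name only if its value is not constant across the steps;
    # repr() sidesteps unhashable values, as the docstring note suggests
    varying = [n for n in names
               if len({repr(d.get(n)) for d in kwargs_per_step}) > 1]
    return pd.DataFrame([{n: d.get(n) for n in varying} for d in kwargs_per_step])

print(expand_kwargs([
    {'model': 'forest', 'n_estimators': 100, 'seed': 0},
    {'model': 'forest', 'n_estimators': 500, 'seed': 0},
]))
# only n_estimators differs, so it is the single expanded column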