def __init__(self, obj, groupby_obj=None, keys=None, axis=0, level=None,
             grouper=None, exclusions=None, selection=None, as_index=True,
             sort=True, group_keys=True, squeeze=False, observed=False,
             mutated=False, grouper_cache=None):
    """Wrap ``obj`` together with a pandas groupby object built on it.

    If ``groupby_obj`` is given it is used as-is, and every grouping
    option passed as ``None`` is read from the attribute of the same name
    on ``groupby_obj``.  Otherwise a new ``DataFrameGroupBy`` (when
    ``obj.ndim == 2``) or ``SeriesGroupBy`` is constructed from the
    grouping options.

    ``grouper_cache``, when truthy, is installed as the ``_cache`` of the
    groupby object's grouper.  ``selection``, when truthy, is applied to
    the groupby object via ``[]``.
    """

    def fill_value(v, key):
        # An explicit (non-None) argument always wins; otherwise fall back
        # to the attribute of the supplied groupby object, if there is one.
        if v is not None or groupby_obj is None:
            return v
        return getattr(groupby_obj, key)

    self.obj = obj
    self.keys = fill_value(keys, 'keys')
    self.axis = fill_value(axis, 'axis')
    self.level = fill_value(level, 'level')
    self.exclusions = fill_value(exclusions, 'exclusions')
    self.selection = selection
    self.as_index = fill_value(as_index, 'as_index')
    self.sort = fill_value(sort, 'sort')
    self.group_keys = fill_value(group_keys, 'group_keys')
    self.squeeze = fill_value(squeeze, 'squeeze')
    self.observed = fill_value(observed, 'observed')
    self.mutated = fill_value(mutated, 'mutated')

    if groupby_obj is None:
        # ``sort`` must be forwarded as well: it is recorded on ``self``
        # above, and leaving it out made ``sort=False`` silently behave
        # like the pandas default.
        groupby_kw = dict(
            keys=keys,
            axis=axis,
            level=level,
            grouper=grouper,
            exclusions=exclusions,
            as_index=as_index,
            sort=sort,
            group_keys=group_keys,
            squeeze=squeeze,
            observed=observed,
            mutated=mutated,
        )
        # NOTE(review): presumably _HAS_SQUEEZE says whether the installed
        # pandas still accepts ``squeeze`` -- confirm at its definition.
        if not _HAS_SQUEEZE:  # pragma: no branch
            groupby_kw.pop('squeeze')
        groupby_cls = DataFrameGroupBy if obj.ndim == 2 else SeriesGroupBy
        self.groupby_obj = groupby_cls(obj, **groupby_kw)
    else:
        self.groupby_obj = groupby_obj

    if grouper_cache:
        self.groupby_obj.grouper._cache = grouper_cache
    if selection:
        self.groupby_obj = self.groupby_obj[selection]

    self.is_frame = isinstance(self.groupby_obj, DataFrameGroupBy)
def __init__(self, groupby):
    """Keep a reference to ``groupby`` and prepare a delegate groupby.

    Stores the groupby itself, its grouper and its underlying object, and
    builds a ``DataFrameGroupBy`` over the first item of that object using
    the same grouper.
    """
    shared_grouper = groupby.grouper
    grouped_obj = groupby.obj

    self.groupby = groupby
    self.grouper = shared_grouper
    self.obj = grouped_obj

    # A groupby over the first frame gives __getattr__ something to
    # delegate to.
    first_item = grouped_obj.items[0]
    self.first = DataFrameGroupBy(
        grouped_obj.ix[first_item], grouper=shared_grouper
    )
def _add_stats_to_summary(self,
                          groupedby: DataFrameGroupBy,
                          fieldname: str,
                          filter_by_ns: bool = False) -> None:
    """Record ``[min, max, median]`` of grouped stats for every namespace.

    For each key of ``self.ns`` the entry ``self.ns[key][fieldname]`` is
    reset to an empty list and then receives three values, in the order
    min, max, median.  A ``0`` is stored for a statistic whenever the
    namespace key is not found.

    Args:
        groupedby: Grouped data to summarise.
        fieldname: Name of the entry written in each namespace summary.
        filter_by_ns: If true, the namespace key selects from
            ``groupedby`` first (``groupedby[key]``) and the statistic is
            computed on that selection.  If false, each statistic is
            computed once on ``groupedby`` and the namespace key selects
            from the result.
    """
    # Median is requested with numeric_only=False, as before.
    reducers = (
        lambda grouped: grouped.min(),
        lambda grouped: grouped.max(),
        lambda grouped: grouped.median(numeric_only=False),
    )

    for summary in self.ns.values():
        summary[fieldname] = []

    if filter_by_ns:
        # One pass per statistic so values are appended in the same
        # order as the statistics are listed above.
        for reduce_stat in reducers:
            for ns_key, summary in self.ns.items():
                if ns_key in groupedby:
                    value = reduce_stat(groupedby[ns_key])
                else:
                    value = 0
                summary[fieldname].append(value)
    else:
        # Compute every statistic before touching the summaries.
        stats = [reduce_stat(groupedby) for reduce_stat in reducers]
        for stat_by_ns in stats:
            for ns_key, summary in self.ns.items():
                value = stat_by_ns[ns_key] if ns_key in stat_by_ns else 0
                summary[fieldname].append(value)
def normalized_param_plots(
    param: str, dataframe_grouped: DataFrameGroupBy, example: bool = True
):
    """
    Plot radius or area normalized plots of param values.

    Only targets mapped to 50 in ``utils.Utils.circle_names_with_diameter``
    are used.  For each such target the ``param`` column is divided by
    ``get_best_value(target_data, param, "radius")``, ``radius`` and
    ``area`` are divided by their own maxima, and rows are labelled
    ``"full"`` when their radius is below half of the maximum radius and
    ``"limited"`` otherwise.  The normalized parameter is scattered against
    normalized radius (upper axes) and normalized area (lower axes), and a
    joint grid of normalized area vs. normalized parameter is drawn from
    all collected targets.

    :param param: Name of the column to normalize and plot.
    :param dataframe_grouped: Grouped data; groups are looked up by target
        name with ``get_group``.
    :param example: If true, stop after the first plotted target.
    """
    param_normalized = f"{param}_norm"
    _, axes = plt.subplots(2, 1, figsize=utils.paper_figsize(0.8))
    target_datas = []
    for target, diameter in utils.Utils.circle_names_with_diameter.items():
        if diameter != 50:
            continue
        target_data = dataframe_grouped.get_group(target).copy()
        target_data[param_normalized] = target_data[param] / get_best_value(
            target_data, param, "radius"
        )

        # The maximum radius is needed for the normalization and for the
        # category threshold; compute it once instead of once per row.
        max_radius = max(target_data["radius"])
        target_data["radius_normalized"] = target_data["radius"] / max_radius
        target_data["area_normalized"] = target_data["area"] / max(
            target_data["area"]
        )

        # categorize based on radius
        half_max_radius = max_radius / 2
        target_data["radius Cat"] = [
            "full" if rad < half_max_radius else "limited"
            for rad in target_data["radius"].values
        ]
        target_data["radius Cat"] = target_data["radius Cat"].astype("category")
        target_datas.append(target_data)

        # Plotting
        for x_column, ax in zip(("radius_normalized", "area_normalized"), axes):
            sns.scatterplot(
                data=target_data,
                x=x_column,
                y=param_normalized,
                hue="radius Cat",
                ax=ax,
            )
        if example:
            break

    for ax in axes:
        ax.legend().remove()

    # NOTE(review): pd.concat raises ValueError on an empty list, i.e. when
    # no target is mapped to 50 -- confirm that cannot happen for callers.
    target_datas_df = pd.concat(target_datas)
    joint_grid = sns.JointGrid(
        data=target_datas_df,
        x="area_normalized",
        y=param_normalized,
    )
    joint_grid.plot(sns.scatterplot, sns.histplot)
def apply(self, func, *args, **kwargs):
    """Apply ``func`` to a groupby of every item and collect into a Panel.

    Each ``(key, df)`` pair from ``self.obj.iteritems()`` is wrapped in a
    ``DataFrameGroupBy`` that uses ``self.grouper``.

    ``func`` may be:

    * the name of a method on ``DataFrameGroupBy`` -- that method is
      called with ``*args, **kwargs``;
    * a callable -- it is handed to the groupby's ``apply`` together with
      ``*args, **kwargs``.

    Returns ``Panel.from_dict`` of the per-item results.
    """
    result = {}
    for key, df in self.obj.iteritems():
        grp = DataFrameGroupBy(df, grouper=self.grouper)
        if callable(func):
            # Previously this case left ``res`` unassigned, which raised
            # UnboundLocalError on the first item.
            # NOTE(review): delegating to grp.apply is the assumed intent.
            res = grp.apply(func, *args, **kwargs)
        else:
            res = getattr(grp, func)(*args, **kwargs)
        result[key] = res
    return Panel.from_dict(result)
def count_neighbors_within_distance_groups(
    grouped_distances: DataFrameGroupBy,
) -> DataFrame:
    """Count the ``distance_ij`` entries of every group.

    :param grouped_distances: A grouped data frame whose groups contain a
        ``distance_ij`` column (per the original description: grouped over
        site-index pairs, subspecies pairs, and bin intervals).
    :return: A pandas ``DataFrame`` with the group keys as columns and the
        per-group count of non-null ``distance_ij`` values in column ``n``.
    """

    def _neighbor_count(group):
        # Series.count() ignores missing values.
        n_pairs = group["distance_ij"].count()
        return pd.to_numeric(arg=n_pairs, downcast="integer")

    counts = grouped_distances.apply(_neighbor_count)
    named_counts = counts.rename("n")
    return named_counts.reset_index()
def apply(self, func, *args, **kwargs):
    """Apply ``func`` to a groupby of every item and box the results.

    Each ``(key, df)`` pair from ``self.obj.items()`` is wrapped in a
    ``DataFrameGroupBy`` that uses ``self.grouper``.

    ``func`` may be:

    * the name of a method on ``DataFrameGroupBy`` -- that method is
      called with ``*args, **kwargs``;
    * a callable -- it is run through the grouper's own ``apply`` and the
      returned keys and data are boxed with ``box_data``.  Extra
      ``*args, **kwargs`` are passed on to the callable after the group.

    Returns ``box_data`` of the ordered per-item results.
    """
    func_is_callable = callable(func)  # collections.Callable is gone in 3.10
    if func_is_callable and (args or kwargs):
        # Bind the extra arguments once; the grouper only passes the group.
        def group_func(group):
            return func(group, *args, **kwargs)
    else:
        group_func = func

    result = OrderedDict()
    for key, df in self.obj.items():
        grp = DataFrameGroupBy(df, grouper=self.grouper)
        if func_is_callable:
            # call the grouper.apply cuz we will box our own data
            keys, data, _mutated = grp.grouper.apply(group_func, df, grp.axis)
            res = box_data(keys, data)
        else:
            res = getattr(grp, func)(*args, **kwargs)
        result[key] = res
    return box_data(result)
def radius_constrained_param(
    dataframe_grouped: DataFrameGroupBy, param: str, target: str
):
    """
    Visualize effect of radius.

    Takes a copy of the ``target`` group, bins ``"Number of Traces"`` into
    5 intervals, labels each row ``"full"`` when its radius is below half
    of the group's maximum radius and ``"limited"`` otherwise, and draws
    an ``lmplot`` of ``param`` against ``radius`` coloured by that label.

    :param dataframe_grouped: Grouped data; the group is looked up with
        ``get_group(target)``.
    :param param: Column plotted on the y axis.
    :param target: Name of the group to plot.
    """
    target_data = dataframe_grouped.get_group(target).copy()
    target_data["Number of Traces Cat"] = pd.cut(target_data["Number of Traces"], 5)

    # The threshold is the same for every row; compute it once rather than
    # re-scanning the whole column per row.
    half_max_radius = max(target_data["radius"]) / 2
    target_data["radius Cat"] = [
        "full" if rad < half_max_radius else "limited"
        for rad in target_data["radius"].values
    ]
    target_data["radius Cat"] = target_data["radius Cat"].astype("category")
    sns.lmplot(data=target_data, x="radius", y=param, hue="radius Cat")
def roc_est_calculator(
        df_groups: DataFrameGroupBy,
        cfg: RocEstCalculatorCfg) -> Tuple[np.ndarray, np.ndarray]:
    """Estimate ROC points by simulating infections group by group.

    The baseline metric of every group is the mean of the column named by
    ``cfg.extract_metric_func``.  Then, ``cfg.epochs_num`` times, each
    group is passed through ``add_rand_infected_simulation`` (with a mask
    of the rows whose ``datetime`` is at or after
    ``cfg.corona_from_date_str``, a fresh seed and ``cfg.corona_sim_cfg``)
    and the metric is recomputed.  For every simulated group the fraction
    of the *other* groups whose baseline metric is at least the simulated
    metric is recorded.

    Args:
        df_groups: Grouped data; each group needs a ``datetime`` column
            and the metric column.
        cfg: Configuration providing ``extract_metric_func``,
            ``corona_from_date_str``, ``rand_seed``, ``epochs_num`` and
            ``corona_sim_cfg``.

    Returns:
        ``(fp_vec, tp_vec)``: the recorded fractions sorted ascending, and
        an equally long vector of evenly spaced values from 0 to 1.

    Raises:
        NotImplementedError: If ``cfg.extract_metric_func`` is not a
            column name.  The callable code path exists but is untested.
    """
    metric = cfg.extract_metric_func
    metric_is_column = isinstance(metric, str)
    if not metric_is_column:
        raise NotImplementedError(
            'Implemented, but not tested. Test before using it and remove this raise'
        )
    # The callable branches stay in place so that dropping the guard above
    # is all that is needed to enable them.
    if metric_is_column:
        base_metric = df_groups[metric].mean()
    else:
        base_metric = df_groups.apply(metric)

    corona_from_datetime = pd.to_datetime(cfg.corona_from_date_str)
    n_other_groups = base_metric.shape[0] - 1
    fp_for_tp_list = []
    cur_rand_seed = cfg.rand_seed
    for _ in range(cfg.epochs_num):
        for i_grp, (_group_id, df_grp) in enumerate(df_groups):
            # Every simulation gets its own seed.
            cur_rand_seed += 1
            df_grp_plus_corona = add_rand_infected_simulation(
                df_grp, df_grp.datetime >= corona_from_datetime,
                cur_rand_seed, cfg.corona_sim_cfg)
            if metric_is_column:
                cur_metric = df_grp_plus_corona[metric].mean()
            else:
                cur_metric = metric(df_grp_plus_corona)

            base_value = base_metric.iloc[i_grp]
            # We added corona, hence expect the metric to increase or be the same
            assert cur_metric >= base_value - 1e-7
            if cfg.corona_sim_cfg.p_infection == 0.:
                # For probability 0 infection, we don't expect any change:
                assert np.isclose(cur_metric, base_value)

            # Baselines of all groups except the current one.
            others = base_metric.drop(index=base_metric.index[i_grp])
            assert others.shape[0] == n_other_groups

            # The following is the minimal FP threshold such that cur_metric will be above threshold
            cur_detected_for_fp_th = np.sum(others >= cur_metric) / n_other_groups
            fp_for_tp_list.append(cur_detected_for_fp_th)

    fp_vec = np.asarray(sorted(fp_for_tp_list))
    tp_vec = np.linspace(0.0, fp_vec.size, fp_vec.size) / fp_vec.size
    return fp_vec, tp_vec
def respect_event_resolution(grouper: DataFrameGroupBy, resolution):
    """Resample each slice so its events occur at the given resolution.

    The input BeliefsDataFrame (see below) should represent beliefs about
    sequential sub-events formed by a single source at a single unique
    belief time.  Extra beliefs are added with nan values.

    :Example:

    >>> df = df.groupby([pd.Grouper(freq="1D", level="event_start"), "belief_time", "source"]).pipe(respect_event_resolution, timedelta(hours=1))

    So don't pass a BeliefsDataFrame directly, but pipe it so that this
    function receives a DataFrameGroupBy object.  Iterating over it yields
    a BeliefsDataFrame slice per unique belief time, source and (in the
    example) day of events; an event is then stated explicitly for (in the
    example) each hour.

    NOTE(review): ``DataFrame.append`` and ``date_range(closed=...)`` were
    removed in pandas 2.0; this function needs an older pandas as written.
    """
    # One (key, slice) pair per group.
    slices = list(iter(grouper))

    # All slices share the same event_start bin, so describe it from the
    # first grouping key and the first group.
    event_bin_size = grouper.keys[0].freq
    first_key, first_slice = slices[0]
    event_bin_start = first_key[0]
    event_bin_end = event_bin_start + event_bin_size

    # Start from an emptied copy of the first slice (copying and emptying
    # the rows is meant to carry the metadata over).
    result = first_slice.copy().iloc[0:0]

    for _key, belief_slice in slices:
        if belief_slice.empty:
            continue
        event_starts = pd.date_range(
            start=event_bin_start,
            end=event_bin_end,
            freq=to_offset(resolution).freqstr,
            closed="left",
            name="event_start",
        )
        resampled = tb_utils.replace_multi_index_level(
            belief_slice,
            level="event_start",
            index=event_starts,
            intersection=True,
        )
        result = result.append(resampled)
    return result
def set_truth(
        grouped: DataFrameGroupBy,
        right_source: "classes.BeliefSource") -> "classes.BeliefsDataFrame":
    """Overwrite the beliefs of each source by those of the given source.

    Terminology-wise, we say the given source is considered to be right,
    so its beliefs contain the truth to be used as a reference for accuracy
    calculations.

    :param grouped: Grouped beliefs; each group key is used as a source.
    :param right_source: Key of the group holding the true observations.
    :return: Concatenation of one copy of the truth group per original
        group, each with its ``source`` index level replaced by that
        group's key.
    :raises KeyError: If ``right_source`` is not one of the group keys.
    """
    # Materialise the groups once; the dict keeps them in iteration order.
    groups_by_source = dict(iter(grouped))

    # Pick out the group that is considered to contain the true observations
    if right_source not in groups_by_source:
        raise KeyError("Source %s not found in BeliefsDataFrame." % right_source)
    truth_group = groups_by_source[right_source]

    # Replace each original group with the truth group, while adding back
    # the source for each original group
    n_truth_rows = len(truth_group)
    relabelled = [
        tb_utils.replace_multi_index_level(
            truth_group, "source", pd.Index([source] * n_truth_rows))
        for source in groups_by_source
    ]
    return pd.concat(relabelled)