def reduce_result_set(
    results: Iterable[pd.DataFrame],
    reference_groups,
    dimensions: Iterable[Field],
    share_dimensions: Iterable[Field],
):
    """
    Reduces the result sets from individual queries into a single data frame.

    The results are split into chunks of `1 + len(reference_groups)` data frames. Within a chunk, the first data
    frame is used as the base data and the remaining ones are paired with `reference_groups`. The frames of a
    chunk are outer-joined on their index and the joined chunks are then concatenated.

    :param results:
        A list of data frames.
    :param reference_groups:
        A list of groups of references (grouped by interval such as WoW, etc).
    :param dimensions:
        A list of dimensions, used for setting the index on the result data frame.
    :param share_dimensions:
        A list of dimensions from which the totals are used for calculating share operations.
    :return:
        A single data frame holding every chunk, sorted by index with NaN index values placed first.
    """

    def outer_join_on_index(left, right):
        return pd.merge(left, right, how="outer", left_index=True, right_index=True)

    # One chunk for each rolled up dimension. A chunk contains one member plus one for each reference type used.
    chunked_results = chunks(results, 1 + len(reference_groups))

    index_keys = [alias_selector(dimension.alias) for dimension in dimensions]
    totals_keys = [
        alias_selector(dimension.alias)
        for dimension in find_totals_dimensions(dimensions, share_dimensions)
    ]
    index_dtypes = chunked_results[0][0][index_keys].dtypes

    # Reduce each chunk to one data frame per rolled up dimension
    reduced_frames = []
    for position, chunk in enumerate(chunked_results):
        if index_keys:
            chunk = [frame.set_index(index_keys) for frame in chunk]

        base_df = chunk[0]
        reference_dfs = []
        for frame, reference_group in zip(chunk[1:], reference_groups):
            for reference in reference_group:
                reference_dfs.append(_make_reference_data_frame(base_df, frame, reference))

        merged = reduce(outer_join_on_index, [base_df, *reference_dfs])

        # If there are rolled up dimensions in this result set then replace the NaNs for that dimension value
        # with a marker to indicate totals.
        # The data frames will be ordered so that the first chunk will contain the data without any rolled up
        # dimensions, then followed by the chunks with them, ordered by the last rollup dimension first.
        if totals_keys[:position]:
            merged = _replace_nans_for_totals_values(merged, index_dtypes)

        reduced_frames.append(merged)

    combined = pd.concat(reduced_frames, sort=False)
    return combined.sort_index(na_position="first")
def _get_sq_field_for_blender_field(field, queries, field_maps, reference=None):
    """
    Resolve a blender field to the expression that should be selected for it.

    Each field map is searched, in order, for the unmodified version of `field`. The first non-None query whose
    field map contains it supplies the sub-query column, and `field` is re-targeted at that column. When no
    field map contains the field, a recursive copy of the innermost definition is returned instead.

    :param field:
        The blender field to resolve.
    :param queries:
        The sub-queries, aligned with `field_maps`. Entries may be None, in which case they are skipped.
    :param field_maps:
        One mapping per sub-query from unmodified blender field to the field mapped for that sub-query.
    :param reference:
        Optional reference used when deriving the aliases.
    :return:
        The resolved term, aliased with the selector alias of `field` (and `reference`).
    """
    base_field = find_field_in_modified_field(field)
    blender_alias = alias_selector(reference_type_alias(field, reference))

    # search for the field in each field map to determine which subquery it will be in
    for subquery, mapping in zip(queries, field_maps):
        if subquery is not None and base_field in mapping:
            subquery_alias = alias_selector(reference_type_alias(mapping[base_field], reference))
            # case #1 modified fields, ex. day(timestamp) or rollup(dimension)
            return field.for_(subquery[subquery_alias]).as_(blender_alias)

    # Need to copy the metrics if there are references so that the `get_sql` monkey patch does not conflict.
    # Given some of them might have nested metrics themselves, the clone process is performed recursively.
    innermost = field.definition
    while isinstance(innermost, Field):
        innermost = innermost.definition

    # case #2: complex blender fields
    return _deepcopy_recursive(innermost).as_(blender_alias)
def _apply_cumulative_for_reference_delta_percent(self, data_frame, reference):
    """
    Compute the delta percent of a reference for a cumulative operation.

    The cumulative operation cannot simply be applied to delta percent values. Instead the operation is applied
    to the reference values, the result is subtracted from the already-operated base values, and the delta
    percent is recalculated from those two series.

    :param data_frame:
        The data frame holding the reference column for `self.metric` and the column for `self.alias`.
    :param reference:
        The reference for which the delta percent is calculated.
    :return:
        The result of `calculate_delta_percent` for the operated reference values and the recalculated delta.
    """
    # apply the operation on the original reference values
    reference_key = alias_selector(reference_type_alias(self.metric, reference))
    operated_reference = self._apply_cumulative(data_frame, reference_key)

    # the base values already have the operation applied; they live under this operation's own alias
    operated_base = data_frame[alias_selector(self.alias)]

    # recalculate the delta, then the delta percent, from the operated values
    delta = operated_base.subtract(operated_reference, fill_value=0)
    return calculate_delta_percent(operated_reference, delta)
def map_hyperlink_templates(df: pd.DataFrame, dimensions: List[Field]) -> Dict[str, str]:
    """
    Creates a mapping for each dimension to its hyperlink template if it is possible to create the hyperlink
    template for it.

    The hyperlink template is a URL-like string containing curly braces enclosing dimension keys: `{dimension}`.
    While rendering this widget, the dimension key placeholders need to be replaced with the dimension values
    for that row.

    :param df:
        The result data set that is being transformed. The data frame SHOULD be pivoted/transposed if that step
        is required, before calling this function, in order to prevent the template from being included for the
        dimension if one of the required dimensions is pivoted.
    :param dimensions:
        The list of dimensions included in the query that created the result data set df.
    :return:
        A dict with the dimension key as the key and the hyperlink template as the value. A template is only
        included when every dimension it references is present in the index of `df`; a template without any
        placeholders has no requirements and is therefore included.
    """
    hyperlink_templates = {}
    pattern = re.compile(r"{[^{}]+}")
    available_parameters = set(df.index.names)

    for dimension in dimensions:
        hyperlink_template = dimension.hyperlink_template
        if hyperlink_template is None:
            continue

        # Strip the surrounding braces from each placeholder and convert it to the data frame key format.
        required_hyperlink_parameters = [
            alias_selector(argument[1:-1])
            for argument in pattern.findall(hyperlink_template)
        ]

        # Check that all of the required dimensions are in the result data set. Only include the hyperlink
        # template in the return value of this function if all are present.
        # NOTE: this must be a set difference. An intersection would keep the template as soon as a single
        # required dimension is available, leaving the other placeholders impossible to fill.
        unavailable_hyperlink_parameters = set(required_hyperlink_parameters) - available_parameters
        if unavailable_hyperlink_parameters:
            continue

        # replace the dimension keys with the formatted values. This will come in handy later when replacing
        # the actual values
        hyperlink_template = hyperlink_template.format(
            **{
                alias_for_alias_selector(argument): "{" + argument + "}"
                for argument in required_hyperlink_parameters
            }
        )

        f_dimension_alias = alias_selector(dimension.alias)
        hyperlink_templates[f_dimension_alias] = hyperlink_template

    return hyperlink_templates
def hide_aliases(
    self,
    fields: List[HideField],
) -> FrozenSet[str]:
    """
    Collect the selector aliases that should be hidden.

    The result combines the entries of `self.hide` (each either an alias string or an object with an `alias`
    attribute) with the aliases of the given fields that have `fetch_only` set.

    :param fields:
        The fields to inspect for the `fetch_only` flag.
    :return:
        The collected aliases, each passed through `alias_selector`.
    """
    hidden = set()
    for item in self.hide:
        raw_alias = item if isinstance(item, str) else item.alias
        hidden.add(alias_selector(raw_alias))

    hidden.update(alias_selector(field.alias) for field in fields if field.fetch_only)
    return hidden
def fetch_data(
    database: Database,
    queries: List[Type[QueryBuilder]],
    dimensions: Iterable[Field],
    share_dimensions: Iterable[Field] = (),
    reference_groups=(),
) -> Tuple[int, pd.DataFrame]:
    """
    Execute the queries and reduce their results into a single data frame.

    :param database:
        The database whose `fetch_dataframes` method executes the queries.
    :param queries:
        The queries to execute. Each one is converted to a string before execution.
    :param dimensions:
        The dimensions of the query. Dimensions with the date data type are parsed as dates.
    :param share_dimensions:
        Passed through to `reduce_result_set`.
    :param reference_groups:
        Passed through to `reduce_result_set`.
    :return:
        A tuple of the largest row count among the individual results (0 when there are none) and the
        reduced data frame.
    """
    sql_statements = [str(query) for query in queries]

    # Indicate which dimensions need to be parsed as date types: the dimension alias is the key and
    # PANDAS_TO_DATETIME_FORMAT is the value.
    unmodified_dimensions = (find_field_in_modified_field(dimension) for dimension in dimensions)
    date_columns = {
        alias_selector(unmodified.alias): PANDAS_TO_DATETIME_FORMAT
        for unmodified in unmodified_dimensions
        if unmodified.data_type == DataType.date
    }

    results = database.fetch_dataframes(*sql_statements, parse_dates=date_columns)
    max_rows_returned = max(map(len, results), default=0)

    log_details = {'row_count': max_rows_returned, 'database': str(database)}
    logger.info('max_rows_returned', extra=log_details)

    reduced = reduce_result_set(results, reference_groups, dimensions, share_dimensions)
    return max_rows_returned, reduced
def transform(self, data_frame, slicer, dimensions, references):
    """
    Plot the series of this widget with matplotlib, one sub-plot per item in `self.items`.

    Every series gets the next colour of a shared colour cycle; within a series, the base metric and each
    reference are drawn with successive line styles.

    :param data_frame:
        The data frame containing a column for every metric (and reference) that is plotted.
    :param slicer:
        Not used by this method.
    :param dimensions:
        Not used by this method.
    :param references:
        A list of references to plot in addition to the base metric of each series.
    :return:
        The matplotlib axes, always as an iterable.
    """
    import matplotlib.pyplot as plt

    data_frame = data_frame.copy()

    axis_count = len(self.items)
    fig, plt_axes = plt.subplots(axis_count, sharex='row', figsize=(14, 5 * axis_count))
    fig.suptitle(self.title)

    # With a single sub-plot, matplotlib returns the axis itself rather than a collection.
    if not hasattr(plt_axes, '__iter__'):
        plt_axes = (plt_axes, )

    color_cycle = itertools.cycle('bgrcmyk')
    for axis, plt_axis in zip(self.items, plt_axes):
        for series in axis:
            color = next(color_cycle)
            # The line styles restart for every series so the base metric always uses the solid line.
            linestyle_cycle = itertools.cycle(['-', '--', '-.', ':'])

            for reference in [None] + references:
                metric = series.metric
                column_key = utils.alias_selector(reference_alias(metric, reference))
                column_label = reference_label(metric, reference)

                draw = self.get_plot_func_for_series_type(data_frame[column_key], column_label, series)
                drawn_axis = draw(
                    ax=plt_axis,
                    label=axis.label,
                    color=color,
                    stacked=series.stacking is not None,
                    linestyle=next(linestyle_cycle),
                )
                drawn_axis.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    return plt_axes
def apply(self, data_frame, reference):
    """
    Apply the share operation to the data frame.

    :param data_frame:
        The data frame containing the metric column.
    :param reference:
        The reference to apply the operation for, or None for the base metric.
    :return:
        The result of `_apply_share_for_reference_delta` when the reference is a delta reference, otherwise
        the result of `_apply_share` for the metric's (reference) column.
    """
    if not reference or not reference.delta:
        metric_key = alias_selector(reference_alias(self.metric, reference))
        return self._apply_share(data_frame, metric_key)

    return self._apply_share_for_reference_delta(data_frame, reference)
def test_apply_to_two_dims_over_second(self):
    """Share of votes over political_party, applied to the `dimx2_date_str_totals_df` fixture."""
    votes = mock_dataset.fields.votes
    operation = Share(votes, over=mock_dataset.fields.political_party)

    actual = operation.apply(dimx2_date_str_totals_df, None)

    expected_values = [
        49.79, 7.07, 43.12, 100.0,
        49.78, 50.21, 100.0,
        48.83, 51.16, 100.0,
        55.42, 44.57, 100.0,
        60.39, 39.60, 100.0,
        26.60, 73.39, 100.0,
    ]
    expected = pd.Series(
        expected_values,
        name=alias_selector(votes.alias),
        index=dimx2_date_str_totals_df.index,
    )
    pandas.testing.assert_series_equal(expected, actual, rtol=0.5e-3)
def test_apply_to_zero_dims(self):
    """Share of votes without an `over` dimension, applied to the `dimx0_metricx1_df` fixture."""
    votes = mock_dataset.fields.votes

    actual = Share(votes).apply(dimx0_metricx1_df, None)

    expected = pd.Series([100.], name=alias_selector(votes.alias))
    pandas.testing.assert_series_equal(expected, actual)
def apply(self, data_frame, reference):
    """
    Apply the cumulative operation to the data frame.

    :param data_frame:
        The data frame containing the metric column.
    :param reference:
        The reference to apply the operation for, or None for the base metric.
    :return:
        The result of `_apply_cumulative_for_reference_delta_percent` when the reference is a delta percent
        reference, otherwise the result of `_apply_cumulative` for the metric's (reference) column.
    """
    if not reference or not reference.delta_percent:
        metric_key = alias_selector(reference_alias(self.metric, reference))
        return self._apply_cumulative(data_frame, metric_key)

    return self._apply_cumulative_for_reference_delta_percent(data_frame, reference)
def apply_reference_filters(df: pd.DataFrame, reference: Reference) -> pd.DataFrame:
    """
    Filter the rows of the data frame with the filters attached to the reference.

    Filters are applied one after another. A filter whose reference column is not present in the data frame is
    skipped.

    :param df:
        The data frame to filter.
    :param reference:
        The reference whose `filters` are applied.
    :return:
        The data frame restricted to the rows that satisfy every applicable filter.
    """
    for reference_filter in reference.filters:
        column_key = alias_selector(reference_alias(reference_filter.metric, reference))
        if column_key not in df:
            continue

        row_mask = ComparisonOperator.eval(df[column_key], reference_filter.operator, reference_filter.value)
        df = df.loc[row_mask]

    return df
def test_hide_data_frame_indexes_hides_found_aliases(self):
    """Aliases found in the index or the columns are removed; an unknown alias is ignored."""
    widget = Widget()
    kept_columns = [alias_selector('wins'), alias_selector('votes')]
    base_df = dimx2_date_str_df.copy()[kept_columns]

    aliases_to_hide = [
        alias_selector(mock_dataset.fields.political_party.alias),
        alias_selector(mock_dataset.fields.votes.alias),
        alias_selector('unknown'),
    ]
    actual = base_df.copy()
    widget.hide_data_frame_indexes(actual, aliases_to_hide)

    expected = base_df.copy()
    expected.reset_index('$political_party', inplace=True, drop=True)
    del expected['$votes']

    pd.testing.assert_frame_equal(expected, actual)
def test_apply_to_one_dim_over_none(self):
    """Share of votes without an `over` dimension, applied to the `dimx1_str_df` fixture."""
    votes = mock_dataset.fields.votes

    actual = Share(votes).apply(dimx1_str_df, None)

    expected = pd.Series(
        [100.] * 3,
        name=alias_selector(votes.alias),
        index=dimx1_str_df.index,
    )
    pandas.testing.assert_series_equal(expected, actual, rtol=0.5e-3)
def test_apply_to_two_dims_over_first(self):
    """Share of votes over timestamp, applied to the `dimx2_date_str_totalsx2_df` fixture."""
    votes = mock_dataset.fields.votes
    operation = Share(votes, over=mock_dataset.fields.timestamp)

    actual = operation.apply(dimx2_date_str_totalsx2_df, None)

    # The expected share is every value relative to the last row of the fixture.
    votes_series = dimx2_date_str_totalsx2_df[alias_selector(votes.alias)]
    expected = 100 * votes_series / votes_series.iloc[-1]

    pandas.testing.assert_series_equal(expected, actual, rtol=0.5e-3)
def hide_data_frame_indexes(data_frame, dimensions_to_hide):
    """
    Drop, in place, the index levels of the data frame that belong to the given dimensions.

    :param data_frame:
        The data frame to modify. Its index is changed in place; nothing is returned.
    :param dimensions_to_hide:
        The dimensions whose index levels should be dropped. Dimensions whose alias is not among the index
        names are ignored.
    """
    index = data_frame.index
    if isinstance(index, pd.MultiIndex):
        index_names = index.names
    else:
        index_names = [index.name]

    for dimension in dimensions_to_hide:
        dimension_alias = alias_selector(dimension.alias)
        if dimension_alias not in index_names:
            continue

        data_frame.reset_index(level=dimension_alias, drop=True, inplace=True)
def fetch(self, hint=None, force_include=()) -> List[str]:
    """
    Fetch the choices for the first dimension of this query.

    The query is ordered by the dimension, NULL dimension values are filtered out and, when `force_include` is
    given, the matching values are ordered to the top of the result.

    :param hint:
        For database vendors that support it, add a query hint to collect analytics on the queries triggered by
        fireant.
    :param force_include:
        A list of dimension values to include in the result set. This can be used to avoid having necessary
        results cut off due to the pagination. These results will be returned at the head of the results.
    :return:
        The value returned by `self._transform_for_return` for the choices and the maximum number of rows
        returned by the query.
    """
    # Only the first query produced by add_hints is used.
    query = add_hints(self.sql, hint)[0]

    dimension = self.dimensions[0]
    alias_definition = dimension.definition.as_(
        alias_selector(dimension.alias))
    dimension_definition = dimension.definition

    # When a hint table is configured, point both definitions at it instead of their own table.
    if self.hint_table:
        alias_definition = alias_definition.replace_table(
            alias_definition.table, self.hint_table)
        dimension_definition = dimension.definition.replace_table(
            dimension_definition.table, self.hint_table)

    if force_include:
        include = self.dataset.database.to_char(dimension_definition).isin(
            [str(x) for x in force_include])

        # Ensure that these values are included
        query = query.orderby(include, order=Order.desc)

    # Filter out NULL values from choices
    query = query.where(dimension_definition.notnull())

    # Order by the dimension definition that the choices are for
    query = query.orderby(alias_definition)

    max_rows_returned, data = fetch_data(self.dataset.database, [query],
                                         self.dimensions)

    if len(data.index.names) > 1:
        # With more than one index level, the second level is taken as the display values.
        display_alias = data.index.names[1]
        data.reset_index(display_alias, inplace=True)
        choices = data[display_alias]
    else:
        # With a single index level, the index values double as the display values.
        data["display"] = data.index.tolist()
        choices = data["display"]

    dimension_display = self.dimensions[-1]
    # Fall back to the raw value when display_value returns a falsy result.
    choices = choices.map(
        lambda raw: display_value(raw, dimension_display) or raw)
    return self._transform_for_return(choices,
                                      max_rows_returned=max_rows_returned)
def apply(self, data_frame, reference):
    """
    Apply the rolling mean to the column of this operation's single argument.

    :param data_frame:
        The data frame containing the argument's (reference) column.
    :param reference:
        The reference to apply the operation for, or None for the base metric.
    :return:
        The rolling mean of the column. With a multi-index data frame the rolling mean is computed separately
        for each group of the levels returned by `_group_levels`.
    """
    (arg, ) = self.args
    column_key = alias_selector(reference_alias(arg, reference))

    if not isinstance(data_frame.index, pd.MultiIndex):
        return self.rolling_mean(data_frame[column_key])

    group_levels = self._group_levels(data_frame.index)
    grouped = data_frame[column_key].groupby(level=group_levels)
    return grouped.apply(self.rolling_mean)
def test_apply_to_one_dim_over_first(self):
    """Share of votes over political_party, applied to the `dimx1_str_totals_df` fixture."""
    votes = mock_dataset.fields.votes
    operation = Share(votes, over=mock_dataset.fields.political_party)

    actual = operation.apply(dimx1_str_totals_df, None)

    expected = pd.Series(
        [48.8487, 0.9638, 50.1873, 100.0],
        name=alias_selector(votes.alias),
        index=dimx1_str_totals_df.index,
    )
    pandas.testing.assert_series_equal(expected, actual, rtol=0.5e-3)
def apply_cumulative(self, data_frame, reference):
    """
    Apply the cumulative mean to the column of this operation's first argument.

    :param data_frame:
        The data frame containing the argument's (reference) column.
    :param reference:
        The reference to apply the operation for, or None for the base metric.
    :return:
        The cumulative mean of the column. With a non-empty multi-index data frame it is computed separately
        for each group of the levels returned by `_group_levels`.
    """
    column_key = alias_selector(reference_alias(self.args[0], reference))

    # Grouping only applies to a non-empty multi-index frame; everything else is handled in one pass.
    if not isinstance(data_frame.index, pd.MultiIndex) or data_frame.empty:
        return self.cummean(data_frame[column_key])

    group_levels = self._group_levels(data_frame.index)
    grouped = data_frame[column_key].groupby(level=group_levels)
    return grouped.apply(self.cummean)
def apply_cumulative_for_delta_percent(self, data_frame, reference):
    """
    Compute the delta percent of a reference for a cumulative operation.

    When a delta percent reference is combined with a cumulative operation, the delta percent has to be
    calculated from the operated base values and the operated reference values; applying the operation to the
    delta percent values directly does not give the correct result.

    The data frame only carries the delta percent values of the reference, so the original reference values
    are first recalculated from them. NOTE: this overwrites the reference column of `data_frame` in place.

    :param data_frame:
        The data frame holding the base metric column, the reference delta percent column and the column of
        this operation.
    :param reference:
        The delta percent reference.
    :return:
        The result of `calculate_delta_percent` for the operated reference values and the recalculated delta.
    """
    metric = self.args[0]

    # the base values on which this reference is based
    base_values = data_frame[alias_selector(metric.alias)]

    # the delta percent values of the reference, as a fraction
    reference_key = alias_selector(reference_alias(metric, reference))
    delta_percent_fraction = data_frame[reference_key] / 100

    # restore the original reference values from the delta percent and store them in the reference column
    data_frame[reference_key] = base_values / (delta_percent_fraction + 1)

    # apply the operation on the restored reference values
    operated_reference = self.apply_cumulative(data_frame, reference)

    # the base values already have the operation applied; they live under this operation's own alias
    operated_base = data_frame[alias_selector(self.alias)]

    # recalculate the delta, then the delta percent, from the operated values
    delta = operated_base.subtract(operated_reference, fill_value=0)
    return calculate_delta_percent(operated_reference, delta)
def transform(self, data_frame, slicer, dimensions, references):
    """
    Build a labelled, pivoted and formatted copy of the data frame for this widget.

    :param data_frame:
        The data frame containing a column for every item (and reference) of this widget.
    :param slicer:
        Not used by this method.
    :param dimensions:
        The dimensions of the query. Their aliases define the order of the index levels and their labels
        (or aliases) become the index names.
    :param references:
        A list of references. Every item is included once without a reference and once per reference.
    :return:
        The data frame returned by `add_formatting`, with missing values replaced by `formats.BLANK_VALUE`.
    """
    result = data_frame.copy()

    reference_options = [None] + references
    items = [
        ReferenceItem(item, reference) if reference is not None else item
        for item in self.items
        for reference in reference_options
    ]

    if isinstance(data_frame.index, pd.MultiIndex):
        result = result.reorder_levels([alias_selector(dimension.alias) for dimension in dimensions])

    # Select only the columns of the items, in item order.
    item_keys = [alias_selector(item.alias) for item in items]
    result = result[item_keys]

    if dimensions:
        result.index.names = [dimension.label or dimension.alias for dimension in dimensions]

    item_labels = [item.label for item in items]
    result.columns = pd.Index(item_labels, name="Metrics")

    pivot_dimensions = [dimension.label or dimension.alias for dimension in self.pivot]
    pivot_df = self.pivot_data_frame(result, pivot_dimensions, self.transpose)

    formatted = self.add_formatting(dimensions, items, pivot_df)
    return formatted.fillna(value=formats.BLANK_VALUE)
def _render_pie_series(self, metric: Field, reference: Reference, data_frame: pd.DataFrame,
                       dimension_fields: List[Field]) -> dict:
    """
    Render the configuration of a pie series for a metric.

    :param metric:
        The metric whose column of the data frame provides the values of the pie.
    :param reference:
        The reference used for the series name and the tooltip prefix/suffix.
    :param data_frame:
        The data frame containing the metric column. When `self.split_dimension` is set, the index level of
        that dimension is dropped before rendering.
    :param dimension_fields:
        The dimensions of the data frame index, used to format the name of each data point.
    :return:
        A dict with the series name, type, data points and tooltip configuration.
    """
    metric_alias = utils.alias_selector(metric.alias)

    if self.split_dimension:
        dimension_fields = [
            dimension for dimension in dimension_fields
            if dimension != self.split_dimension
        ]
        data_frame = data_frame.reset_index(alias_selector(self.split_dimension.alias), drop=True)

    data = []
    # `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
    for dimension_values, y in data_frame[metric_alias].items():
        dimension_values = utils.wrap_list(dimension_values)
        name = self._format_dimension_values(dimension_fields, dimension_values)

        # Fall back to the metric label when no name could be formatted from the dimension values.
        data.append({
            "name": name or metric.label,
            "y": formats.raw_value(y, metric)
        })

    return {
        "name": reference_label(metric, reference),
        "type": "pie",
        "data": data,
        "tooltip": {
            "pointFormat": '<span style="color:{point.color}">\u25CF</span> {series.name}: '
                           "<b>{point.y} ({point.percentage:.1f}%)</b><br/>",
            "valueDecimals": metric.precision,
            "valuePrefix": reference_prefix(metric, reference),
            "valueSuffix": reference_suffix(metric, reference),
        },
    }
def _get_sq_field_for_blender_field(field, reference=None):
    """
    Resolve a blender field to the expression that should be selected for it.

    Each field map is searched, in order, for the unmodified version of `field`. The first field map that
    contains it supplies the sub-query column, and `field` is re-targeted at that column. When no field map
    contains the field, a deep copy of its definition is returned instead.

    `queries` and `field_maps` are not parameters; they are read from the enclosing scope.

    :param field:
        The blender field to resolve.
    :param reference:
        Optional reference used when deriving the aliases.
    :return:
        The resolved term, aliased with the selector alias of `field` (and `reference`).
    """
    base_field = find_field_in_modified_field(field)
    blender_alias = alias_selector(reference_alias(field, reference))

    # search for the field in each field map to determine which subquery it will be in
    for subquery, mapping in zip(queries, field_maps):
        if base_field in mapping:
            subquery_alias = alias_selector(reference_alias(mapping[base_field], reference))
            # case #1 modified fields, ex. day(timestamp) or rollup(dimension)
            return field.for_(subquery[subquery_alias]).as_(blender_alias)

    # Need to copy the metrics if there are references so that the `get_sql` monkey patch does not conflict
    copied_definition = copy.deepcopy(field.definition)

    # case #2: complex blender fields
    return copied_definition.as_(blender_alias)
def _apply_share_for_reference_delta(self, data_frame, reference):
    """
    Compute the delta (or delta percent) of a reference for the share operation.

    The share is applied to the reference values and subtracted from the already-operated base values. For a
    delta percent reference, the delta is additionally converted with `calculate_delta_percent`.

    :param data_frame:
        The data frame holding the reference column for `self.metric` and the column for `self.alias`.
    :param reference:
        The delta reference.
    :return:
        The delta between the operated base values and the operated reference values, or the delta percent
        when `reference.delta_percent` is set.
    """
    # apply the operation on the original reference values
    reference_key = alias_selector(reference_type_alias(self.metric, reference))
    operated_reference = self._apply_share(data_frame, reference_key)

    # the base values already have the operation applied; they live under this operation's own alias
    operated_base = data_frame[alias_selector(self.alias)]

    # recalculate the delta using the values on which the operation is already performed
    delta = operated_base.subtract(operated_reference, fill_value=0)

    if not reference.delta_percent:
        return delta

    # recalculate the delta percent
    return calculate_delta_percent(operated_reference, delta)
def test_apply_to_two_dims_over_none(self):
    """Share of votes without an `over` dimension, applied to the `dimx2_date_str_df` fixture."""
    share = Share(mock_dataset.fields.votes)
    result = share.apply(dimx2_date_str_df, None)

    f_metric_key = alias_selector(mock_dataset.fields.votes.alias)

    expected = pd.Series([100.] * 13,
                         name=f_metric_key,
                         index=dimx2_date_str_df.index)
    # `check_less_precise` is deprecated in pandas and no longer accepted by pandas 2.x; use the same relative
    # tolerance as the other share tests instead.
    pandas.testing.assert_series_equal(expected, result, rtol=0.5e-3)
def apply(self, data_frame, reference):
    """
    Calculate the share of the metric as a percentage of its totals over a dimension.

    :param data_frame:
        The data frame containing the metric's (reference) column. When an `over` dimension is used, the
        totals are read from the rows labelled with the totals marker.
    :param reference:
        The reference to apply the operation for, or None for the base metric.
    :return:
        The share values, or `np.nan` when the data frame has a single-level index and the totals are zero.
    """
    metric, over = self.args

    f_metric_alias = alias_selector(reference_alias(metric, reference))

    if over is None:
        # Without an `over` dimension, every value is divided by itself.
        df = data_frame[f_metric_alias]
        return 100 * df / df

    if not isinstance(data_frame.index, pd.MultiIndex):
        # Single index level: the totals are the single value in the row labelled with the totals marker.
        marker = get_totals_marker_for_dtype(data_frame.index.dtype)
        totals = data_frame.loc[marker, f_metric_alias]
        if totals == 0:
            return np.nan
        return 100 * data_frame[f_metric_alias] / totals

    f_over_alias = alias_selector(over.alias)
    idx = data_frame.index.names.index(f_over_alias)
    # The `over` level and every index level after it.
    group_levels = data_frame.index.names[idx:]
    over_dim_value = get_totals_marker_for_dtype(
        data_frame.index.levels[idx].dtype)
    # Selects every value of the levels before the `over` level and only the totals marker on the `over` level.
    totals_alias = (slice(None), ) * idx + (slice(over_dim_value,
                                                  over_dim_value), )

    totals = reduce_data_frame_levels(
        data_frame.loc[totals_alias, f_metric_alias], group_levels)

    def apply_totals(group_df):
        # `totals` is either a single value or a series.
        if not isinstance(totals, pd.Series):
            return 100 * group_df / totals

        # Drop the index levels of the group that `totals` does not have, so the two can be aligned.
        n_index_levels = len(totals.index.names)
        extra_level_names = group_df.index.names[n_index_levels:]
        group_df = group_df.reset_index(extra_level_names, drop=True)
        share = 100 * group_df / totals[group_df.index]
        return pd.Series(share.values, index=group_df.index)

    # Restore the original level order and sort after the group-wise calculation.
    return (data_frame[f_metric_alias].groupby(
        level=group_levels).apply(apply_totals).reorder_levels(
            order=data_frame.index.names).sort_index())
def test_apply_to_two_dims_over_second_with_one_row_per_group(self):
    """Share of votes over political_party on rows 0, 3, 4 and 6 of `dimx2_date_str_totals_df`."""
    votes = mock_dataset.fields.votes
    selected_rows = dimx2_date_str_totals_df.iloc[[0, 3, 4, 6]]
    operation = Share(votes, over=mock_dataset.fields.political_party)

    actual = operation.apply(selected_rows, None)

    expected = pd.Series(
        [49.79, 100.0, 49.78, 100.0],
        name=alias_selector(votes.alias),
        index=selected_rows.index,
    )
    pandas.testing.assert_series_equal(expected, actual, rtol=0.5e-3)
def add_formatting(
    self,
    dimensions: List[Field],
    items: List[Field],
    pivot_df: pd.DataFrame,
    use_raw_values: bool,
) -> pd.DataFrame:
    """
    Format the values of the (pivoted) data frame for display.

    :param dimensions:
        The dimensions of the query.
    :param items:
        The fields whose values are contained in the data frame. Each field determines how its own values are
        formatted.
    :param pivot_df:
        The data frame to format. It is copied; the original is not modified.
    :param use_raw_values:
        Passed to `formats.display_value` as `use_raw_value`.
    :return:
        A copy of `pivot_df` with its values formatted by `formats.display_value`.
    """
    format_df = pivot_df.copy()

    def _get_field_display(item):
        return partial(
            formats.display_value,
            field=item,
            nan_value="",
            null_value="",
            use_raw_value=use_raw_values,
        )

    # In these cases the items are the rows of the data frame, so format row by row.
    if self.transpose or len(dimensions) == len(self.pivot) > 0:
        for item in items:
            field_display = _get_field_display(item)
            # Each item formats its own row; using the first item's alias here would leave every other row
            # unformatted.
            alias = alias_selector(item.alias)
            format_df.loc[alias] = format_df.loc[alias].apply(field_display)

        return format_df

    # With a pivot and a single item, every cell of the data frame belongs to that item.
    if self.pivot and len(items) == 1:
        field_display = _get_field_display(items[0])
        return format_df.applymap(field_display)

    for item in items:
        key = alias_selector(item.alias)
        field_display = _get_field_display(item)
        # Selecting by key may give a single column (Series) or several columns (DataFrame).
        selected = format_df[key]
        format_df[key] = (
            selected.apply(field_display)
            if isinstance(selected, pd.Series)
            else selected.applymap(field_display)
        )

    return format_df
def fetch(self, hint=None) -> Iterable[Dict]:
    """
    Fetch the data for this query and transform it into the widgets.

    :param hint:
        A query hint label used with database vendors which support it. Adds a label comment to the query.
    :return:
        A list with the result of `transform` for every widget of this query, in widget order.
    """
    queries = add_hints(self.sql, hint)

    operations = find_operations_for_widgets(self._widgets)
    share_dimensions = find_share_dimensions(self._dimensions, operations)

    fetched = fetch_data(
        self.dataset.database,
        queries,
        self._dimensions,
        share_dimensions,
        self.reference_groups,
    )
    # NOTE(review): `fetch_data` as defined in this file returns `(max_rows_returned, data_frame)`. Assigning
    # that tuple directly to `data_frame` breaks the column assignments below, so take the data frame out of
    # the tuple. A bare data frame is still accepted in case another `fetch_data` is in scope here - confirm.
    data_frame = fetched[1] if isinstance(fetched, tuple) else fetched

    # Apply operations
    for operation in operations:
        for reference in [None] + self._references:
            df_key = alias_selector(reference_alias(operation, reference))
            data_frame[df_key] = operation.apply(data_frame, reference)

    data_frame = scrub_totals_from_share_results(data_frame, self._dimensions)
    data_frame = special_cases.apply_operations_to_data_frame(operations, data_frame)
    data_frame = paginate(
        data_frame,
        self._widgets,
        orders=self.orders,
        limit=self._limit,
        offset=self._offset,
    )

    # Apply transformations
    return [
        widget.transform(data_frame, self.dataset, self._dimensions, self._references)
        for widget in self._widgets
    ]