def _get_exclusions(self, delta=False):
    """Build the exclusion filters used for the query's exclude clause.

    Each tag selected for group-by contributes an ``isnull`` filter so rows
    missing that tag are excluded.

    Returns:
        (Q): composed query filter for exclusion
    """
    exclusion_collection = QueryFilterCollection()
    prefix = self._mapper.tag_column
    for group_by_tag in self.get_tag_group_by_keys() or []:
        db_field = prefix + '__' + strip_tag_prefix(group_by_tag)
        exclusion_collection.add(
            QueryFilter(field=db_field, operation='isnull', parameter=True)
        )
    composed_exclusions = exclusion_collection.compose()
    LOG.debug(f'_get_exclusions: {composed_exclusions}')
    return composed_exclusions
def _set_operator_specified_filters(self, operator):
    """Set any filters using AND instead of OR.

    Args:
        operator (str): "and" or "or"; selects which operator-prefixed
            query parameters (e.g. "and:account") to process.

    Returns:
        (Q): composed Django Q object for the operator-specific filters.
    """
    filters = QueryFilterCollection()
    composed_filter = Q()
    for filter_key in self.SUPPORTED_FILTERS:
        operator_key = operator + ":" + filter_key
        filter_value = self.parameters.get_filter(operator_key)
        logical_operator = operator
        # A single "and:" value would be trivially satisfied, so treat it
        # as "or" to still produce a useful result.
        if filter_value and len(filter_value) < 2:
            logical_operator = "or"
        if filter_value and not OrgQueryHandler.has_wildcard(filter_value):
            filter_obj = self.FILTER_MAP.get(filter_key)
            if isinstance(filter_obj, list):
                # A list of filter definitions: compose each definition's
                # filters, then OR that result into the running filter.
                for _filt in filter_obj:
                    filt_filters = QueryFilterCollection()
                    for item in filter_value:
                        q_filter = QueryFilter(parameter=item, logical_operator=logical_operator, **_filt)
                        filt_filters.add(q_filter)
                    composed_filter = composed_filter | filt_filters.compose()
            else:
                for item in filter_value:
                    q_filter = QueryFilter(parameter=item, logical_operator=logical_operator, **filter_obj)
                    filters.add(q_filter)
    # Simple (non-list) filters are ANDed onto the composed result.
    if filters:
        composed_filter = composed_filter & filters.compose()
    return composed_filter
def _get_filter(self, delta=False):  # noqa: C901
    """Create dictionary for filter parameters.

    Args:
        delta (Boolean): Construct timeframe for delta

    Returns:
        (Q): composed query filters (None when no filters apply)
    """
    filters = QueryFilterCollection()
    for filter_key in self.SUPPORTED_FILTERS:
        filter_value = self.parameters.get_filter(filter_key)
        if filter_value and not OrgQueryHandler.has_wildcard(filter_value):
            filter_obj = self.FILTER_MAP.get(filter_key)
            for item in filter_value:
                q_filter = QueryFilter(parameter=item, **filter_obj)
                filters.add(q_filter)
    # Update filters that specify "and" or "or" in the query parameter
    and_composed_filters = self._set_operator_specified_filters("and")
    or_composed_filters = self._set_operator_specified_filters("or")
    composed_filters = filters.compose()
    filter_list = [composed_filters, and_composed_filters, or_composed_filters]
    final_filters = None
    for filter_option in filter_list:
        if filter_option:
            if final_filters is not None:
                # BUG FIX: the ANDed result must be assigned back.
                # Previously `final_filters & filter_option` discarded its
                # result, silently dropping every group after the first.
                final_filters = final_filters & filter_option
            else:
                final_filters = filter_option
    LOG.debug(f"_get_filter: {final_filters}")
    return final_filters
def _get_filter(self, delta=False):
    """Assemble the time-constraint filters for the query.

    Args:
        delta (Boolean): Construct timeframe for delta

    Returns:
        (QueryFilterCollection): collection holding the usage time filters
    """
    collection = QueryFilterCollection()
    # add time constraint filters; a delta query shifts the window back.
    if delta:
        offset = self._get_date_delta()
        start, end = self.start_datetime - offset, self.end_datetime - offset
    else:
        start, end = self.start_datetime, self.end_datetime
    collection.add(query_filter=QueryFilter(field='usage_start__date', operation='gte', parameter=start))
    collection.add(query_filter=QueryFilter(field='usage_end__date', operation='lte', parameter=end))
    return collection
def _create_accounts_mapping(self):
    """Returns a mapping of org ids to accounts.

    Returns:
        (dict): {org_unit_id: [account alias or id, ...]}
    """
    account_mapping = {}
    with tenant_context(self.tenant):
        for source in self.data_sources:
            # Grab columns for this query
            account_info = source.get("account_alias_column")
            # Create filters & Query
            filters = QueryFilterCollection()
            # Rows with a non-null account alias column are accounts
            # (org-unit rows leave this column null).
            no_org_units = QueryFilter(field=f"{account_info}", operation="isnull", parameter=False)
            filters.add(no_org_units)
            composed_filters = filters.compose()
            account_query = source.get("db_table").objects
            account_query = account_query.filter(composed_filters)
            # Drop accounts deleted before or created after the report window.
            account_query = account_query.exclude(deleted_timestamp__lte=self.start_datetime)
            account_query = account_query.exclude(created_timestamp__gt=self.end_datetime)
            if self.access:
                # Apply RBAC account restrictions unless wildcard access.
                accounts_to_filter = self.access.get("aws.account", {}).get("read", [])
                if accounts_to_filter and "*" not in accounts_to_filter:
                    account_query = account_query.filter(account_alias__account_id__in=accounts_to_filter)
            # Keep only the most recently created row per account alias.
            account_query = account_query.order_by(f"{account_info}", "-created_timestamp")
            account_query = account_query.distinct(f"{account_info}")
            # Prefer the alias; fall back to the raw account id.
            account_query = account_query.annotate(
                alias=Coalesce(F(f"{account_info}__account_alias"), F(f"{account_info}__account_id"))
            )
            for account in account_query:
                org_id = account.org_unit_id
                alias = account.alias
                if account_mapping.get(org_id):
                    account_list = account_mapping[org_id]
                    account_list.append(alias)
                    account_mapping[org_id] = account_list
                else:
                    account_mapping[org_id] = [alias]
    return account_mapping
def _get_key_filter(self):
    """
    Add new `exact` QueryFilter that filters on the key name.

    If filtering on value, uses the tags summary table to find the key
    """
    exact_key_filter = QueryFilter(field="key", operation="exact", parameter=self.key)
    key_collection = QueryFilterCollection()
    key_collection.add(exact_key_filter)
    return self.query_filter & key_collection.compose()
def test_add_filter(self):
    """Test the add() method using a QueryFilter instance."""
    expected_filters = []
    collection = QueryFilterCollection()
    for _ in range(3):
        new_filter = QueryFilter(self.fake.word(), self.fake.word(), self.fake.word(), self.fake.word())
        expected_filters.append(new_filter)
        collection.add(query_filter=new_filter)
    self.assertEqual(collection._filters, expected_filters)
def _set_operator_specified_filters(self, operator):
    """Set any filters using AND instead of OR.

    Args:
        operator (str): "and" or "or"; selects the operator-prefixed
            query parameters to compose.

    Returns:
        (Q): composed Django Q object of operator-specific filters.
    """
    fields = self._mapper._provider_map.get("filters")
    filters = QueryFilterCollection()
    composed_filter = Q()
    for q_param, filt in fields.items():
        q_param = operator + ":" + q_param
        # Values can come from either group_by or filter parameters.
        group_by = self.parameters.get_group_by(q_param, list())
        filter_ = self.parameters.get_filter(q_param, list())
        list_ = list(set(group_by + filter_))  # uniquify the list
        logical_operator = operator
        # This is a flexibilty feature allowing a user to set
        # a single and: value and still get a result instead
        # of erroring on validation
        if len(list_) < 2:
            logical_operator = "or"
        if list_ and not ReportQueryHandler.has_wildcard(list_):
            if isinstance(filt, list):
                for _filt in filt:
                    filt_filters = QueryFilterCollection()
                    for item in list_:
                        q_filter = QueryFilter(parameter=item, logical_operator=logical_operator, **_filt)
                        filt_filters.add(q_filter)
                    # List filter are a complex mix of and/or logic
                    # Each filter in the list must be ORed together
                    # regardless of the operator on the item in the filter
                    # Ex:
                    # (OR:
                    #   (AND:
                    #     ('cluster_alias__icontains', 'ni'),
                    #     ('cluster_alias__icontains', 'se')
                    #   ),
                    #   (AND:
                    #     ('cluster_id__icontains', 'ni'),
                    #     ('cluster_id__icontains', 'se')
                    #   )
                    # )
                    composed_filter = composed_filter | filt_filters.compose()
            else:
                # Custom filters may expand the raw value list before composing.
                list_ = self._build_custom_filter_list(q_param, filt.get("custom"), list_)
                for item in list_:
                    q_filter = QueryFilter(parameter=item, logical_operator=logical_operator, **filt)
                    filters.add(q_filter)
    # Non-list filters are ANDed onto the composed result.
    if filters:
        composed_filter = composed_filter & filters.compose()
    return composed_filter
def get_org_units(self):
    """Get a list of org keys to build upon.

    Returns:
        (list): org unit value dicts (id, name, path, level columns)
        (list): database ids of the org units included
    """
    org_units = list()
    org_id_list = list()
    with tenant_context(self.tenant):
        for source in self.data_sources:
            # Grab columns for this query
            org_id = source.get("org_id_column")
            org_path = source.get("org_path_column")
            org_name = source.get("org_name_column")
            level = source.get("level_column")
            account_info = source.get("account_alias_column")
            created_field = source.get("created_time_column")
            # Create filters & Query
            account_filter = QueryFilterCollection()
            # Org-unit rows have a null account alias column; this filter
            # removes account rows from the result set.
            no_accounts = QueryFilter(field=f"{account_info}", operation="isnull", parameter=True)
            account_filter.add(no_accounts)
            remove_accounts = account_filter.compose()
            org_unit_query = source.get("db_table").objects
            org_unit_query = org_unit_query.filter(remove_accounts)
            # Drop units deleted before or created after the report window.
            org_unit_query = org_unit_query.exclude(deleted_timestamp__lte=self.start_datetime)
            org_unit_query = org_unit_query.exclude(created_timestamp__gt=self.end_datetime)
            val_list = [org_id, org_name, org_path, level]
            # Keep only the most recently created row per org id.
            org_unit_query = org_unit_query.order_by(f"{org_id}", f"-{created_field}").distinct(f"{org_id}")
            org_ids = org_unit_query.values_list("id", flat=True)
            if self.access:
                acceptable_ous = self.access.get("aws.organizational_unit", {}).get("read", [])
                if acceptable_ous and "*" not in acceptable_ous:
                    # RBAC: restrict to org units whose path contains any
                    # allowed OU (OR of icontains matches).
                    allowed_ids_query = source.get("db_table").objects
                    allowed_ids_query = allowed_ids_query.filter(
                        reduce(operator.or_, (Q(org_unit_path__icontains=rbac) for rbac in acceptable_ous))
                    ).filter(remove_accounts)
                    allowed_ids = allowed_ids_query.values_list("id", flat=True)
                    org_ids = list(set(org_ids) & set(allowed_ids))
                    org_unit_query = org_unit_query.filter(id__in=org_ids)
            org_id_list.extend(org_ids)
            # Note: you want to collect the org_id_list before you implement the self.query_filter
            # so that way the get_sub_ou list will still work when you do filter[org_unit_id]=OU_002
            if self.query_filter:
                org_unit_query = org_unit_query.filter(self.query_filter)
            org_unit_query = org_unit_query.values(*val_list)
            org_units.extend(org_unit_query)
    return org_units, org_id_list
def _set_or_filters(self):
    """Create a composed filter collection of ORed filters.

    This is designed to handle specific cases in the provider_map
    not to accomodate user input via the API.
    """
    or_collection = QueryFilterCollection()
    for filter_kwargs in self._mapper._report_type_map.get("or_filter", []):
        or_collection.add(QueryFilter(**filter_kwargs))
    return or_collection.compose(logical_operator="or")
def test_compose_with_or_operator(self):
    """Test the compose() method with or operator on the compose method."""
    collection = QueryFilterCollection()
    operation = self.fake.word()
    first = QueryFilter(
        table=self.fake.word(), field=self.fake.word(), operation=operation, parameter=self.fake.word()
    )
    second = QueryFilter(
        table=self.fake.word(), field=self.fake.word(), operation=operation, parameter=self.fake.word()
    )
    expected = first.composed_Q() | second.composed_Q()
    collection.add(first)
    collection.add(second)
    self.assertEqual(collection.compose(logical_operator="or"), expected)
def _get_filter(self, delta=False):  # noqa: C901
    """Create dictionary for filter parameters.

    Args:
        delta (Boolean): Construct timeframe for delta

    Returns:
        (Q): composed query filters
    """
    filters = QueryFilterCollection()
    # Value-filtered tag queries skip the time-based filters entirely.
    if not self.parameters.get_filter("value"):
        for source in self.data_sources:
            start_filter, end_filter = self._get_time_based_filters(source, delta)
            filters.add(query_filter=start_filter)
            filters.add(query_filter=end_filter)
    for filter_key in self.SUPPORTED_FILTERS:
        # "enabled" does not apply when filtering on a tag value.
        if self.parameters.get_filter("value") and filter_key == "enabled":
            continue
        filter_value = self.parameters.get_filter(filter_key)
        if filter_value and not TagQueryHandler.has_wildcard(filter_value):
            filter_obj = self.filter_map.get(filter_key)
            if isinstance(filter_value, bool):
                # Boolean filters carry no per-item parameter.
                filters.add(QueryFilter(**filter_obj))
            elif isinstance(filter_obj, list):
                for _filt in filter_obj:
                    for item in filter_value:
                        q_filter = QueryFilter(parameter=item, **_filt)
                        filters.add(q_filter)
            else:
                for item in filter_value:
                    q_filter = QueryFilter(parameter=item, **filter_obj)
                    filters.add(q_filter)
        # Apply RBAC access restrictions for this filter key, if any.
        access = self.parameters.get_access(filter_key)
        filt = self.filter_map.get(filter_key)
        if access and filt:
            self.set_access_filters(access, filt, filters)
    # Update filters that specify "and" or "or" in the query parameter
    and_composed_filters = self._set_operator_specified_filters("and")
    or_composed_filters = self._set_operator_specified_filters("or")
    composed_filters = filters.compose()
    composed_filters = composed_filters & and_composed_filters & or_composed_filters
    LOG.debug(f"_get_filter: {composed_filters}")
    return composed_filters
def test_add_params(self):
    """Test the add() method using parameters."""
    kwargs = {
        "table": self.fake.word(),
        "field": self.fake.word(),
        "operation": self.fake.word(),
        "parameter": self.fake.word(),
    }
    expected = QueryFilter(**kwargs)
    collection = QueryFilterCollection()
    collection.add(**kwargs)
    self.assertEqual(collection._filters[0], expected)
def _get_filter(self, delta=False):
    """Create dictionary for filter parameters.

    Args:
        delta (Boolean): Construct timeframe for delta

    Returns:
        (QueryFilterCollection): the time-constraint filters
    """
    time_filters = QueryFilterCollection()
    # add time constraint filters
    start_filter, end_filter = self._get_time_based_filters(delta)
    for time_filter in (start_filter, end_filter):
        time_filters.add(query_filter=time_filter)
    return time_filters
def test_compose(self):
    """Test the compose() method."""
    collection = QueryFilterCollection()
    params = dict(
        table=self.fake.word(),
        field=self.fake.word(),
        operation=self.fake.word(),
        parameter=self.fake.word(),
    )
    expected = QueryFilter(**params).composed_Q()
    collection.add(**params)
    self.assertEqual(collection.compose(), expected)
def test_compose_with_filter_with_and_operator(self):
    """Test the compose() method with and operator on the filter."""
    collection = QueryFilterCollection()
    table = self.fake.word()
    field = self.fake.word()
    operation = self.fake.word()
    first, second = (
        QueryFilter(table=table, field=field, operation=operation,
                    parameter=self.fake.word(), logical_operator='and')
        for _ in range(2)
    )
    expected = first.composed_Q() & second.composed_Q()
    collection.add(first)
    collection.add(second)
    self.assertEqual(collection.compose(), expected)
def _get_gcp_filter(self, delta=False):
    """Create dictionary for filter parameters for GCP.

    For the gcp filters when the time scope is -1 or -2 we remove the
    usage_start & usage_end filters and only use the invoice month.

    Args:
        delta (Boolean): Construct timeframe for delta

    Returns:
        (QueryFilterCollection): query filters
    """
    gcp_filters = QueryFilterCollection()
    if delta:
        offset = self._get_date_delta()
        start = self.start_datetime - offset
        end = self.end_datetime - offset
    else:
        start = self.start_datetime
        end = self.end_datetime
    invoice_months = self.dh.gcp_find_invoice_months_in_date_range(start.date(), end.date())
    gcp_filters.add(QueryFilter(field="invoice_month", operation="in", parameter=invoice_months))
    monthly_time_scope = bool(self.parameters.get_filter("time_scope_value")) and self.time_scope_value in [-1, -2]
    # Time scopes -1/-2 rely on the invoice month alone unless a delta is requested.
    if delta or not monthly_time_scope:
        gcp_filters.add(query_filter=QueryFilter(field="usage_start", operation="gte", parameter=start.date()))
        gcp_filters.add(query_filter=QueryFilter(field="usage_start", operation="lte", parameter=end.date()))
    return gcp_filters
def _get_sub_ou_list(self, data, org_ids):
    """Get a list of the sub org units for a org unit."""
    # Children live exactly one level below the given unit, on its path.
    child_level = data.get("level") + 1
    unit_path = data.get("org_unit_path")
    final_sub_ou_list = []
    with tenant_context(self.tenant):
        for source in self.data_sources:
            # Grab columns for this query
            account_info = source.get("account_alias_column")
            level_column = source.get("level_column")
            org_path = source.get("org_path_column")
            # Build filters
            filters = QueryFilterCollection()
            filters.add(QueryFilter(field=f"{account_info}", operation="isnull", parameter=True))
            filters.add(QueryFilter(field=f"{level_column}", operation="exact", parameter=child_level))
            filters.add(QueryFilter(field=f"{org_path}", operation="icontains", parameter=unit_path))
            # Start quering
            sub_org_query = source.get("db_table").objects.filter(filters.compose())
            sub_org_query = sub_org_query.filter(id__in=org_ids)
            final_sub_ou_list.extend(sub_org_query.values_list("org_unit_id", flat=True))
    return final_sub_ou_list
def _get_exclusions(self, column):
    """Create dictionary for filter parameters for exclude clause.

    For tags this is to filter items that have null values for the
    specified tag field.

    Args:
        column (str): The tag column being queried

    Returns:
        (Q): composed exclusion filter
    """
    null_filter = QueryFilter(field=column, operation="isnull", parameter=True)
    exclusion_collection = QueryFilterCollection()
    exclusion_collection.add(null_filter)
    composed_exclusions = exclusion_collection.compose()
    LOG.debug(f"_get_exclusions: {composed_exclusions}")
    return composed_exclusions
def get_tag_values(self):
    """
    Gets the values associated with a tag when filtering on a value.

    Returns:
        (list): deduplicated, sorted tag value data
    """
    final_data = []
    with tenant_context(self.tenant):
        tag_keys = {}
        for source in self.TAGS_VALUES_SOURCE:
            vals_filter = QueryFilterCollection()
            # Match the requested key against each key field of this source;
            # the shared composition_key groups these filters together.
            for key_field in source.get("fields"):
                vals_filter.add(
                    QueryFilter(field=key_field, operation="exact", parameter=self.key, composition_key="filter_key")
                )
            tag_values_query = source.get("db_table").objects
            filt = self.query_filter & vals_filter.compose()
            tag_keys = list(tag_values_query.filter(filt))
            # NOTE(review): statement grouping reconstructed from collapsed
            # source — per-source results appear to be appended each loop
            # iteration; confirm against the original file's indentation.
            tag_tup = self._value_filter_dict(tag_keys)
            converted = self._convert_to_dict(tag_tup)
            self.append_to_final_data_without_type(final_data, converted)
    self.deduplicate_and_sort(final_data)
    return final_data
class Forecast:
    """Base forecasting class."""

    # the minimum number of data points needed to use the current month's data.
    # if we have fewer than this many data points, fall back to using the previous month's data.
    #
    # this number is chosen in part because statsmodels.stats.stattools.omni_normtest() needs at least eight data
    # points to test for normal distribution.
    MINIMUM = 8

    # the precision of the floats returned in the forecast response.
    PRECISION = 8

    REPORT_TYPE = "costs"

    def __init__(self, query_params):  # noqa: C901
        """Class Constructor.

        Instance Attributes:
            - cost_summary_table (Model)
            - aggregates (dict)
            - filters (QueryFilterCollection)
            - query_range (tuple)
        """
        self.dh = DateHelper()
        self.params = query_params

        # select appropriate model based on access
        access = query_params.get("access", {})
        access_key = "default"
        self.cost_summary_table = self.provider_map.views.get("costs").get(access_key)
        if access:
            access_key = tuple(access.keys())
            filter_fields = self.provider_map.provider_map.get("filters")
            materialized_view = self.provider_map.views.get("costs").get(access_key)
            if materialized_view:
                # We found a matching materialized view, use that
                self.cost_summary_table = materialized_view
            else:
                # We have access constraints, but no view to accomodate, default to daily summary table
                self.cost_summary_table = self.provider_map.query_table

        # forecast out to the end of the current month
        self.forecast_days_required = (self.dh.this_month_end - self.dh.yesterday).days

        # forecasts use a rolling window
        self.query_range = (self.dh.n_days_ago(self.dh.yesterday, 30), self.dh.yesterday)

        self.filters = QueryFilterCollection()
        self.filters.add(field="usage_start", operation="gte", parameter=self.query_range[0])
        self.filters.add(field="usage_end", operation="lte", parameter=self.query_range[1])

        # filter queries based on access
        if access_key != "default":
            for q_param, filt in filter_fields.items():
                access = query_params.get_access(q_param, list())
                if access:
                    self.set_access_filters(access, filt, self.filters)

    @property
    def provider_map(self):
        """Return the provider map instance."""
        return self.provider_map_class(self.provider, self.REPORT_TYPE)

    @property
    def total_cost_term(self):
        """Return the provider map value for total cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("cost_total")

    @property
    def supplementary_cost_term(self):
        """Return the provider map value for total supplemenatry cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("sup_total")

    @property
    def infrastructure_cost_term(self):
        """Return the provider map value for total inftrastructure cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("infra_total")

    def predict(self):
        """Define ORM query to run forecast and return prediction."""
        cost_predictions = {}
        with tenant_context(self.params.tenant):
            data = (
                self.cost_summary_table.objects.filter(self.filters.compose())
                .order_by("usage_start")
                .values("usage_start")
                .annotate(
                    total_cost=self.total_cost_term,
                    supplementary_cost=self.supplementary_cost_term,
                    infrastructure_cost=self.infrastructure_cost_term,
                )
            )
            # Run one forecast per cost field over the same queryset.
            for fieldname in ["total_cost", "infrastructure_cost", "supplementary_cost"]:
                uniq_data = self._uniquify_qset(data.values("usage_start", fieldname), field=fieldname)
                cost_predictions[fieldname] = self._predict(uniq_data)
            cost_predictions = self._key_results_by_date(cost_predictions)
            return self.format_result(cost_predictions)

    def _predict(self, data):
        """Handle pre and post prediction work.

        This function handles arranging incoming data to conform with statsmodels requirements.
        Then after receiving the forecast output, this function handles formatting to conform to
        API reponse requirements.

        Args:
            data (list) a list of (datetime, float) tuples

        Returns:
            (LinearForecastResult) linear forecast results object
        """
        LOG.debug("Forecast input data: %s", data)

        if len(data) < self.MINIMUM:
            LOG.warning(
                "Number of data elements (%s) is fewer than the minimum (%s). Unable to generate forecast.",
                len(data),
                self.MINIMUM,
            )
            return []

        dates, costs = zip(*data)
        X = self._enumerate_dates(dates)
        Y = [float(c) for c in costs]

        # calculate x-values for the prediction range
        pred_x = [i for i in range(X[-1] + 1, X[-1] + 1 + self.forecast_days_required)]

        # run the forecast
        results = self._run_forecast(X, Y, to_predict=pred_x)

        result_dict = {}
        for i, value in enumerate(results.prediction):
            # extrapolate confidence intervals to align with prediction.
            # this reduces the confidence interval below 95th percentile, but is a better UX.
            if i < len(results.confidence_lower):
                lower = results.confidence_lower[i]
            else:
                lower = results.confidence_lower[-1] + results.slope * (i - len(results.confidence_lower))

            if i < len(results.confidence_upper):
                upper = results.confidence_upper[i]
            else:
                upper = results.confidence_upper[-1] + results.slope * (i - len(results.confidence_upper))

            # ensure that there are no negative numbers.
            result_dict[self.dh.today.date() + timedelta(days=i)] = {
                "total_cost": max((value, 0)),
                "confidence_min": max((lower, 0)),
                "confidence_max": max((upper, 0)),
            }

        return (result_dict, results.rsquared, results.pvalues)

    def _enumerate_dates(self, date_list):
        """Given a list of dates, return a list of integers.

        This method works in conjunction with _remove_outliers(). This method works to preserve
        any gaps in the data created by _remove_outliers() so that the integers used for the
        X-axis are aligned appropriately.

        Example:
            If _remove_outliers() returns {"2000-01-01": 1.0, "2000-01-03": 1.5}
            then _enumerate_dates() returns [0, 2]
        """
        days = self.dh.list_days(
            datetime.combine(date_list[0], self.dh.midnight), datetime.combine(date_list[-1], self.dh.midnight)
        )
        out = [i for i, day in enumerate(days) if day.date() in date_list]
        return out

    def _remove_outliers(self, data):
        """Remove outliers from our dateset before predicting.

        We use a box plot method without plotting the box.
        """
        values = list(data.values())
        if values:
            third_quartile, first_quartile = np.percentile(values, [Decimal(75), Decimal(25)])
            interquartile_range = third_quartile - first_quartile

            upper_boundary = third_quartile + (Decimal(1.5) * interquartile_range)
            lower_boundary = first_quartile - (Decimal(1.5) * interquartile_range)

            return {key: value for key, value in data.items() if (value >= lower_boundary and value <= upper_boundary)}
        return data

    def _key_results_by_date(self, results, check_term="total_cost"):
        """Take results formatted by cost type, and return results keyed by date."""
        results_by_date = defaultdict(dict)
        date_based_dict = results[check_term][0] if results[check_term] else []
        for date in date_based_dict:
            for cost_term in results:
                if results[cost_term][0].get(date):
                    results_by_date[date][cost_term] = (
                        results[cost_term][0][date],
                        {"rsquared": results[cost_term][1]},
                        {"pvalues": results[cost_term][2]},
                    )
        return results_by_date

    def format_result(self, results):
        """Format results for API consumption."""
        f_format = f"%.{self.PRECISION}f"  # avoid converting floats to e-notation
        units = "USD"
        response = []
        for key in results:
            if key > self.dh.this_month_end.date():
                continue
            dikt = {
                "date": key,
                "values": [
                    {
                        "date": key,
                        "infrastructure": {
                            "total": {
                                "value": round(results[key]["infrastructure_cost"][0]["total_cost"], 3),
                                "units": units,
                            },
                            "confidence_max": {
                                "value": round(results[key]["infrastructure_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["infrastructure_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["infrastructure_cost"][1]["rsquared"],
                                "units": None,
                            },
                            "pvalues": {"value": results[key]["infrastructure_cost"][2]["pvalues"], "units": None},
                        },
                        "supplementary": {
                            "total": {
                                "value": round(results[key]["supplementary_cost"][0]["total_cost"], 3),
                                "units": units,
                            },
                            "confidence_max": {
                                "value": round(results[key]["supplementary_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["supplementary_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["supplementary_cost"][1]["rsquared"],
                                "units": None,
                            },
                            "pvalues": {"value": results[key]["supplementary_cost"][2]["pvalues"], "units": None},
                        },
                        "cost": {
                            "total": {"value": round(results[key]["total_cost"][0]["total_cost"], 3), "units": units},
                            "confidence_max": {
                                "value": round(results[key]["total_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["total_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {"value": f_format % results[key]["total_cost"][1]["rsquared"], "units": None},
                            "pvalues": {"value": results[key]["total_cost"][2]["pvalues"], "units": None},
                        },
                    }
                ],
            }
            response.append(dikt)
        return response

    def _run_forecast(self, x, y, to_predict=None):
        """Apply the forecast model.

        Args:
            x (list) a list of exogenous variables
            y (list) a list of endogenous variables
            to_predict (list) a list of exogenous variables used in the forecast results

        Note:
            both x and y MUST be the same number of elements

        Returns:
            (tuple)
                (numpy.ndarray) prediction values
                (numpy.ndarray) confidence interval lower bound
                (numpy.ndarray) confidence interval upper bound
                (float) R-squared value
                (list) P-values
        """
        x = sm.add_constant(x)
        to_predict = sm.add_constant(to_predict)
        model = sm.OLS(y, x)
        results = model.fit()
        return LinearForecastResult(results, exog=to_predict)

    def _uniquify_qset(self, qset, field="total_cost"):
        """Take a QuerySet list, sum costs within the same day, and arrange it into a list of tuples.

        Args:
            qset (QuerySet)
            field (str) - field name in the QuerySet to be summed

        Returns:
            [(date, cost), ...]
        """
        # FIXME: this QuerySet->dict->list conversion probably isn't ideal.
        # FIXME: there's probably a way to aggregate multiple sources by date using just the ORM.
        result = defaultdict(Decimal)
        for item in qset:
            result[item.get("usage_start")] += Decimal(item.get(field, 0.0))
        result = self._remove_outliers(result)
        out = [(k, v) for k, v in result.items()]
        return out

    def set_access_filters(self, access, filt, filters):
        """Set access filters to ensure RBAC restrictions adhere to user's access and filters.

        Args:
            access (list) the list containing the users relevant access
            filt (list or dict) contains the filters to be updated
            filters (QueryFilterCollection) the filter collection to add the new filters to

        returns:
            None
        """
        if isinstance(filt, list):
            for _filt in filt:
                _filt["operation"] = "in"
                q_filter = QueryFilter(parameter=access, **_filt)
                filters.add(q_filter)
        else:
            filt["operation"] = "in"
            q_filter = QueryFilter(parameter=access, **filt)
            filters.add(q_filter)
def test_add_bad(self):
    """Test the add() method using invalid values."""
    collection = QueryFilterCollection()
    bad_args = (self.fake.word(), self.fake.word(), self.fake.word())
    with self.assertRaises(AttributeError):
        collection.add(*bad_args)
def _get_key_filter(self):
    """Add new `exact` QueryFilter that filters on the key name."""
    exact_key_filter = QueryFilter(field="key", operation="exact", parameter=self.key)
    key_collection = QueryFilterCollection()
    key_collection.add(exact_key_filter)
    return self.query_filter & key_collection.compose()
class Forecast:
    """Base forecasting class.

    Runs an ordinary-least-squares linear forecast over daily cost data and
    formats the prediction (with confidence bounds) for API consumption.
    """

    # the minimum number of data points needed to use the current month's data.
    # if we have fewer than this many data points, fall back to using the previous month's data.
    #
    # this number is chosen in part because statsmodels.stats.stattools.omni_normtest() needs at least eight data
    # points to test for normal distribution.
    MINIMUM = 8

    # the precision of the floats returned in the forecast response.
    PRECISION = 8

    REPORT_TYPE = "costs"

    # shared date helper; evaluated once at class-definition time
    dh = DateHelper()

    def __init__(self, query_params):  # noqa: C901
        """Class Constructor.

        Instance Attributes:
            - cost_summary_table (Model)
            - aggregates (dict)
            - filters (QueryFilterCollection)
            - query_range (tuple)
        """
        self.params = query_params

        # select appropriate model based on access
        access = query_params.get("access", {})
        access_key = "default"
        self.cost_summary_table = self.provider_map.views.get("costs").get(access_key)
        if access:
            # the tuple of access keys indexes into the provider map's materialized views
            access_key = tuple(access.keys())
            filter_fields = self.provider_map.provider_map.get("filters")
            materialized_view = self.provider_map.views.get("costs").get(access_key)
            if materialized_view:
                # We found a matching materialized view, use that
                self.cost_summary_table = materialized_view
            else:
                # We have access constraints, but no view to accomodate, default to daily summary table
                self.cost_summary_table = self.provider_map.report_type_map.get("tables", {}).get("query")

        current_day_of_month = self.dh.today.day
        yesterday = (self.dh.today - timedelta(days=1)).day
        last_day_of_month = self.dh.this_month_end.day

        # number of days that still need a predicted value to cover the full month
        if current_day_of_month == 1:
            self.forecast_days_required = last_day_of_month
        else:
            self.forecast_days_required = last_day_of_month - yesterday

        # with too few data points this month, fall back to last month's data (see MINIMUM)
        if current_day_of_month <= self.MINIMUM:
            self.query_range = (self.dh.last_month_start, self.dh.last_month_end)
        else:
            self.query_range = (self.dh.this_month_start, self.dh.today - timedelta(days=1))

        self.filters = QueryFilterCollection()
        self.filters.add(field="usage_start", operation="gte", parameter=self.query_range[0])
        self.filters.add(field="usage_end", operation="lte", parameter=self.query_range[1])

        # filter queries based on access
        if access_key != "default":
            # filter_fields is only bound when access was truthy, which is the
            # same condition that changed access_key away from "default"
            for q_param, filt in filter_fields.items():
                access = query_params.get_access(q_param, list())
                if access:
                    self.set_access_filters(access, filt, self.filters)

    @property
    def provider_map(self):
        """Return the provider map instance."""
        # NOTE(review): relies on self.provider_map_class / self.provider being
        # defined by a subclass — confirm against subclasses.
        return self.provider_map_class(self.provider, self.REPORT_TYPE)

    @property
    def total_cost_term(self):
        """Return the provider map value for total cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("cost_total")

    @property
    def supplementary_cost_term(self):
        """Return the provider map value for total supplementary cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("sup_total")

    @property
    def infrastructure_cost_term(self):
        """Return the provider map value for total infrastructure cost."""
        return self.provider_map.report_type_map.get("aggregates", {}).get("infra_total")

    def predict(self):
        """Define ORM query to run forecast and return prediction."""
        cost_predictions = {}
        with tenant_context(self.params.tenant):
            # one base queryset, annotated with all three cost aggregates
            data = (
                self.cost_summary_table.objects.filter(self.filters.compose())
                .order_by("usage_start")
                .values("usage_start")
                .annotate(
                    total_cost=self.total_cost_term,
                    supplementary_cost=self.supplementary_cost_term,
                    infrastructure_cost=self.infrastructure_cost_term,
                )
            )

            # project each cost type out of the shared annotated queryset
            total_cost_data = data.values("usage_start", "total_cost")
            infra_cost_data = data.values("usage_start", "infrastructure_cost")
            supp_cost_data = data.values("usage_start", "supplementary_cost")

            cost_predictions["total_cost"] = self._predict(self._uniquify_qset(total_cost_data, field="total_cost"))
            cost_predictions["infrastructure_cost"] = self._predict(
                self._uniquify_qset(infra_cost_data, field="infrastructure_cost")
            )
            cost_predictions["supplementary_cost"] = self._predict(
                self._uniquify_qset(supp_cost_data, field="supplementary_cost")
            )
            cost_predictions = self._key_results_by_date(cost_predictions)
            return self.format_result(cost_predictions)

    def _predict(self, data):
        """Handle pre and post prediction work.

        Args:
            data (list) a list of (datetime, float) tuples

        Returns:
            (tuple) (result_dict keyed by predicted date, rsquared, pvalues),
            or an empty list when there is not enough data to forecast.
        """
        LOG.debug("Forecast input data: %s", data)

        # OLS needs at least two points to fit a line at all
        if len(data) < 2:
            LOG.warning("Unable to calculate forecast. Insufficient data for %s.", self.params.tenant)
            return []

        if len(data) < self.MINIMUM:
            LOG.warning("Number of data elements is fewer than the minimum.")

        # arrange the data into a form that statsmodels will accept.
        dates, costs = zip(*data)
        # dates become monotonically increasing integers, e.g. 20240131
        X = [int(d.strftime("%Y%m%d")) for d in dates]
        Y = [float(c) for c in costs]

        # run the forecast
        results = self._run_forecast(X, Y)

        # predictions are indexed from today forward, one entry per day
        result_dict = {}
        for i, value in enumerate(results.prediction):
            result_dict[self.dh.today.date() + timedelta(days=i)] = {
                "total_cost": value,
                "confidence_min": results.confidence_lower[i],
                "confidence_max": results.confidence_upper[i],
            }
        # extend the prediction linearly so the whole month is covered
        result_dict = self._add_additional_data_points(result_dict, results.slope)

        return (result_dict, results.rsquared, results.pvalues)

    def _remove_outliers(self, data):
        """Remove outliers from our dataset before predicting.

        We use a box plot method without plotting the box.
        """
        values = list(data.values())
        if values:
            # interquartile-range fence: anything beyond Q3 + 1.5*IQR or
            # below Q1 - 1.5*IQR is treated as an outlier and dropped.
            third_quartile, first_quartile = np.percentile(values, [Decimal(75), Decimal(25)])
            interquartile_range = third_quartile - first_quartile

            upper_boundary = third_quartile + (Decimal(1.5) * interquartile_range)
            lower_boundary = first_quartile - (Decimal(1.5) * interquartile_range)

            return {
                key: value
                for key, value in data.items()
                if (value >= lower_boundary and value <= upper_boundary)
            }
        # empty input: nothing to filter
        return data

    def _key_results_by_date(self, results, check_term="total_cost"):
        """Take results formatted by cost type, and return results keyed by date."""
        results_by_date = defaultdict(dict)
        # use check_term's dates as the canonical set of predicted dates;
        # an insufficient-data forecast yields [] and produces no entries
        date_based_dict = results[check_term][0] if results[check_term] else []
        for date in date_based_dict:
            for cost_term in results:
                results_by_date[date][cost_term] = (
                    results[cost_term][0][date],
                    {"rsquared": results[cost_term][1]},
                    {"pvalues": results[cost_term][2]},
                )
        return results_by_date

    def format_result(self, results):
        """Format results for API consumption."""
        f_format = f"%.{self.PRECISION}f"  # avoid converting floats to e-notation
        units = "USD"
        response = []
        for key in results:
            # never report predictions past the end of the current month
            if key > self.dh.this_month_end.date():
                continue
            dikt = {
                "date": key,
                "values": [
                    {
                        "date": key,
                        "infrastructure": {
                            "total": {
                                "value": round(results[key]["infrastructure_cost"][0]["total_cost"], 3),
                                "units": units,
                            },
                            "confidence_max": {
                                "value": round(results[key]["infrastructure_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                # clamp lower confidence bound at zero — costs cannot be negative
                                "value": round(max(results[key]["infrastructure_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["infrastructure_cost"][1]["rsquared"],
                                "units": None,
                            },
                            "pvalues": {
                                "value": f_format % results[key]["infrastructure_cost"][2]["pvalues"],
                                "units": None,
                            },
                        },
                        "supplementary": {
                            "total": {
                                "value": round(results[key]["supplementary_cost"][0]["total_cost"], 3),
                                "units": units,
                            },
                            "confidence_max": {
                                "value": round(results[key]["supplementary_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["supplementary_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["supplementary_cost"][1]["rsquared"],
                                "units": None,
                            },
                            "pvalues": {
                                "value": f_format % results[key]["supplementary_cost"][2]["pvalues"],
                                "units": None,
                            },
                        },
                        "cost": {
                            "total": {
                                "value": round(results[key]["total_cost"][0]["total_cost"], 3),
                                "units": units
                            },
                            "confidence_max": {
                                "value": round(results[key]["total_cost"][0]["confidence_max"], 3),
                                "units": units,
                            },
                            "confidence_min": {
                                "value": round(max(results[key]["total_cost"][0]["confidence_min"], 0), 3),
                                "units": units,
                            },
                            "rsquared": {
                                "value": f_format % results[key]["total_cost"][1]["rsquared"],
                                "units": None
                            },
                            "pvalues": {
                                "value": f_format % results[key]["total_cost"][2]["pvalues"],
                                "units": None
                            },
                        },
                    }
                ],
            }
            response.append(dikt)
        return response

    def _add_additional_data_points(self, results, slope):
        """Add extra entries to make sure we predict the full month.

        Extrapolates linearly from the last predicted entry using the
        fitted slope.
        """
        additional_days_needed = 0
        dates = results.keys()
        last_predicted_date = max(dates)
        days_already_predicted = len(dates)

        last_predicted_cost = results[last_predicted_date]["total_cost"]
        last_predicted_max = results[last_predicted_date]["confidence_max"]
        last_predicted_min = results[last_predicted_date]["confidence_min"]

        if days_already_predicted < self.forecast_days_required:
            additional_days_needed = self.forecast_days_required - days_already_predicted

        # each extra day advances every value by one more slope increment
        for i in range(1, additional_days_needed + 1):
            results[last_predicted_date + timedelta(days=i)] = {
                "total_cost": last_predicted_cost + (slope * i),
                "confidence_min": last_predicted_min + (slope * i),
                "confidence_max": last_predicted_max + (slope * i),
            }
        return results

    def _run_forecast(self, x, y):
        """Apply the forecast model.

        Args:
            x (list) a list of exogenous variables
            y (list) a list of endogenous variables

        Note:
            both x and y MUST be the same number of elements

        Returns:
            (LinearForecastResult) wrapper around the fitted OLS results.
        """
        # NOTE(review): unlike the sibling implementation, no constant/intercept
        # column is added here before fitting — confirm this is intentional.
        model = sm.OLS(y, x)
        results = model.fit()
        return LinearForecastResult(results)

    def _uniquify_qset(self, qset, field="total_cost"):
        """Take a QuerySet list, sum costs within the same day, and arrange it into a list of tuples.

        Args:
            qset (QuerySet)
            field (str) - field name in the QuerySet to be summed

        Returns:
            [(date, cost), ...]
        """
        # FIXME: this QuerySet->dict->list conversion probably isn't ideal.
        # FIXME: there's probably a way to aggregate multiple sources by date using just the ORM.
        # defaultdict(Decimal) starts each day at Decimal("0"), so += accumulates
        # per-day totals across multiple sources.
        result = defaultdict(Decimal)
        for item in qset:
            result[item.get("usage_start")] += Decimal(item.get(field, 0.0))
        # drop statistical outliers before the data is fed to the forecast
        result = self._remove_outliers(result)
        out = [(k, v) for k, v in result.items()]
        return out

    def set_access_filters(self, access, filt, filters):
        """Set access filters to ensure RBAC restrictions adhere to user's access and filters.

        Args:
            access (list) the list containing the users relevant access
            filt (list or dict) contains the filters to be updated
            filters (QueryFilterCollection) the filter collection to add the new filters to

        returns:
            None

        NOTE: ``filt`` is mutated in place — its "operation" entry is forced to "in".
        """
        if isinstance(filt, list):
            for _filt in filt:
                _filt["operation"] = "in"
                q_filter = QueryFilter(parameter=access, **_filt)
                filters.add(q_filter)
        else:
            filt["operation"] = "in"
            q_filter = QueryFilter(parameter=access, **filt)
            filters.add(q_filter)