Beispiel #1
0
    def _get_exclusions(self, delta=False):
        """Build the composed filter for the query's exclude clause.

        Adds one ``isnull=True`` filter per tag group-by key so that rows
        lacking a value for that tag are excluded.

        Returns:
            (Dict): query filter dictionary

        """
        collection = QueryFilterCollection()
        prefix = self._mapper.tag_column
        for tag_key in self.get_tag_group_by_keys() or []:
            field_name = prefix + '__' + strip_tag_prefix(tag_key)
            collection.add(
                QueryFilter(field=field_name, operation='isnull', parameter=True)
            )

        composed_exclusions = collection.compose()

        LOG.debug(f'_get_exclusions: {composed_exclusions}')
        return composed_exclusions
Beispiel #2
0
    def _set_operator_specified_filters(self, operator):
        """Set any filters using AND instead of OR."""
        collected = QueryFilterCollection()
        composed_filter = Q()
        for supported_key in self.SUPPORTED_FILTERS:
            values = self.parameters.get_filter(operator + ":" + supported_key)
            if not values or OrgQueryHandler.has_wildcard(values):
                continue
            # a single value cannot meaningfully be ANDed; fall back to OR
            logical_op = "or" if len(values) < 2 else operator
            mapped = self.FILTER_MAP.get(supported_key)
            if isinstance(mapped, list):
                # list filters: OR each sub-filter's composed collection
                for sub_map in mapped:
                    sub_collection = QueryFilterCollection()
                    for value in values:
                        sub_collection.add(
                            QueryFilter(parameter=value, logical_operator=logical_op, **sub_map)
                        )
                    composed_filter = composed_filter | sub_collection.compose()
            else:
                for value in values:
                    collected.add(
                        QueryFilter(parameter=value, logical_operator=logical_op, **mapped)
                    )
        if collected:
            composed_filter = composed_filter & collected.compose()

        return composed_filter
Beispiel #3
0
    def _get_filter(self, delta=False):  # noqa: C901
        """Create dictionary for filter parameters.

        Args:
            delta (Boolean): Construct timeframe for delta
        Returns:
            (Dict): query filter dictionary

        """
        filters = QueryFilterCollection()

        for filter_key in self.SUPPORTED_FILTERS:
            filter_value = self.parameters.get_filter(filter_key)
            if filter_value and not OrgQueryHandler.has_wildcard(filter_value):
                filter_obj = self.FILTER_MAP.get(filter_key)
                for item in filter_value:
                    q_filter = QueryFilter(parameter=item, **filter_obj)
                    filters.add(q_filter)

        # Update filters that specify 'and' or 'or' in the query parameter
        and_composed_filters = self._set_operator_specified_filters("and")
        or_composed_filters = self._set_operator_specified_filters("or")
        composed_filters = filters.compose()
        filter_list = [composed_filters, and_composed_filters, or_composed_filters]
        final_filters = None
        for filter_option in filter_list:
            if filter_option:
                if final_filters is not None:
                    # BUG FIX: the AND result was previously discarded
                    # (`final_filters & filter_option` with no assignment),
                    # so only the first non-empty filter was ever applied.
                    final_filters = final_filters & filter_option
                else:
                    final_filters = filter_option

        LOG.debug(f"_get_filter: {final_filters}")
        return final_filters
Beispiel #4
0
    def _get_filter(self, delta=False):
        """Create dictionary for filter parameters.

        Builds the usage_start/usage_end date-range filters, shifted back
        one period when constructing a delta timeframe.

        Args:
            delta (Boolean): Construct timeframe for delta
        Returns:
            (Dict): query filter dictionary

        """
        filters = QueryFilterCollection()

        start, end = self.start_datetime, self.end_datetime
        if delta:
            # shift the window back by one period for delta comparisons
            shift = self._get_date_delta()
            start, end = start - shift, end - shift

        filters.add(query_filter=QueryFilter(field='usage_start__date',
                                             operation='gte',
                                             parameter=start))
        filters.add(query_filter=QueryFilter(field='usage_end__date',
                                             operation='lte',
                                             parameter=end))

        return filters
Beispiel #5
0
 def _create_accounts_mapping(self):
     """Returns a mapping of org ids to accounts."""
     account_mapping = {}
     with tenant_context(self.tenant):
         for source in self.data_sources:
             # Grab columns for this query
             alias_column = source.get("account_alias_column")
             # Only rows that actually have an account alias are wanted.
             filters = QueryFilterCollection()
             filters.add(QueryFilter(field=f"{alias_column}", operation="isnull", parameter=False))
             account_query = (
                 source.get("db_table")
                 .objects.filter(filters.compose())
                 .exclude(deleted_timestamp__lte=self.start_datetime)
                 .exclude(created_timestamp__gt=self.end_datetime)
             )
             if self.access:
                 # RBAC: limit to the account ids the caller may read
                 allowed = self.access.get("aws.account", {}).get("read", [])
                 if allowed and "*" not in allowed:
                     account_query = account_query.filter(account_alias__account_id__in=allowed)
             account_query = (
                 account_query.order_by(f"{alias_column}", "-created_timestamp")
                 .distinct(f"{alias_column}")
                 .annotate(alias=Coalesce(F(f"{alias_column}__account_alias"), F(f"{alias_column}__account_id")))
             )
             for account in account_query:
                 # group each alias under its org unit id
                 account_mapping.setdefault(account.org_unit_id, []).append(account.alias)
     return account_mapping
Beispiel #6
0
 def _get_key_filter(self):
     """
     Add new `exact` QueryFilter that filters on the key name.
     If filtering on value, uses the tags summary table to find the key
     """
     key_filter = QueryFilterCollection()
     key_filter.add(QueryFilter(field="key", operation="exact", parameter=self.key))
     composed = key_filter.compose()
     return self.query_filter & composed
Beispiel #7
0
 def test_add_filter(self):
     """Test the add() method using a QueryFilter instance."""
     expected = []
     qf_coll = QueryFilterCollection()
     for _ in range(3):
         query_filter = QueryFilter(self.fake.word(), self.fake.word(),
                                    self.fake.word(), self.fake.word())
         expected.append(query_filter)
         qf_coll.add(query_filter=query_filter)
     self.assertEqual(qf_coll._filters, expected)
Beispiel #8
0
    def _set_operator_specified_filters(self, operator):
        """Set any filters using AND instead of OR."""
        fields = self._mapper._provider_map.get("filters")
        filters = QueryFilterCollection()
        composed_filter = Q()

        for param_name, filt in fields.items():
            prefixed = operator + ":" + param_name
            group_by = self.parameters.get_group_by(prefixed, list())
            filter_ = self.parameters.get_filter(prefixed, list())
            values = list(set(group_by + filter_))  # uniquify the list
            # This is a flexibility feature allowing a user to set
            # a single and: value and still get a result instead
            # of erroring on validation.
            logical_operator = "or" if len(values) < 2 else operator
            if not values or ReportQueryHandler.has_wildcard(values):
                continue
            if isinstance(filt, list):
                # List filters are a complex mix of and/or logic.
                # Each filter in the list must be ORed together
                # regardless of the operator on the item in the filter.
                # Ex:
                # (OR:
                #     (AND:
                #         ('cluster_alias__icontains', 'ni'),
                #         ('cluster_alias__icontains', 'se')
                #     ),
                #     (AND:
                #         ('cluster_id__icontains', 'ni'),
                #         ('cluster_id__icontains', 'se')
                #     )
                # )
                for sub_filt in filt:
                    sub_collection = QueryFilterCollection()
                    for value in values:
                        sub_collection.add(QueryFilter(
                            parameter=value,
                            logical_operator=logical_operator,
                            **sub_filt))
                    composed_filter = composed_filter | sub_collection.compose()
            else:
                values = self._build_custom_filter_list(
                    prefixed, filt.get("custom"), values)
                for value in values:
                    filters.add(QueryFilter(
                        parameter=value,
                        logical_operator=logical_operator,
                        **filt))
        if filters:
            composed_filter = composed_filter & filters.compose()
        return composed_filter
Beispiel #9
0
 def get_org_units(self):
     """Get a list of org keys to build upon.

     Returns:
         (tuple): (org_units, org_id_list) — value dicts for the latest
             record of each org unit active in the report window, and the
             primary-key ids of those records.
     """
     org_units = list()
     org_id_list = list()
     with tenant_context(self.tenant):
         for source in self.data_sources:
             # Grab columns for this query
             org_id = source.get("org_id_column")
             org_path = source.get("org_path_column")
             org_name = source.get("org_name_column")
             level = source.get("level_column")
             account_info = source.get("account_alias_column")
             created_field = source.get("created_time_column")
             # Create filters & Query
             # Rows with a null account alias are org units (not accounts).
             account_filter = QueryFilterCollection()
             no_accounts = QueryFilter(field=f"{account_info}",
                                       operation="isnull",
                                       parameter=True)
             account_filter.add(no_accounts)
             remove_accounts = account_filter.compose()
             org_unit_query = source.get("db_table").objects
             org_unit_query = org_unit_query.filter(remove_accounts)
             # Restrict to org units active within the report window.
             org_unit_query = org_unit_query.exclude(
                 deleted_timestamp__lte=self.start_datetime)
             org_unit_query = org_unit_query.exclude(
                 created_timestamp__gt=self.end_datetime)
             val_list = [org_id, org_name, org_path, level]
             # Keep only the most recent record per org unit id.
             org_unit_query = org_unit_query.order_by(
                 f"{org_id}", f"-{created_field}").distinct(f"{org_id}")
             org_ids = org_unit_query.values_list("id", flat=True)
             if self.access:
                 # RBAC: limit results to org units the caller may read,
                 # matching any segment of the org unit path.
                 acceptable_ous = self.access.get("aws.organizational_unit",
                                                  {}).get("read", [])
                 if acceptable_ous and "*" not in acceptable_ous:
                     allowed_ids_query = source.get("db_table").objects
                     allowed_ids_query = allowed_ids_query.filter(
                         reduce(operator.or_,
                                (Q(org_unit_path__icontains=rbac)
                                 for rbac in acceptable_ous
                                 ))).filter(remove_accounts)
                     allowed_ids = allowed_ids_query.values_list("id",
                                                                 flat=True)
                     org_ids = list(set(org_ids) & set(allowed_ids))
                     org_unit_query = org_unit_query.filter(id__in=org_ids)
             org_id_list.extend(org_ids)
             # Note: you want to collect the org_id_list before you implement the self.query_filter
             # so that way the get_sub_ou list will still work when you do filter[org_unit_id]=OU_002
             if self.query_filter:
                 org_unit_query = org_unit_query.filter(self.query_filter)
             org_unit_query = org_unit_query.values(*val_list)
             org_units.extend(org_unit_query)
     return org_units, org_id_list
Beispiel #10
0
    def _set_or_filters(self):
        """Create a composed filter collection of ORed filters.

        This is designed to handle specific cases in the provider_map
        not to accommodate user input via the API.

        """
        or_collection = QueryFilterCollection()
        for filter_kwargs in self._mapper._report_type_map.get("or_filter", []):
            or_collection.add(QueryFilter(**filter_kwargs))

        return or_collection.compose(logical_operator="or")
Beispiel #11
0
    def test_compose_with_or_operator(self):
        """Test the compose() method with or operator on the compose method."""
        qf_coll = QueryFilterCollection()
        operation = self.fake.word()
        first = QueryFilter(table=self.fake.word(),
                            field=self.fake.word(),
                            operation=operation,
                            parameter=self.fake.word())
        second = QueryFilter(table=self.fake.word(),
                             field=self.fake.word(),
                             operation=operation,
                             parameter=self.fake.word())

        expected = first.composed_Q() | second.composed_Q()
        qf_coll.add(first)
        qf_coll.add(second)
        self.assertEqual(qf_coll.compose(logical_operator="or"), expected)
Beispiel #12
0
    def _get_filter(self, delta=False):  # noqa: C901
        """Create dictionary for filter parameters.

        Args:
            delta (Boolean): Construct timeframe for delta
        Returns:
            (Dict): query filter dictionary

        """
        filters = QueryFilterCollection()
        if not self.parameters.get_filter("value"):
            for source in self.data_sources:
                start_filter, end_filter = self._get_time_based_filters(
                    source, delta)
                filters.add(query_filter=start_filter)
                filters.add(query_filter=end_filter)

        for filter_key in self.SUPPORTED_FILTERS:
            # when filtering on a tag value, the 'enabled' filter is skipped
            if self.parameters.get_filter("value") and filter_key == "enabled":
                continue
            filter_value = self.parameters.get_filter(filter_key)
            if filter_value and not TagQueryHandler.has_wildcard(filter_value):
                filter_obj = self.filter_map.get(filter_key)
                if isinstance(filter_value, bool):
                    filters.add(QueryFilter(**filter_obj))
                elif isinstance(filter_obj, list):
                    for _filt in filter_obj:
                        for item in filter_value:
                            q_filter = QueryFilter(parameter=item, **_filt)
                            filters.add(q_filter)
                else:
                    for item in filter_value:
                        q_filter = QueryFilter(parameter=item, **filter_obj)
                        filters.add(q_filter)
            access = self.parameters.get_access(filter_key)
            filt = self.filter_map.get(filter_key)
            if access and filt:
                self.set_access_filters(access, filt, filters)

        # Update filters that specify 'and' or 'or' in the query parameter
        and_composed_filters = self._set_operator_specified_filters("and")
        or_composed_filters = self._set_operator_specified_filters("or")

        composed_filters = filters.compose()
        # BUG FIX: compose() yields None for an empty collection, and
        # `None & Q()` raises TypeError; only AND the composed filters in
        # when they exist (mirrors the None handling in the Org handler).
        if composed_filters is not None:
            composed_filters = composed_filters & and_composed_filters & or_composed_filters
        else:
            composed_filters = and_composed_filters & or_composed_filters

        LOG.debug(f"_get_filter: {composed_filters}")
        return composed_filters
Beispiel #13
0
 def test_add_params(self):
     """Test the add() method using parameters."""
     kwargs = {
         "table": self.fake.word(),
         "field": self.fake.word(),
         "operation": self.fake.word(),
         "parameter": self.fake.word(),
     }
     expected = QueryFilter(**kwargs)
     qf_coll = QueryFilterCollection()
     qf_coll.add(**kwargs)
     self.assertEqual(qf_coll._filters[0], expected)
Beispiel #14
0
    def _get_filter(self, delta=False):
        """Create dictionary for filter parameters.

        Args:
            delta (Boolean): Construct timeframe for delta
        Returns:
            (Dict): query filter dictionary

        """
        filters = QueryFilterCollection()

        # constrain the query to the report's time window
        for time_filter in self._get_time_based_filters(delta):
            filters.add(query_filter=time_filter)

        return filters
Beispiel #15
0
 def test_compose(self):
     """Test the compose() method."""
     qf_coll = QueryFilterCollection()
     kwargs = {
         "table": self.fake.word(),
         "field": self.fake.word(),
         "operation": self.fake.word(),
         "parameter": self.fake.word(),
     }
     expected = QueryFilter(**kwargs).composed_Q()
     qf_coll.add(**kwargs)
     self.assertEqual(qf_coll.compose(), expected)
Beispiel #16
0
    def test_compose_with_filter_with_and_operator(self):
        """Test the compose() method with and operator on the filter."""
        qf_coll = QueryFilterCollection()
        table = self.fake.word()
        field = self.fake.word()
        operation = self.fake.word()
        filts = []
        for _ in range(2):
            filts.append(QueryFilter(table=table,
                                     field=field,
                                     operation=operation,
                                     parameter=self.fake.word(),
                                     logical_operator='and'))

        expected = filts[0].composed_Q() & filts[1].composed_Q()
        for filt in filts:
            qf_coll.add(filt)
        self.assertEqual(qf_coll.compose(), expected)
Beispiel #17
0
    def _get_gcp_filter(self, delta=False):
        """Create dictionary for filter parameters for GCP.

        For the gcp filters when the time scope is -1 or -2 we remove
        the usage_start & usage_end filters and only use the invoice month.

        Args:
            delta (Boolean): Construct timeframe for delta
        Returns:
            (Dict): query filter dictionary
        """
        filters = QueryFilterCollection()
        start, end = self.start_datetime, self.end_datetime
        if delta:
            # shift the window back one period for delta comparisons
            offset = self._get_date_delta()
            start, end = start - offset, end - offset

        invoice_months = self.dh.gcp_find_invoice_months_in_date_range(
            start.date(), end.date())
        filters.add(QueryFilter(field="invoice_month",
                                operation="in",
                                parameter=invoice_months))

        monthly_scope = (self.parameters.get_filter("time_scope_value")
                         and self.time_scope_value in [-1, -2])
        # Month-level time scopes (-1/-2) rely on the invoice month alone;
        # the usage-date bounds only apply there when computing a delta.
        if not monthly_scope or delta:
            filters.add(query_filter=QueryFilter(field="usage_start",
                                                 operation="gte",
                                                 parameter=start.date()))
            filters.add(query_filter=QueryFilter(field="usage_start",
                                                 operation="lte",
                                                 parameter=end.date()))
        return filters
Beispiel #18
0
 def _get_sub_ou_list(self, data, org_ids):
     """Get a list of the sub org units for a org unit."""
     child_level = data.get("level") + 1
     unit_path = data.get("org_unit_path")
     final_sub_ou_list = []
     with tenant_context(self.tenant):
         for source in self.data_sources:
             # Grab columns for this query
             account_info = source.get("account_alias_column")
             level_column = source.get("level_column")
             org_path = source.get("org_path_column")
             # Build filters: org units only (no account alias), exactly one
             # level below the given unit, and located under the unit's path.
             filters = QueryFilterCollection()
             filters.add(QueryFilter(field=f"{account_info}", operation="isnull", parameter=True))
             filters.add(QueryFilter(field=f"{level_column}", operation="exact", parameter=child_level))
             filters.add(QueryFilter(field=f"{org_path}", operation="icontains", parameter=unit_path))
             # Start querying
             sub_org_query = (
                 source.get("db_table")
                 .objects.filter(filters.compose())
                 .filter(id__in=org_ids)
             )
             final_sub_ou_list.extend(sub_org_query.values_list("org_unit_id", flat=True))
     return final_sub_ou_list
Beispiel #19
0
    def _get_exclusions(self, column):
        """Create dictionary for filter parameters for exclude clause.

        For tags this is to filter items that have null values for the
        specified tag field.

        Args:
            column (str): The tag column being queried

        Returns:
            (Dict): query filter dictionary

        """
        exclusions = QueryFilterCollection()
        exclusions.add(QueryFilter(field=column, operation="isnull", parameter=True))

        composed_exclusions = exclusions.compose()
        LOG.debug(f"_get_exclusions: {composed_exclusions}")
        return composed_exclusions
Beispiel #20
0
 def get_tag_values(self):
     """
     Gets the values associated with a tag when filtering on a value.
     """
     final_data = []
     with tenant_context(self.tenant):
         tag_keys = {}
         for source in self.TAGS_VALUES_SOURCE:
             # apply the same exact-key match to each candidate key field
             # (grouped together via the shared composition_key)
             vals_filter = QueryFilterCollection()
             for key_field in source.get("fields"):
                 vals_filter.add(QueryFilter(field=key_field,
                                             operation="exact",
                                             parameter=self.key,
                                             composition_key="filter_key"))
             combined = self.query_filter & vals_filter.compose()
             tag_keys = list(source.get("db_table").objects.filter(combined))
             converted = self._convert_to_dict(self._value_filter_dict(tag_keys))
             self.append_to_final_data_without_type(final_data, converted)
     self.deduplicate_and_sort(final_data)
     return final_data
Beispiel #21
0
class Forecast:
    """Base forecasting class."""

    # the minimum number of data points needed to use the current month's data.
    # if we have fewer than this many data points, fall back to using the previous month's data.
    #
    # this number is chosen in part because statsmodels.stats.stattools.omni_normtest() needs at least eight data
    # points to test for normal distribution.
    MINIMUM = 8

    # the precision of the floats returned in the forecast response.
    PRECISION = 8

    # report type used when resolving the provider map (see provider_map property).
    REPORT_TYPE = "costs"

    def __init__(self, query_params):  # noqa: C901
        """Class Constructor.

        Instance Attributes:
            - cost_summary_table (Model)
            - aggregates (dict)
            - filters (QueryFilterCollection)
            - query_range (tuple)

        Args:
            query_params (QueryParameters): request query parameters,
                including any RBAC "access" constraints.
        """
        self.dh = DateHelper()
        self.params = query_params

        # select appropriate model based on access
        access = query_params.get("access", {})
        access_key = "default"
        # default to the unrestricted cost summary view
        self.cost_summary_table = self.provider_map.views.get("costs").get(access_key)
        if access:
            # materialized views are keyed by the tuple of access-limited fields
            access_key = tuple(access.keys())
            filter_fields = self.provider_map.provider_map.get("filters")
            materialized_view = self.provider_map.views.get("costs").get(access_key)
            if materialized_view:
                # We found a matching materialized view, use that
                self.cost_summary_table = materialized_view
            else:
                # We have access constraints, but no view to accommodate, default to daily summary table
                self.cost_summary_table = self.provider_map.query_table

        # number of days to forecast: from yesterday through the end of this month
        self.forecast_days_required = (self.dh.this_month_end - self.dh.yesterday).days

        # forecasts use a rolling window
        self.query_range = (self.dh.n_days_ago(self.dh.yesterday, 30), self.dh.yesterday)

        self.filters = QueryFilterCollection()
        self.filters.add(field="usage_start", operation="gte", parameter=self.query_range[0])
        self.filters.add(field="usage_end", operation="lte", parameter=self.query_range[1])

        # filter queries based on access
        if access_key != "default":
            for q_param, filt in filter_fields.items():
                access = query_params.get_access(q_param, list())
                if access:
                    self.set_access_filters(access, filt, self.filters)

    @property
    def provider_map(self):
        """Return the provider map instance.

        NOTE(review): a new map object is constructed on every access;
        repeated use in a loop may warrant caching — confirm cost.
        """
        return self.provider_map_class(self.provider, self.REPORT_TYPE)

    @property
    def total_cost_term(self):
        """Return the provider map value for total cost.

        Looks up "cost_total" in the provider map's "aggregates" mapping.
        """
        return self.provider_map.report_type_map.get("aggregates", {}).get("cost_total")

    @property
    def supplementary_cost_term(self):
        """Return the provider map value for total supplementary cost.

        Looks up "sup_total" in the provider map's "aggregates" mapping.
        """
        return self.provider_map.report_type_map.get("aggregates", {}).get("sup_total")

    @property
    def infrastructure_cost_term(self):
        """Return the provider map value for total infrastructure cost.

        Looks up "infra_total" in the provider map's "aggregates" mapping.
        """
        return self.provider_map.report_type_map.get("aggregates", {}).get("infra_total")

    def predict(self):
        """Define ORM query to run forecast and return prediction.

        Returns:
            The formatted forecast response produced by format_result().
        """
        cost_predictions = {}
        with tenant_context(self.params.tenant):
            # one aggregated row per usage day, constrained by self.filters
            data = (
                self.cost_summary_table.objects.filter(self.filters.compose())
                .order_by("usage_start")
                .values("usage_start")
                .annotate(
                    total_cost=self.total_cost_term,
                    supplementary_cost=self.supplementary_cost_term,
                    infrastructure_cost=self.infrastructure_cost_term,
                )
            )

            for fieldname in ["total_cost", "infrastructure_cost", "supplementary_cost"]:
                # presumably collapses rows sharing a usage_start — see _uniquify_qset
                uniq_data = self._uniquify_qset(data.values("usage_start", fieldname), field=fieldname)
                cost_predictions[fieldname] = self._predict(uniq_data)

            # re-shape from per-cost-term results to per-date results
            cost_predictions = self._key_results_by_date(cost_predictions)
            return self.format_result(cost_predictions)

    def _predict(self, data):
        """Handle pre and post prediction work.

        This function handles arranging incoming data to conform with statsmodels requirements.
        Then after receiving the forecast output, this function handles formatting to conform to
        API response requirements.

        Args:
            data (list) a list of (datetime, float) tuples

        Returns:
            (tuple): (result_dict, rsquared, pvalues) where result_dict maps
                dates to forecast/confidence values; an empty list when
                there are fewer than MINIMUM data points.
        """
        LOG.debug("Forecast input data: %s", data)

        if len(data) < self.MINIMUM:
            LOG.warning(
                "Number of data elements (%s) is fewer than the minimum (%s). Unable to generate forecast.",
                len(data),
                self.MINIMUM,
            )
            return []

        dates, costs = zip(*data)

        # X: day offsets (gaps preserved); Y: float cost values
        X = self._enumerate_dates(dates)
        Y = [float(c) for c in costs]

        # calculate x-values for the prediction range
        pred_x = [i for i in range(X[-1] + 1, X[-1] + 1 + self.forecast_days_required)]

        # run the forecast
        results = self._run_forecast(X, Y, to_predict=pred_x)

        result_dict = {}
        for i, value in enumerate(results.prediction):
            # extrapolate confidence intervals to align with prediction.
            # this reduces the confidence interval below 95th percentile, but is a better UX.
            if i < len(results.confidence_lower):
                lower = results.confidence_lower[i]
            else:
                lower = results.confidence_lower[-1] + results.slope * (i - len(results.confidence_lower))

            if i < len(results.confidence_upper):
                upper = results.confidence_upper[i]
            else:
                upper = results.confidence_upper[-1] + results.slope * (i - len(results.confidence_upper))

            # ensure that there are no negative numbers.
            result_dict[self.dh.today.date() + timedelta(days=i)] = {
                "total_cost": max((value, 0)),
                "confidence_min": max((lower, 0)),
                "confidence_max": max((upper, 0)),
            }

        return (result_dict, results.rsquared, results.pvalues)

    def _enumerate_dates(self, date_list):
        """Given a list of dates, return a list of integers.

        Works in conjunction with _remove_outliers(): any gaps created by
        _remove_outliers() are preserved so the integers used for the
        X-axis stay aligned with the calendar.

        Example:
            If _remove_outliers() returns {"2000-01-01": 1.0, "2000-01-03": 1.5}
            then _enumerate_dates() returns [0, 2]
        """
        first = datetime.combine(date_list[0], self.dh.midnight)
        last = datetime.combine(date_list[-1], self.dh.midnight)
        wanted = set(date_list)
        return [index
                for index, day in enumerate(self.dh.list_days(first, last))
                if day.date() in wanted]

    def _remove_outliers(self, data):
        """Remove outliers from our dataset before predicting.

        We use a box plot method without plotting the box.

        Args:
            data (dict): mapping of date to cost value

        Returns:
            (dict): the entries of ``data`` whose values fall inside
                [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; ``data`` unchanged when empty.
        """
        values = list(data.values())
        if values:
            third_quartile, first_quartile = np.percentile(values, [Decimal(75), Decimal(25)])
            interquartile_range = third_quartile - first_quartile

            # NOTE(review): Decimal(1.5) arithmetic here assumes the input
            # values are Decimals (so the quartiles are too); mixing with
            # plain floats would raise TypeError — confirm callers' types.
            upper_boundary = third_quartile + (Decimal(1.5) * interquartile_range)
            lower_boundary = first_quartile - (Decimal(1.5) * interquartile_range)

            return {key: value for key, value in data.items() if (value >= lower_boundary and value <= upper_boundary)}
        return data

    def _key_results_by_date(self, results, check_term="total_cost"):
        """Take results formatted by cost type, and return results keyed by date."""
        results_by_date = defaultdict(dict)
        date_based_dict = results[check_term][0] if results[check_term] else []
        for date in date_based_dict:
            for cost_term in results:
                if results[cost_term][0].get(date):
                    results_by_date[date][cost_term] = (
                        results[cost_term][0][date],
                        {"rsquared": results[cost_term][1]},
                        {"pvalues": results[cost_term][2]},
                    )
        return results_by_date

    def format_result(self, results):
        """Format results for API consumption.

        Args:
            results (dict) date -> {cost_term: (value_dict, {"rsquared": ...}, {"pvalues": ...})}
                as produced by _key_results_by_date()

        Returns:
            (list) one {"date": ..., "values": [...]} dict per date, excluding
            any date past the end of the current month
        """
        f_format = f"%.{self.PRECISION}f"  # avoid converting floats to e-notation
        units = "USD"

        def build_section(term):
            # Build one cost section (infrastructure/supplementary/cost).
            # NOTE(review): rsquared is string-formatted but pvalues is passed
            # through raw here — confirm that asymmetry is intentional.
            values, rsquared, pvalues = term
            return {
                "total": {"value": round(values["total_cost"], 3), "units": units},
                "confidence_max": {"value": round(values["confidence_max"], 3), "units": units},
                # clamp the lower bound; a negative cost prediction is meaningless
                "confidence_min": {"value": round(max(values["confidence_min"], 0), 3), "units": units},
                "rsquared": {"value": f_format % rsquared["rsquared"], "units": None},
                "pvalues": {"value": pvalues["pvalues"], "units": None},
            }

        response = []
        for key in results:
            # forecasts only cover the current month; skip anything past its end
            if key > self.dh.this_month_end.date():
                continue
            row = results[key]
            response.append(
                {
                    "date": key,
                    "values": [
                        {
                            "date": key,
                            "infrastructure": build_section(row["infrastructure_cost"]),
                            "supplementary": build_section(row["supplementary_cost"]),
                            "cost": build_section(row["total_cost"]),
                        }
                    ],
                }
            )
        return response

    def _run_forecast(self, x, y, to_predict=None):
        """Fit an OLS model and wrap it in a LinearForecastResult.

        Args:
            x (list) a list of exogenous variables
            y (list) a list of endogenous variables
            to_predict (list) a list of exogenous variables used in the forecast results

        Note:
            both x and y MUST be the same number of elements

        Returns:
            (LinearForecastResult) wrapper exposing prediction values,
            confidence interval bounds, R-squared, and P-values of the fit
        """
        # add_constant() prepends the intercept column statsmodels expects
        exog = sm.add_constant(x)
        predict_exog = sm.add_constant(to_predict)
        fitted_model = sm.OLS(y, exog).fit()
        return LinearForecastResult(fitted_model, exog=predict_exog)

    def _uniquify_qset(self, qset, field="total_cost"):
        """Take a QuerySet list, sum costs within the same day, and arrange it into a list of tuples.

        Args:
            qset (QuerySet)
            field (str) - field name in the QuerySet to be summed

        Returns:
            [(date, cost), ...]
        """
        # FIXME: this QuerySet->dict->list conversion probably isn't ideal.
        # FIXME: there's probably a way to aggregate multiple sources by date using just the ORM.
        result = defaultdict(Decimal)
        for item in qset:
            result[item.get("usage_start")] += Decimal(item.get(field, 0.0))
        result = self._remove_outliers(result)
        out = [(k, v) for k, v in result.items()]
        return out

    def set_access_filters(self, access, filt, filters):
        """Set access filters to ensure RBAC restrictions adhere to user's access and filters.

        Args:
            access (list) the list containing the users relevant access
            filt (list or dict) contains the filters to be updated
            filters (QueryFilterCollection) the filter collection to add the new filters to
        returns:
            None
        """
        # Normalize to a list so a single dict and a list of dicts take one path.
        filter_specs = filt if isinstance(filt, list) else [filt]
        for spec in filter_specs:
            spec["operation"] = "in"  # intentionally mutates the caller's dict
            filters.add(QueryFilter(parameter=access, **spec))
# Beispiel #22
    def test_add_bad(self):
        """Test the add() method using invalid values."""
        collection = QueryFilterCollection()

        # add() rejects bare positional words; it expects filter kwargs/objects.
        with self.assertRaises(AttributeError):
            collection.add(self.fake.word(), self.fake.word(), self.fake.word())
# Beispiel #23
 def _get_key_filter(self):
     """Return self.query_filter ANDed with an exact filter on the key name."""
     key_filters = QueryFilterCollection()
     key_filter = QueryFilter(field="key", operation="exact", parameter=self.key)
     key_filters.add(key_filter)
     return self.query_filter & key_filters.compose()
# Beispiel #24
class Forecast:
    """Base forecasting class."""

    # the minimum number of data points needed to use the current month's data.
    # if we have fewer than this many data points, fall back to using the previous month's data.
    #
    # this number is chosen in part because statsmodels.stats.stattools.omni_normtest() needs at least eight data
    # points to test for normal distribution.
    MINIMUM = 8

    # the precision of the floats returned in the forecast response.
    PRECISION = 8

    REPORT_TYPE = "costs"

    dh = DateHelper()

    def __init__(self, query_params):  # noqa: C901
        """Class Constructor.

        Args:
            query_params (QueryParameters-like) read for "access" (RBAC data)
                and per-parameter access lists via get_access()

        Instance Attributes:
            - cost_summary_table (Model)
            - aggregates (dict)
            - filters (QueryFilterCollection)
            - query_range (tuple)
        """
        self.params = query_params

        # select appropriate model based on access
        access = query_params.get("access", {})
        access_key = "default"
        self.cost_summary_table = self.provider_map.views.get("costs").get(
            access_key)
        if access:
            # the tuple of accessed resource types selects a matching
            # materialized view, if one exists for that combination
            access_key = tuple(access.keys())
            filter_fields = self.provider_map.provider_map.get("filters")
            materialized_view = self.provider_map.views.get("costs").get(
                access_key)
            if materialized_view:
                # We found a matching materialized view, use that
                self.cost_summary_table = materialized_view
            else:
                # We have access constraints, but no view to accomodate, default to daily summary table
                self.cost_summary_table = self.provider_map.report_type_map.get(
                    "tables", {}).get("query")

        current_day_of_month = self.dh.today.day
        yesterday = (self.dh.today - timedelta(days=1)).day
        last_day_of_month = self.dh.this_month_end.day

        # forecast enough days to cover the remainder of the current month
        if current_day_of_month == 1:
            self.forecast_days_required = last_day_of_month
        else:
            self.forecast_days_required = last_day_of_month - yesterday

        # early in the month there are fewer than MINIMUM data points,
        # so fall back to using last month's data (see MINIMUM above)
        if current_day_of_month <= self.MINIMUM:
            self.query_range = (self.dh.last_month_start,
                                self.dh.last_month_end)
        else:
            self.query_range = (self.dh.this_month_start,
                                self.dh.today - timedelta(days=1))

        self.filters = QueryFilterCollection()
        self.filters.add(field="usage_start",
                         operation="gte",
                         parameter=self.query_range[0])
        self.filters.add(field="usage_end",
                         operation="lte",
                         parameter=self.query_range[1])

        # filter queries based on access
        if access_key != "default":
            # access_key != "default" implies access was truthy above,
            # so filter_fields is guaranteed to be bound here
            for q_param, filt in filter_fields.items():
                access = query_params.get_access(q_param, list())
                if access:
                    self.set_access_filters(access, filt, self.filters)

    @property
    def provider_map(self):
        """Return a provider map instance for this provider and report type."""
        map_class = self.provider_map_class
        return map_class(self.provider, self.REPORT_TYPE)

    @property
    def total_cost_term(self):
        """Return the provider map aggregate expression for total cost."""
        aggregates = self.provider_map.report_type_map.get("aggregates", {})
        return aggregates.get("cost_total")

    @property
    def supplementary_cost_term(self):
        """Return the provider map aggregate expression for total supplementary cost."""
        aggregates = self.provider_map.report_type_map.get("aggregates", {})
        return aggregates.get("sup_total")

    @property
    def infrastructure_cost_term(self):
        """Return the provider map aggregate expression for total infrastructure cost."""
        aggregates = self.provider_map.report_type_map.get("aggregates", {})
        return aggregates.get("infra_total")

    def predict(self):
        """Define ORM query to run forecast and return prediction.

        Returns:
            (list) the formatted API response produced by format_result()
        """
        cost_predictions = {}
        with tenant_context(self.params.tenant):
            # one query summarized per day, annotated with all three cost aggregates
            data = (self.cost_summary_table.objects.filter(
                self.filters.compose()).order_by("usage_start").values(
                    "usage_start").annotate(
                        total_cost=self.total_cost_term,
                        supplementary_cost=self.supplementary_cost_term,
                        infrastructure_cost=self.infrastructure_cost_term,
                    ))
            # slice the annotated queryset into one series per cost term
            total_cost_data = data.values("usage_start", "total_cost")
            infra_cost_data = data.values("usage_start", "infrastructure_cost")
            supp_cost_data = data.values("usage_start", "supplementary_cost")
            # run an independent forecast for each cost term
            cost_predictions["total_cost"] = self._predict(
                self._uniquify_qset(total_cost_data, field="total_cost"))
            cost_predictions["infrastructure_cost"] = self._predict(
                self._uniquify_qset(infra_cost_data,
                                    field="infrastructure_cost"))
            cost_predictions["supplementary_cost"] = self._predict(
                self._uniquify_qset(supp_cost_data,
                                    field="supplementary_cost"))
            # re-key from cost term to date, then shape for the API
            cost_predictions = self._key_results_by_date(cost_predictions)
            return self.format_result(cost_predictions)

    def _predict(self, data):
        """Handle pre and post prediction work.

        Args:
            data (list) a list of (datetime, float) tuples

        Returns:
            (tuple) (result_dict, rsquared, pvalues) where result_dict maps
                forecast dates to cost/confidence values, or
            (list) an empty list when there is insufficient data to forecast
        """
        LOG.debug("Forecast input data: %s", data)

        # a line cannot be fit through fewer than two points
        if len(data) < 2:
            LOG.warning(
                "Unable to calculate forecast. Insufficient data for %s.",
                self.params.tenant)
            return []

        if len(data) < self.MINIMUM:
            LOG.warning("Number of data elements is fewer than the minimum.")

        # arrange the data into a form that statsmodels will accept.
        # NOTE(review): encoding dates as YYYYMMDD integers is not continuous
        # across month boundaries (e.g. ...0131 -> ...0201 jumps by 70) —
        # confirm this is acceptable for the regression.
        dates, costs = zip(*data)
        X = [int(d.strftime("%Y%m%d")) for d in dates]
        Y = [float(c) for c in costs]
        # run the forecast
        results = self._run_forecast(X, Y)
        result_dict = {}
        # predictions are keyed from today forward, one entry per day
        for i, value in enumerate(results.prediction):
            result_dict[self.dh.today.date() + timedelta(days=i)] = {
                "total_cost": value,
                "confidence_min": results.confidence_lower[i],
                "confidence_max": results.confidence_upper[i],
            }
        # extrapolate extra days so the forecast covers the whole month
        result_dict = self._add_additional_data_points(result_dict,
                                                       results.slope)

        return (result_dict, results.rsquared, results.pvalues)

    def _remove_outliers(self, data):
        """Remove outliers from our dateset before predicting.

        We use a box plot method without plotting the box.
        """
        values = list(data.values())
        if values:
            third_quartile, first_quartile = np.percentile(
                values, [Decimal(75), Decimal(25)])
            interquartile_range = third_quartile - first_quartile

            upper_boundary = third_quartile + (Decimal(1.5) *
                                               interquartile_range)
            lower_boundary = first_quartile - (Decimal(1.5) *
                                               interquartile_range)

            return {
                key: value
                for key, value in data.items()
                if (value >= lower_boundary and value <= upper_boundary)
            }
        return data

    def _key_results_by_date(self, results, check_term="total_cost"):
        """Take results formatted by cost type, and return results keyed by date."""
        results_by_date = defaultdict(dict)
        date_based_dict = results[check_term][0] if results[check_term] else []
        for date in date_based_dict:
            for cost_term in results:
                results_by_date[date][cost_term] = (
                    results[cost_term][0][date],
                    {
                        "rsquared": results[cost_term][1]
                    },
                    {
                        "pvalues": results[cost_term][2]
                    },
                )
        return results_by_date

    def format_result(self, results):
        """Format results for API consumption.

        Args:
            results (dict) date -> {cost_term: (value_dict, {"rsquared": ...}, {"pvalues": ...})}
                as produced by _key_results_by_date()

        Returns:
            (list) one {"date": ..., "values": [...]} dict per date, excluding
            any date past the end of the current month
        """
        f_format = f"%.{self.PRECISION}f"  # avoid converting floats to e-notation
        units = "USD"

        def build_section(term):
            # Build one cost section (infrastructure/supplementary/cost).
            values, rsquared, pvalues = term
            return {
                "total": {"value": round(values["total_cost"], 3), "units": units},
                "confidence_max": {"value": round(values["confidence_max"], 3), "units": units},
                # clamp the lower bound; a negative cost prediction is meaningless
                "confidence_min": {"value": round(max(values["confidence_min"], 0), 3), "units": units},
                "rsquared": {"value": f_format % rsquared["rsquared"], "units": None},
                "pvalues": {"value": f_format % pvalues["pvalues"], "units": None},
            }

        response = []
        for key in results:
            # forecasts only cover the current month; skip anything past its end
            if key > self.dh.this_month_end.date():
                continue
            row = results[key]
            response.append(
                {
                    "date": key,
                    "values": [
                        {
                            "date": key,
                            "infrastructure": build_section(row["infrastructure_cost"]),
                            "supplementary": build_section(row["supplementary_cost"]),
                            "cost": build_section(row["total_cost"]),
                        }
                    ],
                }
            )
        return response

    def _add_additional_data_points(self, results, slope):
        """Add extra entries to make sure we predict the full month."""
        additional_days_needed = 0
        dates = results.keys()
        last_predicted_date = max(dates)
        days_already_predicted = len(dates)

        last_predicted_cost = results[last_predicted_date]["total_cost"]
        last_predicted_max = results[last_predicted_date]["confidence_max"]
        last_predicted_min = results[last_predicted_date]["confidence_min"]

        if days_already_predicted < self.forecast_days_required:
            additional_days_needed = self.forecast_days_required - days_already_predicted

        for i in range(1, additional_days_needed + 1):
            results[last_predicted_date + timedelta(days=i)] = {
                "total_cost": last_predicted_cost + (slope * i),
                "confidence_min": last_predicted_min + (slope * i),
                "confidence_max": last_predicted_max + (slope * i),
            }

        return results

    def _run_forecast(self, x, y):
        """Fit an OLS model to the given series.

        Args:
            x (list) a list of exogenous variables
            y (list) a list of endogenous variables

        Note:
            both x and y MUST be the same number of elements

        Returns:
            (LinearForecastResult) wrapper exposing prediction values,
            confidence interval bounds, R-squared, and P-values of the fit
        """
        fitted_model = sm.OLS(y, x).fit()
        return LinearForecastResult(fitted_model)

    def _uniquify_qset(self, qset, field="total_cost"):
        """Take a QuerySet list, sum costs within the same day, and arrange it into a list of tuples.

        Args:
            qset (QuerySet)

        Returns:
            [(date, cost), ...]
        """
        # FIXME: this QuerySet->dict->list conversion probably isn't ideal.
        # FIXME: there's probably a way to aggregate multiple sources by date using just the ORM.
        result = defaultdict(Decimal)
        for item in qset:
            result[item.get("usage_start")] += Decimal(item.get(field, 0.0))
        result = self._remove_outliers(result)
        out = [(k, v) for k, v in result.items()]
        return out

    def set_access_filters(self, access, filt, filters):
        """Set access filters to ensure RBAC restrictions adhere to user's access and filters.

        Args:
            access (list) the list containing the users relevant access
            filt (list or dict) contains the filters to be updated
            filters (QueryFilterCollection) the filter collection to add the new filters to
        returns:
            None
        """
        # Normalize to a list so a single dict and a list of dicts take one path.
        filter_specs = filt if isinstance(filt, list) else [filt]
        for spec in filter_specs:
            spec["operation"] = "in"  # intentionally mutates the caller's dict
            filters.add(QueryFilter(parameter=access, **spec))