Exemple #1
0
 def __init__(self,
              data,
              domain=None,
              max_value=None,
              project=None,
              *args,
              **kwargs):
     """
     Initialise the line and derive or attach its domain.

     Args:
         data (pd.Series): forwarded to the parent initialiser. When the stored data
                           holds exactly one value and a domain is given, that value
                           is repeated for every entry of the domain.
         domain (TimeIndexSeries): domain of the line, defaults to None. When omitted,
                                   a DateDomain is built from the first and last index
                                   entry of the stored data.
         max_value (int): specifies the maximum amount of units that can be reached in a phase,
                          defaults to None.
         project (str): specifies to which project the line belongs
         *args: forwarded to the parent initialiser.
         **kwargs: forwarded to the parent initialiser.
     """
     super().__init__(data=data, *args, **kwargs)
     # Test against None explicitly: an explicitly passed domain must never be
     # replaced silently just because it evaluates as falsy.
     if domain is not None:
         self.domain = domain
         if len(self.data) == 1:
             # Stretch the single value along the whole domain.
             self.data = pd.Series(data=self.data.iloc[0],
                                   index=self.domain.domain)
     else:
         # NOTE(review): freq is read from the raw `data` argument, not from
         # self.data -- presumably both share the same index; confirm.
         self.domain = DateDomain(self.data.index[0],
                                  self.data.index[-1],
                                  freq=data.index.freq)
     self.max_value = max_value
     self.project = project
    def calculate_poc_ideal_rate_line(self):
        """Build the ideal percentage-of-completion (poc) rate line.

        The realised poc rate line is the starting point. When units remain, it is
        extended with a constant daily rate:

        * deadline still ahead: the remaining units are spread over the days left
          until the end of the target rate line, minus the holidays counted in
          that range;
        * deadline reached or passed: the remaining units are spread over 7 days,
          on a domain ending a week from now.

        When no units remain the realised line is used unchanged. In all cases
        the holiday periods not already covered by the realised line are added
        to the result afterwards.

        Returns:
            poc ideal rate line (object), named "poc_ideal_indicator"
        """
        realised = self.calculate_poc_real_rate_line()
        target = self.calculate_target_rate_line()
        units_to_go = realised.distance_to_max_value()
        days_to_go = realised.daysleft(end=target.domain.end)

        # Both comparisons are evaluated up front, regardless of each other.
        work_remains = units_to_go > 0
        time_remains = days_to_go > 0

        extension = None
        if work_remains and time_remains:
            # normal case: work left and the deadline is still ahead
            full_window = DateDomain(
                begin=realised.domain.end, end=target.domain.end
            )
            holidays = self.count_holidays_in_date_range(
                self.holiday_periods, full_window.domain
            )
            working_window = DateDomain(
                begin=realised.domain.end,
                end=target.domain.end - timedelta(holidays),
            )
            daily_rate = units_to_go / (days_to_go - holidays)
            extension = TimeseriesLine(data=daily_rate, domain=working_window)
        elif work_remains and days_to_go <= 0:
            # exception: work left but the deadline has passed,
            # production needs to be finished within a week
            daily_rate = units_to_go / 7
            catch_up_window = DateDomain(
                begin=realised.domain.end,
                end=pd.Timestamp.now() + timedelta(7),
            )
            extension = TimeseriesLine(data=daily_rate, domain=catch_up_window)

        if extension is None:
            # no more work to do, so ideal line == realised line
            line = realised
        else:
            line = realised.append(extension, skip=1)

        remaining_holidays = self.slice_holiday_periods(
            holiday_periods=self.holiday_periods,
            periods_to_remove=realised.domain.domain,
        )
        line = self.add_holiday_periods_to_line(line, remaining_holidays)
        line.name = "poc_ideal_indicator"
        line.max_value = self.phase_data["total_units"]
        return line
Exemple #3
0
 def slice(self, begin=None, end=None, **kwargs):
     """
     Cut a sub-line out of this line.

     Args:
         begin: first index label to keep, defaults to the begin of the domain.
         end: last index label to keep, defaults to the end of the domain.
         **kwargs: forwarded to the TimeseriesLine constructor.

     Returns: TimeseriesLine with the data between begin and end.
     """
     begin = self.domain.begin if begin is None else begin
     end = self.domain.end if end is None else end
     window = self.make_series()[begin:end]
     return TimeseriesLine(window, DateDomain(begin, end), **kwargs)
Exemple #4
0
    def create_line(value):
        """
        Creates a TimeseriesLine from a single data point, on today's date.

        Args:
            value: value to be made into a TimeseriesLine

        Returns: a TimeseriesLine with index today and one value

        """
        # `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0.
        # The clock is sampled once so begin and end of the domain are identical.
        today = pd.Timestamp.today()
        domain = DateDomain(today, today)
        return TimeseriesLine(domain=domain, data=value)
Exemple #5
0
    def extrapolate(self, data_partition=None, **kwargs):
        """
        Fit a straight line through the data points of this line.

        Args:
            data_partition: passed to linear_regression; fraction of data to use
                to calculate the extrapolation.
            **kwargs: forwarded to the LinearLine constructor.

        Returns: LinearLine with the fitted slope and intercept, on a domain
            running from the first to the last index entry of the data.

        """
        slope, intercept = self.linear_regression(data_partition)
        first_date, last_date = self.data.index[0], self.data.index[-1]
        return LinearLine(
            slope=slope,
            intercept=intercept,
            domain=DateDomain(first_date, last_date),
            **kwargs,
        )
    def _add_holiday_period(self, line, holiday_period):
        """
        Helper function to insert a single holiday period into a TimeseriesLine.

        The line is cut at the earliest entry of the holiday period. A line of
        zeros spanning the holiday period is appended to the part before the cut,
        and the part after the cut is shifted by the length of the holiday period
        and appended after that.

        Args:
            line: TimeseriesLine to insert the holiday period into.
            holiday_period: the dates of the holiday period; indexed with [0] and
                [-1] and queried with .min() and len().

        Returns: TimeseriesLine with the holiday period inserted.

        """
        zeros = TimeseriesLine(
            domain=DateDomain(begin=holiday_period[0], end=holiday_period[-1]), data=0
        )
        cut = holiday_period.min()
        head = line.slice(end=cut)
        tail = line.slice(begin=cut).translate_x(len(holiday_period))
        # skip=1 with skip_base=True drops the last point of the head, which
        # lies on the cut date.
        head_with_holiday = head.append(zeros, skip=1, skip_base=True)
        return head_with_holiday.append(tail)
Exemple #7
0
    def focus_domain(self, lower_treshold=None, upper_treshold=np.inf):
        """
        Focus the domain of a line between extreme values.

        Args:
            lower_treshold: Lowest y-value to shrink the domain to. When omitted
                there is no lower bound and the current intercept is kept;
                otherwise the threshold itself becomes the new intercept.
            upper_treshold: Highest y-value to shrink the domain to. Defaults to
                infinity (no upper bound).

        Returns: New line with shrunk domain.

        """
        # `np.inf` is used instead of the `np.Inf` alias, which NumPy 2.0 removed.
        if lower_treshold is None:
            lower_treshold = -np.inf
            intersect = self.intercept
        else:
            intersect = lower_treshold
        series = self.make_series()
        within_bounds = (series >= lower_treshold) & (series <= upper_treshold)
        focused_series = series[within_bounds]
        domain = DateDomain(focused_series.index.min(),
                            focused_series.index.max())
        return LinearLine(slope=self.slope, intercept=intersect, domain=domain)
Exemple #8
0
class TimeseriesLine(PointLine):
    """
    A point line is a collection of datapoints on a shared datetime index.
    """
    def __init__(self,
                 data,
                 domain=None,
                 max_value=None,
                 project=None,
                 *args,
                 **kwargs):
        """
        Initialise the line and derive or attach its domain.

        Args:
            data (pd.Series): forwarded to the parent initialiser. When the stored data
                              holds exactly one value and a domain is given, that value
                              is repeated for every entry of the domain.
            domain (TimeIndexSeries): domain of the line, defaults to None. When omitted,
                                      a DateDomain is built from the first and last index
                                      entry of the stored data.
            max_value (int): specifies the maximum amount of units that can be reached in a phase,
                             defaults to None.
            project (str): specifies to which project the line belongs
            *args: forwarded to the parent initialiser.
            **kwargs: forwarded to the parent initialiser.
        """
        super().__init__(data=data, *args, **kwargs)
        # Test against None explicitly: an explicitly passed domain must never be
        # replaced silently just because it evaluates as falsy.
        if domain is not None:
            self.domain = domain
            if len(self.data) == 1:
                # Stretch the single value along the whole domain.
                self.data = pd.Series(data=self.data.iloc[0],
                                      index=self.domain.domain)
        else:
            # NOTE(review): freq is read from the raw `data` argument, not from
            # self.data -- presumably both share the same index; confirm.
            self.domain = DateDomain(self.data.index[0],
                                     self.data.index[-1],
                                     freq=data.index.freq)
        self.max_value = max_value
        self.project = project

    def make_series(self):
        """
        Make pandas series of line.
        Returns: pandas.Series of data in Line.

        """
        filled_data = self.data.reindex(self.domain.domain, fill_value=0)
        return filled_data

    def extrapolate(self, data_partition=None, **kwargs):
        """
        Fit a straight line through the data points of this line.

        Args:
            data_partition: passed to linear_regression; fraction of data to use
                to calculate the extrapolation.
            **kwargs: forwarded to the LinearLine constructor.

        Returns: LinearLine with the fitted slope and intercept, on a domain
            running from the first to the last index entry of the data.

        """
        slope, intercept = self.linear_regression(data_partition)
        first_date, last_date = self.data.index[0], self.data.index[-1]
        return LinearLine(
            slope=slope,
            intercept=intercept,
            domain=DateDomain(first_date, last_date),
            **kwargs,
        )

    def integrate(self):
        """
        Return the running total of this line as a new TimeseriesLine.

        https://en.wikipedia.org/wiki/Numerical_integration
        """
        # Temporarily use old cumsum method to mimic old implementation
        running_total = self.make_series().cumsum()
        return TimeseriesLine(data=running_total)

    def append(self, other, skip=0, skip_base=False, **kwargs):
        """

        Args:
            other: Instance of timeseries line of which the values will be added to the end of the current line.
            skip: keyword argument to skip start of index of input line, to be able to append lines that have partially
            overlapping indices.

        Returns:
            A new timeseries line
        """
        if self.domain.end > other.domain.begin:
            raise NotImplementedError(
                "You can only add lines that have a higher index than the line in the object"
            )

        if skip_base:
            series = self.make_series()[:-skip]
            other_series = other.make_series()
        else:
            series = self.make_series()
            other_series = other.make_series()[skip:]

        intersect = series.index.intersection(other_series.index)
        if len(intersect):
            raise ValueError(
                f"Cannot append Lines that have overlapping indices: {intersect}"
            )

        return TimeseriesLine(series.add(other_series, fill_value=0), **kwargs)

    def translate_x(self, delta=0, **kwargs):
        data = self.data
        data.index = data.index + timedelta(days=delta)
        domain = self.domain.shift(delta)
        return TimeseriesLine(data=data, domain=domain, **kwargs)

    def slice(self, begin=None, end=None, **kwargs):
        """
        Cut a sub-line out of this line.

        Args:
            begin: first index label to keep, defaults to the begin of the domain.
            end: last index label to keep, defaults to the end of the domain.
            **kwargs: forwarded to the TimeseriesLine constructor.

        Returns: TimeseriesLine with the data between begin and end.
        """
        begin = self.domain.begin if begin is None else begin
        end = self.domain.end if end is None else end
        window = self.make_series()[begin:end]
        return TimeseriesLine(window, DateDomain(begin, end), **kwargs)

    def linear_regression(self, data_partition=None):
        """
        Given a set of points, do a linear regression to extrapolate future data
        """
        if data_partition:
            shift = int(len(self.domain) * data_partition)
            time_shift = relativedelta(days=shift)
            start = self.data.index[0] + time_shift
            end = self.data.index[-1]
            data = self.data[start:end]
            index = list(range(shift, len(data) + shift))
        else:
            index = list(range(0, len(self.data)))
            data = self.data
        if len(data) >= 2:
            slope, intersect = np.polyfit(index, data, 1)
        else:
            slope = 0
            intersect = 0
        return slope, intersect

    #  this function requires a line based on speed, not distance
    def get_line_aggregate(
        self,
        freq="MS",
        aggregate_type="series",
        loffset="0",
        closed="left",
        index_as_str=False,
    ):
        """This function takes the line specified in the object and aggregates its values to a chosen type of output.
        The output is either a cumulative series of the binned sums, or the single sum / mean of the bin
        returned by `period_for_output`.

        Args:
            freq (str): type of frequency for aggregation. Defaults to 'MS'.
            aggregate_type (str): 'series', 'value_sum' or 'value_mean'. Defaults to 'series'.
            loffset (str): multiplier that is prefixed to `freq` to build the offset added to the bin labels.
                Defaults to '0'.
            closed (str): boundary that belongs to the current bin. Defaults to 'left'.
            index_as_str (bool): determines if the index of the aggregated series is formatted to string.
                Only applied for aggregate_type 'series'. Defaults to False.

        Raises:
            NotImplementedError: this type of aggregation is not implemented.

        Returns:
            aggregate (pd.Series or int or float)
        """
        if aggregate_type not in ("series", "value_sum", "value_mean"):
            raise NotImplementedError(
                "No method implemented for aggregate type {}".format(
                    aggregate_type))

        if aggregate_type == "series":
            series = self.make_normalised_series(maximum=self.max_value,
                                                 percentage=True)
        else:
            series = self.make_series()

        bins = series.resample(freq, closed=closed)
        binned = bins.mean() if aggregate_type == "value_mean" else bins.sum()

        # The `loffset` keyword of resample was deprecated in pandas 1.1 and
        # removed in 2.0. Adding the offset to the labels of the result is the
        # documented replacement; like the old keyword it only applies to a
        # non-empty DatetimeIndex.
        label_offset = pd.tseries.frequencies.to_offset(loffset + freq)
        if isinstance(binned.index, pd.DatetimeIndex) and len(binned.index) > 0:
            binned.index = binned.index + label_offset

        if aggregate_type == "series":
            aggregate = binned.cumsum()
            if index_as_str:
                aggregate.index = aggregate.index.format()
            return aggregate

        # Single value: pick the bin of the upcoming period, 0 when the line
        # has no data point on that date.
        period_for_output = self.period_for_output(freq)
        if period_for_output in series.index:
            return binned[period_for_output]
        return 0

    def period_for_output(self, freq: str):
        """This function returns the start of next week based on W-MON or of next month based on MS.

        Args:
            freq (str): frequency for which the index of the next period has to be returned.

        Raises:
            NotImplementedError: there is no method implemented for this type of frequency.

        Returns:
            index for next period (pd.Timestamp, at midnight)
        """
        # Read the clock once, so year, month and weekday always belong to the
        # same moment, also when called right at a day or month boundary.
        now = pd.Timestamp.now()
        if freq == "MS":
            # first day of the current month, moved one month ahead
            period = pd.Timestamp(now.year, now.month, 1) + pd.DateOffset(months=1)
        elif freq == "W-MON":
            # weekday() is 0 on Monday, so a Monday maps to the Monday a week later
            period = now.normalize() + pd.Timedelta(days=7 - now.weekday())
        else:
            raise NotImplementedError(
                "There is no output period implemented for this frequency {}".
                format(freq))
        return period

    def distance_to_max_value(self, line_type="rate"):
        """This function calculates the distance between the end of the line and the maximum value specified for the line.

        Args:
            line_type (str): when the line type is rate (line is expressed in rates), first the integral of the line is calculated
            before the distance is calculated. For line type cumulative, the distance is directly calculated. Defaults to 'rate'.

        Raises:
            NotImplementedError: when there is no method implemented for given line_type.

        Returns:
            distance (float)
        """
        if self.max_value:
            if line_type == "rate":
                distance = self.max_value - self.integrate(
                ).get_most_recent_point()
            elif line_type == "cumulative":
                distance = self.max_value - self.get_most_recent_point()
            else:
                raise NotImplementedError(
                    "There is no method implemented for line_type {}".format(
                        line_type))
        else:
            raise ValueError
        return distance

    def daysleft(self, end=None, slope=None):
        """This function calculates the number of days between the date of the latest data point in the line and
        the date of the intended deadline.

        Args:
            end (str or pd.DateTimeIndex): date of the intended deadline. Defaults to None.
            slope (float or int): the daily rate to be applied between the date of the latest data point and the date
            of the intended deadline. Defaults to None.

        Returns:
            daysleft (int)
        """
        if end:
            daysleft = (pd.to_datetime(end) - self.domain.end).days
        elif (slope is not None) & (self.distance_to_max_value() is not None):
            daysleft = int(self.distance_to_max_value() / slope)
        else:
            daysleft = None
        return daysleft

    def resample(self, freq="MS", method="sum", label="left", closed="left"):
        """This function takes the line specified in the object and resamples its values.
        The output is a TimeseriesLine.

        Args:
            freq (str): type of frequency for aggregation. Options are 'MS', 'W-MON' and 'YS', default is 'MS'.
            method (str): the method used to aggregate, passed to `agg` on the resampled series.
            Choose 'sum' or 'mean', default is 'sum'
            label (str): bin edge used to label the bin. Defaults to 'left'.
            closed (str): boundary that belongs to the current bin. Defaults to 'left'.

        Raises:
            NotImplementedError: when `freq` is not one of the supported frequencies.

        Returns:
            TimeseriesLine holding the aggregated series
        """
        supported_frequencies = ("MS", "W-MON", "YS")
        if freq not in supported_frequencies:
            # The message lists exactly the frequencies accepted above.
            raise NotImplementedError(
                "No method implemented for frequency type {}, "
                'choose "MS", "W-MON" or "YS"'.format(freq))

        series = self.make_series()

        aggregate = series.resample(freq, closed=closed,
                                    label=label).agg(method)
        return TimeseriesLine(data=aggregate)

    def split_by_year(self):
        """
        The function checks which years are present in the index and splits the timeseries per year.

        Every calendar year from the first up to and including the last year of the index
        gets its own line. An empty series yields an empty list.

        Returns: a list of TimeseriesLine objects per year
        """
        series = self.make_series()
        if series.empty:
            return []

        # Selecting on the year of each index entry keeps every timestamp of the
        # year, including ones later than midnight on 31 December.
        years = series.index.year
        return [
            TimeseriesLine(series[years == year])
            for year in range(years.min(), years.max() + 1)
        ]

    def get_extreme_period_of_series(self, period, extreme="min"):
        """
        This function returns the first year, month or day present in a TimeseriesLine
        Args:
            period: str {year, month, day}: period to be returned
            extreme: str {min, max}: minimal or maximum to be returned

        Returns: int: returns the first or last day, month of year present in a TimeseriesLine

        """
        series = self.make_series()
        if extreme == "min":
            extreme_date = series.index.min()
        elif extreme == "max":
            extreme_date = series.index.max()
        else:
            raise ValueError(
                f'This extreme "{extreme}" is not configured, pick "min" / "max"'
            )

        if period == "year":
            return extreme_date.year
        elif period == "month":
            return extreme_date.month
        elif period == "day":
            return extreme_date.day
        else:
            raise ValueError(
                f'This period "{period}" is not configured, pick "year" / "month" / "day"'
            )