def __init__(self, data, domain=None, max_value=None, project=None, *args, **kwargs):
    """
    Build a line from data and attach a domain, maximum value and project.

    Args:
        data (pd.Series): values of the line. When the stored data holds
            exactly one value and a domain is given, that value is repeated
            for every entry of the domain.
        domain (TimeIndexSeries): domain of the line, defaults to None. When
            no (truthy) domain is given, a DateDomain is built from the first
            and last index of the stored data, using the frequency of the
            index of ``data``.
        max_value (int): specifies the maximum amount of units that can be
            reached in a phase, defaults to None.
        project (str): specifies to which project the line belongs.
    """
    super().__init__(data=data, *args, **kwargs)

    stretch_single_value = len(self.data) == 1 and domain is not None
    if not stretch_single_value and not domain:
        # No usable domain supplied: derive it from the extent of the data.
        domain = DateDomain(
            self.data.index[0], self.data.index[-1], freq=data.index.freq
        )
    self.domain = domain

    if stretch_single_value:
        # Broadcast the single value over the complete domain.
        self.data = pd.Series(data=self.data.iloc[0], index=self.domain.domain)

    self.max_value = max_value
    self.project = project
def calculate_poc_ideal_rate_line(self):
    """Calculate the ideal percentage of completion (poc) rate line.

    The line starts from the poc real rate line (what has been realised so
    far) and is extended with the daily rate that is required to reach the
    maximum value by the end of the target rate line. Holidays counted in the
    remaining period are subtracted from the days available, and the holiday
    periods are added to the resulting line afterwards. The line is expressed
    in rate per day.

    Returns:
        poc ideal rate line (object), named "poc_ideal_indicator" and with
        max_value set to the total units of the phase.
    """
    realised_line = self.calculate_poc_real_rate_line()
    target_line = self.calculate_target_rate_line()

    remaining_units = realised_line.distance_to_max_value()
    deadline = target_line.domain.end
    days_remaining = realised_line.daysleft(end=deadline)

    work_left = remaining_units > 0

    if work_left & (days_remaining > 0):
        # Normal case: there is still work to do and time left before the
        # target deadline.
        start = realised_line.domain.end
        full_span = DateDomain(begin=start, end=deadline)
        n_holidays = self.count_holidays_in_date_range(
            self.holiday_periods, full_span.domain
        )
        working_span = DateDomain(
            begin=start,
            end=deadline - timedelta(n_holidays),
        )
        # NOTE(review): the divisor is zero when the number of holidays
        # equals the number of days left - confirm this cannot happen.
        daily_rate = remaining_units / (days_remaining - n_holidays)
        ideal_line = realised_line.append(
            TimeseriesLine(data=daily_rate, domain=working_span), skip=1
        )
    elif work_left & (days_remaining <= 0):
        # Exception: there is still work to do but the target deadline has
        # already passed, so production needs to finish within a week.
        daily_rate = remaining_units / 7
        catch_up_span = DateDomain(
            begin=realised_line.domain.end,
            end=pd.Timestamp.now() + timedelta(7),
        )
        ideal_line = realised_line.append(
            TimeseriesLine(data=daily_rate, domain=catch_up_span), skip=1
        )
    else:
        # No more work to do, so the ideal line equals the realised line.
        ideal_line = realised_line

    remaining_holidays = self.slice_holiday_periods(
        holiday_periods=self.holiday_periods,
        periods_to_remove=realised_line.domain.domain,
    )
    ideal_line = self.add_holiday_periods_to_line(ideal_line, remaining_holidays)
    ideal_line.name = "poc_ideal_indicator"
    ideal_line.max_value = self.phase_data["total_units"]
    return ideal_line
def slice(self, begin=None, end=None, **kwargs):
    """
    Cut a part out of the line.

    Args:
        begin: first index label to keep; the begin of the domain of this
            line is used when None.
        end: last index label to keep; the end of the domain of this line is
            used when None.
        **kwargs: passed on to the TimeseriesLine that is returned.

    Returns:
        A new TimeseriesLine with the selected data on a DateDomain running
        from begin to end.
    """
    start = self.domain.begin if begin is None else begin
    stop = self.domain.end if end is None else end
    window = self.make_series()[start:stop]
    return TimeseriesLine(window, DateDomain(start, stop), **kwargs)
def create_line(value):
    """
    Creates a TimeseriesLine from a single data point, on today's date.

    Args:
        value: value to be made into a TimeseriesLine

    Returns:
        a TimeseriesLine whose domain begins and ends today, holding one value
    """
    # `pd.datetime` was deprecated in pandas 1.0 and removed in pandas 2.0.
    # `pd.Timestamp` is a `datetime.datetime` subclass, so it is a drop-in
    # replacement. The timestamp is taken once so that begin and end of the
    # domain are guaranteed to be identical.
    today = pd.Timestamp.today()
    domain = DateDomain(today, today)
    return TimeseriesLine(domain=domain, data=value)
def extrapolate(self, data_partition=None, **kwargs):
    """
    Fit a straight line through the data points of this line.

    Args:
        data_partition: Fraction of data to use to calculate extrapolation;
            handed to ``linear_regression``.
        **kwargs: passed on to the LinearLine that is returned.

    Returns:
        LinearLine with the fitted slope and intercept, on a DateDomain
        spanning the first and last index of the data.
    """
    fitted_slope, fitted_intercept = self.linear_regression(data_partition)
    index = self.data.index
    return LinearLine(
        slope=fitted_slope,
        intercept=fitted_intercept,
        domain=DateDomain(index[0], index[-1]),
        **kwargs,
    )
def _add_holiday_period(self, line, holiday_period):
    """
    Helper function to insert a single holiday period into a TimeseriesLine.

    The line is cut at the earliest date of the holiday period. A line of
    zeros covering the holiday period is placed after the first part, and the
    second part is moved forward by the length of the holiday period before
    it is appended again.

    Args:
        line: TimeseriesLine the holiday period is inserted into.
        holiday_period: index of dates of the holiday period; it is indexed
            at position 0 and -1 and must support ``min()`` and ``len()``.

    Returns:
        TimeseriesLine made of the part before the holiday, the zero line
        and the shifted remainder.
    """
    zero_line = TimeseriesLine(
        data=0,
        domain=DateDomain(begin=holiday_period[0], end=holiday_period[-1]),
    )
    split_at = holiday_period.min()
    head = line.slice(end=split_at)
    shifted_tail = line.slice(begin=split_at).translate_x(len(holiday_period))
    # skip_base drops the last point of the head, which would otherwise
    # overlap with the first day of the holiday line.
    head_with_holiday = head.append(zero_line, skip=1, skip_base=True)
    return head_with_holiday.append(shifted_tail)
def focus_domain(self, lower_treshold=None, upper_treshold=np.inf):
    """
    Focus the domain of a line between extreme values.

    Args:
        lower_treshold: Lowest y-value to shrink the domain to. When given it
            is also used as the intercept of the returned line; when None no
            lower bound is applied and the intercept of this line is kept.
        upper_treshold: Highest y-value to shrink the domain to. Defaults to
            infinity, i.e. no upper bound.

    Returns:
        New LinearLine with the same slope, on the domain spanned by the
        earliest and latest index whose value lies within both thresholds.
    """
    # `np.Inf` was removed in NumPy 2.0; `np.inf` is the same value and is
    # available in every NumPy version.
    if lower_treshold is not None:
        intersect = lower_treshold
    else:
        lower_treshold = -np.inf
        intersect = self.intercept

    series = self.make_series()
    within_bounds = (series >= lower_treshold) & (series <= upper_treshold)
    focused_series = series[within_bounds]
    domain = DateDomain(focused_series.index.min(), focused_series.index.max())
    return LinearLine(slope=self.slope, intercept=intersect, domain=domain)
class TimeseriesLine(PointLine):
    """
    A point line is a collection of datapoints on a shared datetime index.
    """

    def __init__(self, data, domain=None, max_value=None, project=None, *args, **kwargs):
        """
        Args:
            data (pd.Series): the series is stretched along the domain when the length of the series is 1
                              but a longer domain is specified.
            domain (TimeIndexSeries): defaults to None. When no (truthy) domain is given, a DateDomain is
                                      built from the first and last index of the data.
            max_value (int): specifies the maximum amount of units that can be reached in a phase,
                             defaults to None.
            project (str): specifies to which project the line belongs
        """
        super().__init__(data=data, *args, **kwargs)
        if len(self.data) == 1 and domain is not None:
            # Broadcast the single value over the complete domain.
            self.domain = domain
            self.data = pd.Series(data=self.data.iloc[0], index=self.domain.domain)
        elif domain:
            self.domain = domain
        else:
            self.domain = DateDomain(
                self.data.index[0], self.data.index[-1], freq=data.index.freq
            )
        self.max_value = max_value
        self.project = project

    def make_series(self):
        """
        Make pandas series of line.

        Returns:
            pandas.Series of data in Line, reindexed on the domain; entries of the domain
            without data are filled with 0.
        """
        filled_data = self.data.reindex(self.domain.domain, fill_value=0)
        return filled_data

    def extrapolate(self, data_partition=None, **kwargs):
        """
        Extrapolates a Linearline from datapoints in current line

        Args:
            data_partition: Fraction of data to use to calculate extrapolation.
            **kwargs: passed on to the LinearLine that is returned.

        Returns:
            LinearLine with extrapolated data.
        """
        slope, intercept = self.linear_regression(data_partition)
        domain = DateDomain(self.data.index[0], self.data.index[-1])
        return LinearLine(slope=slope, intercept=intercept, domain=domain, **kwargs)

    def integrate(self):
        """
        Cumulative sum of the line.

        https://en.wikipedia.org/wiki/Numerical_integration

        Returns:
            TimeseriesLine holding the cumulative sum of the series of this line.
        """
        # Temporarily use old cumsum method to mimic old implementation
        integral = self.make_series().cumsum()
        return TimeseriesLine(data=integral)

    def append(self, other, skip=0, skip_base=False, **kwargs):
        """
        Args:
            other: Instance of timeseries line of which the values will be added to the end of the
                   current line.
            skip: number of points to skip, to be able to append lines that have partially overlapping
                  indices. The points are dropped from the start of the input line, or from the end of
                  the current line when skip_base is True.
            skip_base: when True the skipped points are removed from the end of the current line instead
                       of from the start of the input line.
            **kwargs: passed on to the TimeseriesLine that is returned.

        Raises:
            NotImplementedError: when the current line ends after the begin of the input line.
            ValueError: when the indices still overlap after skipping.

        Returns:
            A new timeseries line
        """
        if self.domain.end > other.domain.begin:
            raise NotImplementedError(
                "You can only add lines that have a higher index than the line in the object"
            )

        if skip_base:
            base_series = self.make_series()
            # A plain `[:-skip]` would return an empty series for skip=0,
            # silently dropping the complete base line.
            series = base_series.iloc[:-skip] if skip else base_series
            other_series = other.make_series()
        else:
            series = self.make_series()
            other_series = other.make_series().iloc[skip:]

        intersect = series.index.intersection(other_series.index)
        if len(intersect):
            raise ValueError(
                f"Cannot append Lines that have overlapping indices: {intersect}"
            )

        return TimeseriesLine(series.add(other_series, fill_value=0), **kwargs)

    def translate_x(self, delta=0, **kwargs):
        """
        Shift the line along the time axis.

        Args:
            delta: number of days to shift the line with.
            **kwargs: passed on to the TimeseriesLine that is returned.

        Returns:
            A new TimeseriesLine with index and domain shifted by delta days.
        """
        # Work on a copy: assigning a new index to self.data directly would
        # also move the data of this line, while its domain stays in place.
        data = self.data.copy()
        data.index = data.index + timedelta(days=delta)
        domain = self.domain.shift(delta)
        return TimeseriesLine(data=data, domain=domain, **kwargs)

    def slice(self, begin=None, end=None, **kwargs):
        """
        Cut a part out of the line.

        Args:
            begin: first index label to keep, defaults to the begin of the domain.
            end: last index label to keep, defaults to the end of the domain.
            **kwargs: passed on to the TimeseriesLine that is returned.

        Returns:
            A new TimeseriesLine on a DateDomain running from begin to end.
        """
        if begin is None:
            begin = self.domain.begin
        if end is None:
            end = self.domain.end
        data = self.make_series()[begin:end]
        domain = DateDomain(begin, end)
        return TimeseriesLine(data, domain, **kwargs)

    def linear_regression(self, data_partition=None):
        """
        Given a set of points, do a linear regression to extrapolate future data

        Args:
            data_partition: fraction of the domain that is skipped at the start of the data
                            before the regression is done. The complete data is used when falsy.

        Returns:
            tuple (slope, intersect); both are 0 when fewer than two points are available.
        """
        if data_partition:
            shift = int(len(self.domain) * data_partition)
            time_shift = relativedelta(days=shift)
            start = self.data.index[0] + time_shift
            end = self.data.index[-1]
            data = self.data[start:end]
            index = list(range(shift, len(data) + shift))
        else:
            index = list(range(0, len(self.data)))
            data = self.data

        if len(data) >= 2:
            slope, intersect = np.polyfit(index, data, 1)
        else:
            slope = 0
            intersect = 0
        return slope, intersect

    @staticmethod
    def _aggregate_with_offset(series, freq, loffset, closed, method):
        """Resample series to freq with method and shift the resulting index by loffset + freq."""
        aggregate = series.resample(freq, closed=closed).agg(method)
        # The `loffset` argument of `resample` was deprecated in pandas 1.1
        # and removed in pandas 2.0; adding the offset to the resulting index
        # is the documented replacement.
        aggregate.index = aggregate.index + pd.tseries.frequencies.to_offset(
            loffset + freq
        )
        return aggregate

    # this function requires a line based on speed, not distance
    def get_line_aggregate(
        self,
        freq="MS",
        aggregate_type="series",
        loffset="0",
        closed="left",
        index_as_str=False,
    ):
        """This function takes the line specified in the object and aggregates its values to a chosen
        type of output. The output can be a series or a value which is aggregated to a frequency of
        MS, W-MON or Y by the method sum or mean.

        Args:
            freq (str): type of frequency for aggregation. Defaults to 'MS'.
            aggregate_type (str): 'series' returns the cumulative sum of the normalised series per bin,
                                  'value_sum' and 'value_mean' return the sum or mean of the bin of
                                  the next period. Defaults to 'series'.
            loffset (str): the number of bins the values in the bins is shifted to the left.
                           Defaults to '0'.
            closed (str): boundary that belongs to the current bin. Defaults to 'left'.
            index_as_str (bool): determines if the index of the aggregated series is formatted to
                                 string. Only used for aggregate type 'series'. Defaults to False.

        Raises:
            NotImplementedError: this type of aggregation is not implemented.

        Returns:
            aggregate (pd.Series or int or float)
        """
        if aggregate_type == "series":
            series = self.make_normalised_series(maximum=self.max_value, percentage=True)
            aggregate = self._aggregate_with_offset(
                series, freq, loffset, closed, "sum"
            ).cumsum()
            if index_as_str:
                aggregate.index = aggregate.index.format()
        elif aggregate_type in ("value_sum", "value_mean"):
            method = "sum" if aggregate_type == "value_sum" else "mean"
            series = self.make_series()
            aggregate = self._aggregate_with_offset(series, freq, loffset, closed, method)
            period_for_output = self.period_for_output(freq)
            # The value is 0 when the next period is not part of the line.
            if period_for_output in series.index:
                aggregate = aggregate[period_for_output]
            else:
                aggregate = 0
        else:
            raise NotImplementedError(
                "No method implemented for aggregate type {}".format(aggregate_type)
            )

        return aggregate

    def period_for_output(self, freq: str):
        """This functions returns the index of next week based on W-MON or next month based on MS.

        Args:
            freq (str): frequency for which the index of the next period has to be returned.

        Raises:
            NotImplementedError: there is no method implemented for this type of frequency.

        Returns:
            index for next period (pd.Timestamp)
        """
        if freq == "MS":
            period = pd.Timestamp(
                pd.Timestamp.now().year, pd.Timestamp.now().month, 1
            ) + relativedelta(months=1)
        elif freq == "W-MON":
            period = pd.to_datetime(
                pd.Timestamp.now().date()
                + relativedelta(days=7 - pd.Timestamp.now().weekday())
            )
        else:
            raise NotImplementedError(
                "There is no output period implemented for this frequency {}".format(freq)
            )
        return period

    def distance_to_max_value(self, line_type="rate"):
        """This function calculates the distance between the end of the line and the maximum value
        specified for the line.

        Args:
            line_type (str): when the line type is rate (line is expressed in rates), first the
                             integral of the line is calculated before the distance is calculated.
                             For line type cumulative, the distance is directly calculated.
                             Defaults to 'rate'.

        Raises:
            NotImplementedError: when there is no method implemented for given line_type.
            ValueError: when max_value of the line is not set (None or 0).

        Returns:
            distance (float)
        """
        if self.max_value:
            if line_type == "rate":
                distance = self.max_value - self.integrate().get_most_recent_point()
            elif line_type == "cumulative":
                distance = self.max_value - self.get_most_recent_point()
            else:
                raise NotImplementedError(
                    "There is no method implemented for line_type {}".format(line_type)
                )
        else:
            raise ValueError(
                "max_value of the line must be set to calculate the distance to it"
            )
        return distance

    def daysleft(self, end=None, slope=None):
        """This function calculates the number of days between the date of the latest data point in
        the line and the date of the intended deadline.

        Args:
            end (str or pd.DateTimeIndex): date of the intended deadline. Defaults to None.
            slope (float or int): the daily rate to be applied between the date of the latest data
                                  point and the date of the intended deadline. Only used when no end
                                  is given. Defaults to None.

        Returns:
            daysleft (int), or None when neither end nor slope leads to a result.
        """
        if end:
            daysleft = (pd.to_datetime(end) - self.domain.end).days
        # `&` does not short-circuit: the distance is also calculated (and can
        # raise) when slope is None.
        elif (slope is not None) & (self.distance_to_max_value() is not None):
            daysleft = int(self.distance_to_max_value() / slope)
        else:
            daysleft = None
        return daysleft

    def resample(self, freq="MS", method="sum", label="left", closed="left"):
        """This function takes the line specified in the object and resamples its values.
        The output is a TimeseriesLine.

        Args:
            freq (str): type of frequency for aggregation. Options are 'MS', 'W-MON' and 'YS',
                        default is 'MS'.
            method (str): the method used to aggregate. Choose 'sum' or 'mean', default is 'sum'
            label (str): bin edge used to label the bins. Defaults to 'left'.
            closed (str): boundary that belongs to the current bin. Defaults to 'left'.

        Raises:
            NotImplementedError: when the frequency is not one of 'MS', 'W-MON' or 'YS'.

        Returns:
            TimeseriesLine of the aggregated series
        """
        if freq not in ("MS", "W-MON", "YS"):
            # The message used to list "D", which is not accepted, and left
            # out "YS", which is.
            raise NotImplementedError(
                "No method implemented for frequency type {}, "
                'choose "MS", "W-MON" or "YS"'.format(freq)
            )

        series = self.make_series()
        aggregate = series.resample(freq, closed=closed, label=label).agg(method)
        return TimeseriesLine(data=aggregate)

    def split_by_year(self):
        """
        The function checks which years are present in the index and splits the timeseries per year

        Returns:
            a list of TimeseriesLine objects per year
        """
        series = self.make_series()
        timeseries_per_year = []
        for year in range(
            self.get_extreme_period_of_series("year", "min"),
            self.get_extreme_period_of_series("year", "max") + 1,
        ):
            year_serie = series[
                (series.index >= pd.Timestamp(year=year, month=1, day=1))
                & (series.index <= pd.Timestamp(year=year, month=12, day=31))
            ]
            timeseries_per_year.append(TimeseriesLine(year_serie))
        return timeseries_per_year

    def get_extreme_period_of_series(self, period, extreme="min"):
        """
        This function returns the first or last year, month or day present in a TimeseriesLine

        Args:
            period: str {year, month, day}: period to be returned
            extreme: str {min, max}: minimal or maximum to be returned

        Raises:
            ValueError: when extreme or period is not one of the configured options.

        Returns:
            int: returns the first or last day, month or year present in a TimeseriesLine
        """
        series = self.make_series()
        if extreme == "min":
            extreme_date = series.index.min()
        elif extreme == "max":
            extreme_date = series.index.max()
        else:
            raise ValueError(
                f'This extreme "{extreme}" is not configured, pick "min" / "max"'
            )

        if period == "year":
            return extreme_date.year
        elif period == "month":
            return extreme_date.month
        elif period == "day":
            return extreme_date.day
        else:
            raise ValueError(
                f'This period "{period}" is not configured, pick "year" / "month" / "day"'
            )