def to_array(self, ts, start_date=None, end_date=None, month_index=None, include_months=None,
             include_missing=True, match_other_nonmissing=None, paired_ts=None, return_type=None):
    """
    Return an array (Python list) containing the data values, or the date/times, of the time series
    for the specified period.  The requested start and end are passed through get_valid_period()
    before use, so either may be None.

    @return The list of values for the time series (an empty list if the computed data size is zero),
    or None if an irregular time series has no data list.
    @param ts Time series to convert data to array format.
    @param start_date Date corresponding to the first date of the returned array.
    @param end_date Date corresponding to the last date of the returned array.
    @param month_index Month of interest (1=Jan, 12=Dec).  If zero or None, process all months.
    It is shorthand for include_months=[month_index] and is only used when include_months is
    None or empty.
    @param include_months List of months (1=Jan, 12=Dec) to include.  If None or empty, all months
    are included.
    @param include_missing Indicate whether missing values should be included in the result
    (only checked when paired_ts is None).
    @param match_other_nonmissing Only used when paired_ts is given.  If true, a value is transferred
    when it is non-missing in both "ts" and "paired_ts".  Otherwise a value is transferred when it is
    non-missing in "ts" and missing in "paired_ts".
    @param paired_ts Optional regular-interval time series, with the same interval as "ts", that
    controls which values are transferred.
    @param return_type TSToArrayReturnType indicating what to return.  Defaults to DATA_VALUE.
    DATE_TIME returns the year, absolute month or absolute day for 1Year, 1Month and 1Day data.
    @exception ValueError if the paired time series cannot be used, if the interval is not supported
    for DATE_TIME output, or if a requested month is not in the range 1-12.
    """
    # A single month of interest is equivalent to a one-element month list.  Handle it here rather
    # than by calling this method again, which could never terminate because the recursive call
    # did not change month_index.
    if (month_index is not None) and (month_index != 0) and \
            ((include_months is None) or (len(include_months) == 0)):
        include_months = [month_index]

    if paired_ts is not None:
        if not TimeInterval.is_regular_interval(ts.get_data_interval_base()):
            # throw new IrregularTimeSeriesNotSupportedException(
            raise ValueError(
                "Irregular interval time series cannot have data array extracted using paired time series.")
        if not self.intervals_match(ts, paired_ts):
            # throw new UnequalTimeIntervalException(
            raise ValueError(
                "Time series from which to extract data has a different interval than paired time series.")

    # Get valid dates because the ones passed in may have been null...
    valid_dates = self.get_valid_period(ts, start_date, end_date)
    start = valid_dates.get_date1()
    end = valid_dates.get_date2()

    interval_base = ts.get_data_interval_base()
    interval_mult = ts.get_data_interval_mult()
    # NOTE(review): the same size calculation is used for irregular data (the irregular-specific
    # call was already commented out) - confirm calculate_data_size supports that.
    size = self.calculate_data_size(start, end, interval_base, interval_mult)

    if return_type is None:
        return_type = TSToArrayReturnType.DATA_VALUE
    if return_type == TSToArrayReturnType.DATE_TIME:
        # Only 1Year, 1Month, 1Day intervals are supported
        supported_bases = (TimeInterval.YEAR, TimeInterval.MONTH, TimeInterval.DAY)
        if (interval_mult != 1) or (interval_base not in supported_bases):
            # throw new InvalidTimeIntervalException(
            raise ValueError(
                "Interval must be Year, Month, or Day (no multiplier) to return date/time as array.")

    # Build a 12-element mask (index 0=Jan) of the months to include.
    if (include_months is None) or (len(include_months) == 0):
        include_months_mask = [True] * 12
    else:
        include_months_mask = [False] * 12
        for included_month in include_months:
            if (included_month < 1) or (included_month > 12):
                # Guard against 0/negative months silently wrapping to the end of the mask.
                raise ValueError("Month must be in the range 1-12: " + str(included_month))
            include_months_mask[included_month - 1] = True

    if size == 0:
        return []

    # The list grows as values are accepted, so it never needs to be trimmed afterwards.
    data_array = []

    def append_value(date, value):
        """Append the data value or the integer date/time for one accepted data point."""
        if return_type == TSToArrayReturnType.DATA_VALUE:
            data_array.append(value)
        elif return_type == TSToArrayReturnType.DATE_TIME:
            if interval_base == TimeInterval.YEAR:
                data_array.append(date.get_year())
            elif interval_base == TimeInterval.MONTH:
                data_array.append(date.get_absolute_month())
            elif interval_base == TimeInterval.DAY:
                # NOTE(review): relies on DateTime.get_absolute_day(); the regular-interval code
                # previously had this disabled with a TODO while still counting the value.
                data_array.append(date.get_absolute_day())

    if interval_base == TimeInterval.IRREGULAR:
        # Get the data and loop through the list...
        alltsdata = ts.get_data()
        if alltsdata is None:
            # No data for the time series...
            return None
        for tsdata in alltsdata:
            date = tsdata.get_date()
            if date.greater_than(end):
                # Past the end of where we want to go so quit...
                break
            if not date.greater_than_or_equal_to(start):
                continue
            if not include_months_mask[date.get_month() - 1]:
                continue
            value = tsdata.get_data_value()
            if include_missing or not ts.is_data_missing(value):
                append_value(date, value)
    else:
        # Regular, increment the data by interval.  The increment is done at the bottom of the loop
        # so that the loop condition is checked before each date is processed.
        date = DateTime(date_time=start)
        while date.less_than_or_equal_to(end):
            # Skip the value if it is not in a requested month.
            if include_months_mask[date.get_month() - 1]:
                value = ts.get_data_value(date)
                is_missing = ts.is_data_missing(value)
                if paired_ts is not None:
                    # Value in "ts" time series MUST be non-missing
                    do_transfer = False
                    if not is_missing:
                        is_missing2 = paired_ts.is_data_missing(paired_ts.get_data_value(date))
                        if match_other_nonmissing:
                            # Want non-missing in both "ts" and "paired_ts"
                            do_transfer = not is_missing2
                        else:
                            # Want non-missing in "ts" and missing in "paired_ts"
                            do_transfer = is_missing2
                else:
                    do_transfer = include_missing or not is_missing
                if do_transfer:
                    append_value(date, value)
            date.add_interval(interval_base, interval_mult)

    return data_array
def calculate_data_limits(self, ts, start0, end0, refresh_flag):
    """
    Calculate the total data limits for a time series between two dates and store the results on
    this object using its setters (dates, max/min values and dates, non-missing data dates,
    missing/non-missing counts, sum, mean and, when enough values exist, median, standard deviation
    and skew).
    This code was taken from the TSUtil.getDataLimits method.

    @param ts Time series of interest.
    @param start0 Starting date for the check.
    @param end0 Ending date for the check.
    @param refresh_flag Indicates whether the time series should be refreshed first (in general
    this is used only within the TS package and the version of this routine without the flag
    should be called).
    @exception Exception with the message "Error computing limits." if anything fails, including
    a None time series, an irregular time series with no data list, or a period in which every
    value is missing (the specific problem is logged as a warning before being converted).
    """
    logger = logging.getLogger(__name__)
    debug = False

    # Initial values are only placeholders; they are replaced by the first non-missing value.
    max_value = 1.0
    min_value = 0.0
    max_date = None
    min_date = None
    non_missing_data_date1 = None
    non_missing_data_date2 = None
    missing_count = 0
    non_missing_count = 0
    found = False

    try:
        # Main try...
        if ts is None:
            message = "NULL time series"
            logger.warning(message)
            # throw new TSException ( message )
            raise ValueError(message)

        # Initialize the sum to missing; it is reset by the first non-missing value.
        missing = ts.get_missing()
        total = missing

        # Get valid date limits because the ones passed in may have been null...
        valid_dates = self.get_valid_period(ts, start0, end0)
        start = valid_dates.get_date1()
        end = valid_dates.get_date2()

        # Make sure that the time series has current limits...
        base = ts.get_data_interval_base()
        mult = ts.get_data_interval_mult()
        if refresh_flag:
            # Force a refresh of the time series.
            ts.refresh()

        ts_date1 = ts.get_date1()
        ts_date2 = ts.get_date2()

        # Figure out if we are treating data <= 0 as missing...
        ignore_lezero = (self.flags & TSLimits.IGNORE_LESS_THAN_OR_EQUAL_ZERO) != 0

        is_irregular = (base == TimeInterval.IRREGULAR)
        irregular_data = None
        if is_irregular:
            irregular_data = ts.get_data()
            if irregular_data is None:
                message = "Null data for " + str(ts)
                logger.warning(message)
                # throw new TSException ( message )
                raise ValueError(message)

        def is_usable(value):
            """Return True if the value is not missing (and is > 0 when <= 0 is ignored)."""
            return (not ignore_lezero and not ts.is_data_missing(value)) or \
                (ignore_lezero and ((value > 0.0) and not ts.is_data_missing(value)))

        def forward_points():
            """Yield (date, value) in chronological order.  Regular dates are reused, so copy to keep."""
            if is_irregular:
                for point in irregular_data:
                    date = point.get_date()
                    # NOTE(review): this scan is bounded by the time series dates, not by the
                    # requested start/end used everywhere else - confirm this is intended.
                    if date.less_than(ts_date1):
                        # Still looking for data...
                        continue
                    if date.greater_than(ts_date2):
                        # No need to continue processing...
                        break
                    yield date, point.get_data_value()
            else:
                t = DateTime(date_time=start, flag=DateTime.DATE_FAST)
                while t.less_than_or_equal_to(end):
                    yield t, ts.get_data_value(t)
                    # Increment last so that no date past the end is ever processed.
                    t.add_interval(base, mult)

        def backward_points():
            """Yield (date, value) in reverse chronological order within start/end."""
            if is_irregular:
                # reversed() visits every point, including the first one in the list.
                for point in reversed(irregular_data):
                    date = point.get_date()
                    if date.greater_than(end):
                        # Have not found data...
                        continue
                    if date.less_than(start):
                        # Passed start...
                        break
                    yield date, point.get_data_value()
            else:
                t = DateTime(date_time=end, flag=DateTime.DATE_FAST)
                while t.greater_than_or_equal_to(start):
                    yield t, ts.get_data_value(t)
                    # Decrement last so that no date before the start is ever processed.
                    t.add_interval(base, -mult)

        # Loop forward to get the counts, sum, max/min and the first non-missing date...
        for date, value in forward_points():
            if ts.is_data_missing(value) or (ignore_lezero and (value <= 0.0)):
                # The value is missing
                missing_count += 1
                continue
            # Else, data value is not missing...
            if ts.is_data_missing(total):
                # Reset the sum...
                total = value
            else:
                # Add to the sum...
                total += value
            non_missing_count += 1
            if not found:
                # First non-missing point so set the initial values...
                first_date = DateTime(date_time=date)
                max_value = value
                max_date = first_date
                min_value = value
                min_date = first_date
                non_missing_data_date1 = first_date
                non_missing_data_date2 = first_date
                found = True
                continue
            # Already found the first non-missing point so only the limits need to be checked.
            # These should only result in new DateTime a few times...
            if value > max_value:
                max_value = value
                max_date = DateTime(date_time=date)
            if value < min_value:
                min_value = value
                min_date = DateTime(date_time=date)

        # Now search backwards to find the last non-missing date...
        if found:
            for date, value in backward_points():
                if is_usable(value):
                    non_missing_data_date2 = DateTime(date_time=date)
                    break

        # TODO SAM 2010-06-15 This is a performance hit, but not too bad
        # TODO SAM 2010-06-15 Consider treating other statistics similarly but need to define unit tests
        # TODO SAM 2010-06-15 This code would need to be changed if doing Lag-1 correlation because order matters
        # For newly added statistics, use helper method to get data, ignoring missing.
        # Keywords are used so that the flag reaches include_missing rather than include_months.
        stat_values = self.to_array(ts, start_date=start, end_date=end, month_index=0, include_missing=False)
        if stat_values is None:
            stat_values = []
        if ignore_lezero:
            # Drop values <= 0 (filtering keeps the remaining values in their original order).
            stat_values = [stat_value for stat_value in stat_values if stat_value > 0.0]
        n_stat_values = len(stat_values)
        if n_stat_values > 0:
            self.set_median(MathUtil.median(n_stat_values, stat_values))
        if n_stat_values > 1:
            try:
                self.set_std_dev(MathUtil.standard_deviation(n_stat_values, stat_values))
            except Exception as e:
                # Likely due to small sample size, statistic is left unset
                logger.debug("Unable to compute standard deviation: " + str(e))
        if n_stat_values > 2:
            try:
                self.set_skew(MathUtil.skew(n_stat_values, stat_values))
            except Exception as e:
                # Likely due to small sample size, statistic is left unset
                logger.debug("Unable to compute skew: " + str(e))

        if not found:
            # NOTE(review): getIdentifierString looks like an unconverted Java method name - confirm
            # it exists on the time series class.
            message = "\"" + ts.getIdentifierString() + "\": problems finding limits, whole POR missing!"
            logger.warning(message)
            # throw new TSException ( message )
            raise ValueError(message)

        if debug:
            logger.debug("Overall date limits are: " + str(start) + " to " + str(end))
            logger.debug("Found limits to be: " + str(min_value) + " on " + str(min_date) + " to " +
                         str(max_value) + " on " + str(max_date))
            logger.debug("Found non-missing data dates to be: " + str(non_missing_data_date1) + " -> " +
                         str(non_missing_data_date2))

        # Set the basic information...
        self.set_date1(start)
        self.set_date2(end)
        self.set_max_value(max_value, max_date)
        self.set_min_value(min_value, min_date)
        self.set_non_missing_data_date1(non_missing_data_date1)
        self.set_non_missing_data_date2(non_missing_data_date2)
        self.set_missing_data_count(missing_count)
        self.set_non_missing_data_count(non_missing_count)
        if not ts.is_data_missing(total) and (non_missing_count > 0):
            mean = total/float(non_missing_count)
        else:
            mean = missing
        self.set_sum(total)
        self.set_mean(mean)
    except Exception as e:
        message = "Error computing limits."
        logger.warning(message)
        # Put in debug because output sometimes is overwhelming when data are not available.
        if debug:
            logger.warning(e)
        # throw new TSException ( message )
        raise Exception(message)