def transform2delta(self):
    """Return a TransformedDataset of decade-to-decade differences.

    Each entry is this decade's value minus the previous decade's value;
    positions without data stay NaN. Starts at decade 2 of the first year
    so every computed entry has a predecessor.
    """
    delta = {year: [float('nan')] * DateFormat.datesperyear()
             for year in self.years}
    first = DateFormat(self.years[0], 2)
    for date in DateFormat.decadal_daterange(first, self.last_observation()):
        prev = date.timedelta(-1)
        delta[date.year][date.decade_of_year - 1] = (
            self.data[date.year][date.decade_of_year - 1]
            - self.data[prev.year][prev.decade_of_year - 1])
    return TransformedDataset(self.datatype, self.years, delta)
def normalized(self):
    """Return a TransformedDataset rescaled to [0, 1] via global min/max.

    Positions without data stay NaN.
    NOTE(review): if max() == min() this divides by zero — confirm the
    caller guarantees a non-constant dataset.
    """
    hi = self.max()
    lo = self.min()
    span = hi - lo
    scaled = {year: [float('nan')] * DateFormat.datesperyear()
              for year in self.years}
    last = self.last_observation().timedelta(-1)
    for date in DateFormat.decadal_daterange(DateFormat(self.years[0], 1),
                                             last):
        idx = date.decade_of_year - 1
        scaled[date.year][idx] = (self.data[date.year][idx] - lo) / span
    return TransformedDataset(self.datatype, self.years, scaled)
def cross_validate(self, daterange=None, folds=10, output=None):
    """K-fold cross-validation of the model, stratified by decade of year.

    daterange defaults to the full span covered by all datasets; output
    defaults to self.scoremethod. Returns the NaN-ignoring mean score
    over the folds.
    """
    if output is None:
        output = self.scoremethod
    if daterange is None:
        first = min(ds.years[0] for ds in self.datasets)
        last = max(ds.years[-1] for ds in self.datasets)
        daterange = DateFormat.decadal_daterange(DateFormat(first, 1),
                                                 DateFormat(last + 1, 1))
    targetset, featureset, datelist = self.cleanup_daterange(daterange)
    decades = [date.decade_of_year for date in datelist]
    scores = []
    # NOTE(review): old-style sklearn API — StratifiedKFold(labels, n_folds)
    # iterated directly; requires scikit-learn < 0.18.
    kf = StratifiedKFold(decades, folds)
    for train_index, test_index in kf:
        self.model.fit(featureset[train_index], targetset[train_index])
        scores.append(self.evaluate(datelist[test_index], output))
    return np.nanmean(scores)
def csv(self, daterange, filename='result.csv'):
    """Write one CSV row per issue date: observations, forecasts and
    decadal standard deviations of the target (and of its deltas).

    Overwrites *filename* if it exists. Returns None.
    """
    if path.isfile(filename):
        remove(filename)
    targetset = self.datasets[0]
    targetdiff = targetset.transform2delta()
    # bugfix: the file handle was never closed; the context manager
    # guarantees the writer's buffer is flushed to disk.
    # ('wb' is the correct csv mode for Python 2, which this file targets)
    with open(filename, 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['year', 'decade', 'observed', 'forecast',
                         'historic average', 'STDEV Q', 'STDEV deltaQ'])
        for date in daterange:
            # decades covered by the target lead-time window
            decades = [d.decade_of_year
                       for d in DateFormat.decadal_daterange(
                           date.timedelta(self.lead_times[0][0]),
                           date.timedelta(self.lead_times[0][1]))]
            writer.writerow([
                date.year,
                date.decade_of_year,
                self.single_targetset(date),
                self.forecast(date),
                self.forecastwithaverage(date),
                targetset.decadal_standard_deviation(decades),
                targetdiff.decadal_standard_deviation(decades),
            ])
    return None
def min(self):
    """Return the smallest observed value up to (excluding) the last
    observation.

    NaN entries are skipped implicitly (any comparison with NaN is False).
    """
    # bugfix: was a 9999 sentinel, which returns the wrong minimum for
    # datasets whose values all exceed 9999; +inf is always safe.
    minvalue = float('inf')
    for date in DateFormat.decadal_daterange(
            DateFormat(self.years[0], 1),
            self.last_observation().timedelta(-1)):
        value = self.data[date.year][date.decade_of_year - 1]
        if value < minvalue:
            minvalue = value
    return minvalue
def forecastwithaverage(self, date):
    """Naive forecaster: historical average for this time of the year,
    averaged over the target lead-time window."""
    lead = self.lead_times[0]
    window = DateFormat.decadal_daterange(date.timedelta(lead[0]),
                                          date.timedelta(lead[1]))
    decades = [d.decade_of_year for d in window]
    return self.datasets[0].decadal_average(decades)
def evaluate(self, daterange, output=None):
    """Score the forecaster over *daterange*.

    output: 'R2', 'soviet_longterm' or 'soviet_shortterm'
            (defaults to self.scoremethod).
    Returns the score, None if the shortterm method is not applicable,
    or np.nan for an unknown score method.
    """
    if output is None:
        output = self.scoremethod
    predicted = np.empty(len(daterange))
    observed = np.empty(len(daterange))
    # dropped: 'average' and 'datelist' were computed here but never used
    for i, date in enumerate(daterange):
        predicted[i] = self.forecast(date)
        observed[i] = self.single_targetset(date)
    if output == 'R2':
        mask = np.logical_or(np.isnan(predicted), np.isnan(observed))
        return r2_score(observed[~mask], predicted[~mask])
    elif output == 'soviet_longterm':
        stdev = []
        for date in daterange:
            decades = [d.decade_of_year
                       for d in DateFormat.decadal_daterange(
                           date.timedelta(self.lead_times[0][0]),
                           date.timedelta(self.lead_times[0][1]))]
            stdev.append(self.datasets[0].decadal_standard_deviation(decades))
        # bugfix: np.float was removed in numpy >= 1.24; plain float is
        # what the alias always meant
        stdev = np.array(stdev, dtype=float)
        error = np.absolute(observed - predicted)
        nonnans = ~np.isnan(error)
        return np.mean(error[nonnans] / stdev[nonnans])
    elif output == 'soviet_shortterm':
        if self.shortterm_validation():
            targetdiff = self.datasets[0].transform2delta()
            stdev = np.array(
                [targetdiff.decadal_standard_deviation([date.decade_of_year])
                 for date in daterange], dtype=float)
            error = np.absolute(observed - predicted)
            nonnans = ~np.isnan(error)
            return np.mean(error[nonnans] / stdev[nonnans])
        else:
            # bugfix: was a Python-2 print statement with a typo ('onyl')
            print('shortterm evaluator is only valid for target leadtime [0,0]')
            return None
    else:
        print('score method can be: R2,soviet_longterm,soviet_shortterm')
        return np.nan
def forecast(self, date):
    """Wrapper for the sklearn-style predict method.

    Takes a date (a plain datetime.date is converted to DateFormat) and
    returns the predicted value; if the feature vector is incomplete
    (contains NaN), returns NaN instead.
    """
    if isinstance(date, datetime.date):
        date = DateFormat.datetime2date(date)
    feature = self.single_featureset(date)
    # guard clause: incomplete featureset -> no forecast
    if any(np.isnan(feature)):
        return np.nan
    row = feature.reshape(1, feature.shape[0])
    return self.model.predict(row)[0]
def train_model(self, training_daterange=None):
    """Fit self.model on *training_daterange* (defaults to the full span
    covered by all datasets). Returns None."""
    if training_daterange is None:
        first = min(ds.years[0] for ds in self.datasets)
        last = max(ds.years[-1] for ds in self.datasets)
        training_daterange = DateFormat.decadal_daterange(
            DateFormat(first, 1), DateFormat(last + 1, 1))
    targetset, featureset, _ = self.cleanup_daterange(training_daterange)
    self.model.fit(featureset, targetset)
    return None
def single_targetset(self, date):
    """Return the target value for *date* from the first dataset.

    The value is the mean over the target lead-time window
    (self.lead_times[0], inclusive on both ends); a plain datetime.date
    is converted to DateFormat first. Returns NaN if any value in the
    window is missing.
    """
    if isinstance(date, datetime.date):
        date = DateFormat.datetime2date(date)
    start, stop = self.lead_times[0]
    targets = [self.datasets[0].get_feature(date.timedelta(dT))[0]
               for dT in range(start, stop + 1)]
    if any(np.isnan(targets)):
        return np.nan
    return np.mean(targets)
def decadal_daterange(start_date, end_date):
    """Return the inclusive list of decadal dates from start_date up to
    end_date, stepping one decade at a time."""
    steps = int(DateFormat.decadal_difference(end_date, start_date) + 1)
    return [start_date.timedelta(n) for n in range(steps)]
def plot(self, daterange, output=None, filename='evaluate.png'):
    """Plot an evaluation of the forecaster over *daterange* and save it
    to *filename*.

    output: 'timeseries' | 'correlation' | 'soviet_longterm' |
            'soviet_shortterm' | 'importance'
            (defaults to self.scoremethod).
    Returns None.
    """
    if output is None:
        output = self.scoremethod
    predicted = np.empty(len(daterange))
    average = np.empty(len(daterange))
    observed = np.empty(len(daterange))
    datelist = np.full(len(daterange), None, dtype=DateFormat)
    for i, date in enumerate(daterange):
        predicted[i] = self.forecast(date)
        average[i] = self.forecastwithaverage(date)
        observed[i] = self.single_targetset(date)
        datelist[i] = date.firstdate()
    if output == 'timeseries':
        plt.figure(figsize=(15, 5))
        plt.plot(datelist, predicted, label='predicted', color='red')
        plt.plot(datelist, average, label='historical average',
                 linestyle='--', linewidth=0.5, color='green')
        plt.plot(datelist, observed, label='observed', color='blue')
        plt.ylabel(str(self.datasets[0].datatype))
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.legend(loc='upper left')
        plt.draw()
        plt.savefig(filename)
        return None
    elif output == 'correlation':
        plt.figure(figsize=(5, 5))
        plt.plot(observed, predicted, linestyle='None', marker='.')
        # bugfix: inner np.max/np.min returned NaN (and broke the axis
        # limits) whenever a forecast or observation was missing; use
        # NaN-aware reductions throughout
        maxval = int(np.nanmax([np.nanmax(observed), np.nanmax(predicted)]))
        minval = int(np.nanmin([np.nanmin(observed), np.nanmin(predicted)]))
        plt.xlim((minval * 0.9, maxval * 1.1))
        plt.ylim((minval * 0.9, maxval * 1.1))
        # 1:1 reference line for a perfect forecast
        plt.plot(plt.gca().get_xlim(), plt.gca().get_ylim(), ls="--", c=".3")
        plt.xlabel('observed ' + str(self.datasets[0].datatype))
        plt.ylabel('forecasted ' + str(self.datasets[0].datatype))
        plt.draw()
        plt.savefig(filename)
        return None
    elif output == 'soviet_longterm':
        # Does not account for unsorted dateranges!
        years = range(daterange[0].year, daterange[-1].year + 1)
        dates = range(1, daterange[0].datesperyear() + 1)
        nbofdates = dates[-1]
        error = np.full([len(years), nbofdates], np.nan)
        # NOTE(review): stdev is never filled in, so the plot call below
        # draws nothing visible — looks unfinished; kept as-is
        stdev = np.full(nbofdates, np.nan)
        axis = np.full(nbofdates, np.nan)
        for date in daterange:
            relativeyear = date.year - daterange[0].year
            decades = [d.decade_of_year
                       for d in DateFormat.decadal_daterange(
                           date.timedelta(self.lead_times[0][0]),
                           date.timedelta(self.lead_times[0][1]))]
            # bugfix: was bare 'datasets[0]' (NameError at runtime);
            # the datasets live on the instance
            error[relativeyear, date.decade_of_year - 1] = np.absolute(
                self.forecast(date) - self.single_targetset(date)) / (
                    self.datasets[0].decadal_standard_deviation(decades))
            axis[date.decade_of_year - 1] = date.decade_of_year
        # drop masked/invalid entries, one error list per decade-of-year
        error2 = np.ma.masked_invalid(error)
        error = [[y for y in row if y] for row in error2.T]
        plt.figure(figsize=(15, 5))
        plt.boxplot(error, positions=dates, showmeans=True)
        plt.plot(axis, stdev, label='standard deviation')
        plt.axhline(0.8, linestyle='--', color='b',
                    label='80% of standard deviation')
        plt.axhline(0.6, linestyle='--', color='g',
                    label='60% of standard deviation')
        axes = plt.gca()
        plt.xlabel('issue date (decade of year)')
        plt.ylabel('error/STDEV')
        axes.set_ylim([0, 1.5])
        plt.legend()
        plt.draw()
        plt.savefig(filename)
        return None
    elif output == 'soviet_shortterm':
        if self.shortterm_validation():
            # bugfix: was bare 'datasets[0]' (NameError at runtime)
            targetdiff = self.datasets[0].transform2delta()
            years = range(daterange[0].year, daterange[-1].year + 1)
            dates = range(1, daterange[0].datesperyear() + 1)
            nbofdates = dates[-1]
            error = np.full([len(years), nbofdates], np.nan)
            # NOTE(review): stdev is never filled in here either
            stdev = np.full(nbofdates, np.nan)
            axis = np.full(nbofdates, np.nan)
            for i, date in enumerate(daterange):
                relativeyear = date.year - daterange[0].year
                error[relativeyear, date.decade_of_year - 1] = np.absolute(
                    predicted[i] - observed[i]
                ) / targetdiff.decadal_standard_deviation(
                    [date.decade_of_year])
            error2 = np.ma.masked_invalid(error)
            error = [[y for y in row if y] for row in error2.T]
            plt.figure(figsize=(15, 5))
            plt.plot(axis, stdev, label='standard deviation')
            plt.axhline(0.674, linestyle='--', color='g',
                        label='67.4% of standard deviation')
            plt.boxplot(error, positions=dates, showmeans=True)
            axes = plt.gca()
            axes.set_ylim([0, 1.5])
            plt.ylabel('error/STDEV')
            plt.xlabel('issue date (decade of year)')
            plt.legend()
            plt.draw()
            plt.savefig(filename)
            return None
        else:
            # bugfix: was a Python-2 print statement with a typo ('onyl')
            print('shortterm evaluator is only valid for target leadtime [0,0]')
            return None
    elif output == 'importance':
        vec = self.model.feature_importances_
        importance = []
        data = []
        tailtimes = []
        j = 0
        # slice the flat importance vector into one chunk per feature
        # dataset (datasets[0] is the target and has no importances)
        for i, dataset in enumerate(self.datasets[1:]):
            length = self.lead_time2length([self.lead_times[i + 1]])
            vecpart = vec[j:j + length]
            tailtimes.append(range(self.lead_times[i + 1][0],
                                   self.lead_times[i + 1][1] + 1))
            data.append(dataset.datatype)
            importance.append(vecpart)
            j += length
        plt.figure(figsize=(10, 5))
        for i, vec in enumerate(importance):
            plt.plot(tailtimes[i], vec, label=data[i])
        plt.legend(loc='upper left')
        plt.yscale('log')
        plt.xlabel('tailtime')
        plt.ylabel('importance [-]')
        plt.draw()
        plt.savefig(filename)
        return None
leadtimes = [[1, 3], [-4, -1], [-4, -1], [-4, -1], [1, 1]] # Select Model model_type = Earth(max_degree=10, smooth=True) #model_type= Lasso(alpha=0.05,normalize=True, max_iter=3000) #model_type = Regressor( # layers=[ # Layer("Sigmoid",units=5), # Layer("Linear", units=1)], # learning_rate=0.1, # n_iter=1000) # Set training interval startyear = DateFormat(1900, 1) endyear = DateFormat(2005, 36) training_daterange = DateFormat.decadal_daterange(startyear, endyear) # Set testing interval startyear = DateFormat(2006, 1) endyear = DateFormat(2015, 36) testing_daterange = DateFormat.decadal_daterange(startyear, endyear) newtesting_daterange = [] for date in testing_daterange: if date.decade_of_year > 0: # Selecting last decade of each month as issue date newtesting_daterange.append(date) startyear = DateFormat(2006, 1) endyear = DateFormat(2010, 36) plotdaterange = DateFormat.decadal_daterange(startyear, endyear) # Creates forecasting model with selected parameters