    def transform2delta(self):
        """ Return a TransformedDataset holding decade-to-decade differences. """
        datanew = dict()
        for year in self.years:
            datanew.update({year: [float('nan')] * DateFormat.datesperyear()})

        # Start at the second decade: the first one has no predecessor to difference against.
        for date in DateFormat.decadal_daterange(DateFormat(self.years[0], 2),
                                                 self.last_observation()):
            prev = date.timedelta(-1)
            datanew[date.year][date.decade_of_year - 1] = \
                self.data[date.year][date.decade_of_year - 1] - \
                self.data[prev.year][prev.decade_of_year - 1]
        return TransformedDataset(self.datatype, self.years, datanew)
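
    # Usage sketch (hypothetical names: `discharge` stands for an instance of this
    # dataset class with decadal data loaded):
    #
    #   deltas = discharge.transform2delta()
    #   # deltas.data[2005][11] == discharge.data[2005][11] - discharge.data[2005][10]
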
    def normalized(self):
        """ Return a TransformedDataset rescaled to [0, 1] via min-max normalization. """
        maxvalue = self.max()
        minvalue = self.min()
        datanew = dict()
        for year in self.years:
            datanew.update({year: [float('nan')] * DateFormat.datesperyear()})

        for date in DateFormat.decadal_daterange(
                DateFormat(self.years[0], 1),
                self.last_observation().timedelta(-1)):
            datanew[date.year][date.decade_of_year - 1] = (
                (self.data[date.year][date.decade_of_year - 1] - minvalue) /
                (maxvalue - minvalue))
        return TransformedDataset(self.datatype, self.years, datanew)
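
    # Min-max scaling maps every value x to (x - min) / (max - min), so the smallest
    # observation becomes 0 and the largest becomes 1. Sketch (hypothetical `precip`
    # dataset instance):
    #
    #   scaled = precip.normalized()
    #   # all non-NaN entries of scaled.data lie in [0, 1]
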
    def cross_validate(self, daterange=None, folds=10, output=None):
        """ Score the model with stratified k-fold cross validation.

        Folds are stratified by decade of year so that every fold covers the
        full seasonal cycle. Returns the mean score over all folds.
        """
        if output is None:
            output = self.scoremethod

        if daterange is None:
            # Default to the full period covered by all datasets.
            miny = min(dataset.years[0] for dataset in self.datasets)
            maxy = max(dataset.years[-1] for dataset in self.datasets)
            startyear = DateFormat(miny, 1)
            endyear = DateFormat(maxy + 1, 1)
            daterange = DateFormat.decadal_daterange(startyear, endyear)

        targetset, featureset, datelist = self.cleanup_daterange(daterange)
        decades = [date.decade_of_year for date in datelist]

        kf = StratifiedKFold(decades, folds)
        score = []
        for train_index, test_index in kf:
            self.model.fit(featureset[train_index], targetset[train_index])
            score.append(self.evaluate(datelist[test_index], output))
        return np.nanmean(score)
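
    # Usage sketch (hypothetical `fc` forecaster instance; fold count and score
    # method are illustrative):
    #
    #   r2 = fc.cross_validate(folds=5, output='R2')
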
    def csv(self, daterange, filename='result.csv'):
        """ Write observed values, forecasts and reference statistics to a CSV file. """
        if path.isfile(filename):
            remove(filename)

        with open(filename, 'wb') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            label = [
                'year', 'decade', 'observed', 'forecast', 'historic average',
                'STDEV Q', 'STDEV deltaQ'
            ]
            writer.writerow(label)

            targetset = self.datasets[0]
            targetdiff = targetset.transform2delta()
            for date in daterange:
                decades = []
                for date2 in DateFormat.decadal_daterange(
                        date.timedelta(self.lead_times[0][0]),
                        date.timedelta(self.lead_times[0][1])):
                    decades.append(date2.decade_of_year)
                data = [
                    date.year, date.decade_of_year,
                    self.single_targetset(date),
                    self.forecast(date),
                    self.forecastwithaverage(date),
                    targetset.decadal_standard_deviation(decades),
                    targetdiff.decadal_standard_deviation(decades)
                ]
                writer.writerow(data)
        return None
    def min(self):
        """ Return the minimum value of the dataset. """
        minvalue = float('inf')
        for date in DateFormat.decadal_daterange(
                DateFormat(self.years[0], 1),
                self.last_observation().timedelta(-1)):
            if self.data[date.year][date.decade_of_year - 1] < minvalue:
                minvalue = self.data[date.year][date.decade_of_year - 1]
        return minvalue
    def forecastwithaverage(self, date):
        """ Naive forecaster: the forecast is the average value for this time of the year. """
        decades = []
        for date2 in DateFormat.decadal_daterange(
                date.timedelta(self.lead_times[0][0]),
                date.timedelta(self.lead_times[0][1])):
            decades.append(date2.decade_of_year)
        return self.datasets[0].decadal_average(decades)
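
    # Usage sketch (hypothetical `fc` forecaster instance): the climatological
    # reference forecast for an issue date, averaged over the target lead times:
    #
    #   fc.forecastwithaverage(DateFormat(2012, 9))
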
    def evaluate(self, daterange, output=None):
        """ Score the model's forecasts over a daterange with the selected score method. """
        if output is None:
            output = self.scoremethod

        predicted = np.empty(len(daterange))
        average = np.empty(len(daterange))
        observed = np.empty(len(daterange))
        datelist = np.full(len(daterange), None, dtype=object)
        for i, date in enumerate(daterange):
            predicted[i] = self.forecast(date)
            average[i] = self.forecastwithaverage(date)
            observed[i] = self.single_targetset(date)
            datelist[i] = date.firstdate()

        if output == 'R2':
            mask = np.logical_or(np.isnan(predicted), np.isnan(observed))
            return r2_score(observed[~mask], predicted[~mask])

        elif output == 'soviet_longterm':
            # Mean ratio of absolute forecast error to the climatological standard
            # deviation of the target period (lower is better).
            stdev = []
            for date in daterange:
                decades = []
                for date2 in DateFormat.decadal_daterange(
                        date.timedelta(self.lead_times[0][0]),
                        date.timedelta(self.lead_times[0][1])):
                    decades.append(date2.decade_of_year)
                stdev.append(
                    self.datasets[0].decadal_standard_deviation(decades))

            stdev = np.array(stdev, dtype=np.float)
            error = np.absolute(
                np.array(observed, dtype=np.float) -
                np.array(predicted, dtype=np.float))
            nonnans = ~np.isnan(error)
            return np.mean(error[nonnans] / stdev[nonnans])

        elif output == 'soviet_shortterm':
            # Same error/STDEV ratio, but relative to the standard deviation of
            # decade-to-decade changes; only valid for target lead time [0, 0].
            if self.shortterm_validation():
                stdev = np.empty(len(daterange))
                targetdiff = self.datasets[0].transform2delta()
                for i, date in enumerate(daterange):
                    stdev[i] = targetdiff.decadal_standard_deviation(
                        [date.decade_of_year])

                stdev = np.array(stdev, dtype=np.float)
                error = np.absolute(
                    np.array(observed, dtype=np.float) -
                    np.array(predicted, dtype=np.float))
                nonnans = ~np.isnan(error)
                return np.mean(error[nonnans] / stdev[nonnans])
            else:
                print 'shortterm evaluator is only valid for target leadtime [0,0]'
                return None

        else:
            print 'score method can be: R2, soviet_longterm, soviet_shortterm'
            return np.nan
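
    # Both 'soviet' scores report mean(|forecast - observed| / STDEV). For example,
    # an absolute error of 12 against a standard deviation of 20 contributes 0.6 to
    # the mean; plot() below marks the conventional 0.8/0.6 (long-term) and 0.674
    # (short-term) reference lines.
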
    def forecast(self, date):
        """ Wrapper for the scikit-learn predict method. Takes a date as argument.

        Returns the predicted value. If the featureset is incomplete, returns NaN.
        """
        if isinstance(date, datetime.date):
            date = DateFormat.datetime2date(date)

        feature = self.single_featureset(date)
        if not any(np.isnan(feature)):
            feature2 = feature.reshape(1, feature.shape[0])
            return self.model.predict(feature2)[0]
        else:
            return np.nan
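
    # Usage sketch (hypothetical `fc` forecaster instance; the model must be fitted
    # first, e.g. via train_model()):
    #
    #   fc.forecast(DateFormat(2012, 15))   # -> predicted value, or NaN if any
    #                                       #    feature for that date is missing
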
    def train_model(self, training_daterange=None):
        """ Fit the model on all complete feature/target pairs in the training daterange. """
        if training_daterange is None:
            # Default to the full period covered by all datasets.
            miny = min(dataset.years[0] for dataset in self.datasets)
            maxy = max(dataset.years[-1] for dataset in self.datasets)
            startyear = DateFormat(miny, 1)
            endyear = DateFormat(maxy + 1, 1)
            training_daterange = DateFormat.decadal_daterange(
                startyear, endyear)

        targetset, featureset, datelist = self.cleanup_daterange(
            training_daterange)
        self.model.fit(featureset, targetset)
        return None
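
    # Usage sketch mirroring the configuration script below (years are illustrative):
    #
    #   fc.train_model(DateFormat.decadal_daterange(DateFormat(1990, 1),
    #                                               DateFormat(2005, 36)))
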
    def single_targetset(self, date):
        """ Return the target value for the specified date from the first entry in datasets.

        Returns the average value over the lead times specified for the targetset -> lead_times[0].
        If any value is missing, returns NaN.
        """
        if isinstance(date, datetime.date):
            date = DateFormat.datetime2date(date)

        dataset = self.datasets[0]
        forecasting_leadtime = self.lead_times[0]
        targets = []
        for dT in range(forecasting_leadtime[0], forecasting_leadtime[1] + 1):
            targets.append(dataset.get_feature(date.timedelta(dT))[0])
        if any(np.isnan(targets)):
            return np.nan
        else:
            return np.mean(targets)
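
    # With lead_times[0] = [1, 3] (the target lead time used in the script below),
    # the target for an issue date is the mean of the observations 1, 2 and 3
    # decades ahead:
    #
    #   fc.single_targetset(DateFormat(2012, 9))   # mean over decades 10-12 of 2012
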
    @staticmethod
    def decadal_daterange(start_date, end_date):
        """ Return the list of decadal dates from start_date to end_date (inclusive). """
        dates = []
        for n in range(
                int(DateFormat.decadal_difference(end_date, start_date) + 1)):
            dates.append(start_date.timedelta(n))
        return dates
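
    # Usage sketch: with 36 decades per year (as in the configuration script below),
    #   DateFormat.decadal_daterange(DateFormat(2014, 35), DateFormat(2015, 2))
    # returns the four consecutive decades 2014/35, 2014/36, 2015/1 and 2015/2.
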
    def plot(self, daterange, output=None, filename='evaluate.png'):
        """ Plot forecast diagnostics for a daterange and save the figure to filename. """
        if output is None:
            output = self.scoremethod

        predicted = np.empty(len(daterange))
        average = np.empty(len(daterange))
        observed = np.empty(len(daterange))
        datelist = np.full(len(daterange), None, dtype=object)
        for i, date in enumerate(daterange):
            predicted[i] = self.forecast(date)
            average[i] = self.forecastwithaverage(date)
            observed[i] = self.single_targetset(date)
            datelist[i] = date.firstdate()

        if output == 'timeseries':
            plt.figure(figsize=(15, 5))
            # plt.figure(figsize=(7,5))

            plt.plot(datelist, predicted, label='predicted', color='red')
            plt.plot(datelist,
                     average,
                     label='historical average',
                     linestyle='--',
                     linewidth=0.5,
                     color='green')
            plt.plot(datelist, observed, label='observed', color='blue')
            plt.ylabel(str(self.datasets[0].datatype))
            plt.xticks(rotation=90)
            plt.tight_layout()
            # plt.gca().xaxis.set_major_locator(plt.NullLocator())
            plt.legend(loc='upper left')
            plt.draw()
            plt.savefig(filename)
            return None

        elif output == 'correlation':
            plt.figure(figsize=(5, 5))
            plt.plot(observed, predicted, linestyle='None', marker='.')
            maxval = int(np.nanmax([np.nanmax(observed), np.nanmax(predicted)]))
            minval = int(np.nanmin([np.nanmin(observed), np.nanmin(predicted)]))
            plt.xlim((minval * 0.9, maxval * 1.1))
            plt.ylim((minval * 0.9, maxval * 1.1))
            plt.plot(plt.gca().get_xlim(),
                     plt.gca().get_ylim(),
                     ls="--",
                     c=".3")
            plt.xlabel('observed ' + str(self.datasets[0].datatype))
            plt.ylabel('forecasted ' + str(self.datasets[0].datatype))
            plt.draw()
            plt.savefig(filename)
            return None

        elif output == 'soviet_longterm':

            # Does not account for unsorted dateranges!
            years = range(daterange[0].year, daterange[-1].year + 1)

            dates = range(1, daterange[0].datesperyear() + 1)
            nbofdates = dates[-1]
            error = np.full([len(years), nbofdates], np.nan)
            stdev = np.full(nbofdates, np.nan)
            axis = np.full(nbofdates, np.nan)

            for date in daterange:
                relativeyear = date.year - daterange[0].year
                decades = []
                for date2 in DateFormat.decadal_daterange(
                        date.timedelta(self.lead_times[0][0]),
                        date.timedelta(self.lead_times[0][1])):
                    decades.append(date2.decade_of_year)
                error[relativeyear, date.decade_of_year - 1] = np.absolute(
                    self.forecast(date) - self.single_targetset(date)) / (
                        self.datasets[0].decadal_standard_deviation(decades))
                axis[date.decade_of_year - 1] = date.decade_of_year

            error2 = np.ma.masked_invalid(error)
            # One list of valid error ratios per decade of year (masked entries dropped).
            error = [row.compressed() for row in error2.T]

            plt.figure(figsize=(15, 5))
            plt.boxplot(error, positions=dates, showmeans=True)
            plt.plot(axis, stdev, label='standard deviation')
            plt.axhline(0.8,
                        linestyle='--',
                        color='b',
                        label='80% of standard deviation')
            plt.axhline(0.6,
                        linestyle='--',
                        color='g',
                        label='60% of standard deviation')
            axes = plt.gca()
            plt.xlabel('issue date (decade of year)')
            plt.ylabel('error/STDEV')
            axes.set_ylim([0, 1.5])
            plt.legend()
            plt.draw()
            plt.savefig(filename)
            return None

        elif output == 'soviet_shortterm':

            if self.shortterm_validation():
                targetdiff = self.datasets[0].transform2delta()
                years = range(daterange[0].year, daterange[-1].year + 1)
                dates = range(1, daterange[0].datesperyear() + 1)
                nbofdates = dates[-1]
                error = np.full([len(years), nbofdates], np.nan)
                stdev = np.full(nbofdates, np.nan)
                axis = np.full(nbofdates, np.nan)

                for i, date in enumerate(daterange):
                    relativeyear = date.year - daterange[0].year
                    error[relativeyear, date.decade_of_year - 1] = np.absolute(
                        predicted[i] -
                        observed[i]) / targetdiff.decadal_standard_deviation(
                            [date.decade_of_year])

                error2 = np.ma.masked_invalid(error)
                error = [row.compressed() for row in error2.T]

                plt.figure(figsize=(15, 5))
                plt.plot(axis, stdev, label='standard deviation')
                plt.axhline(0.674,
                            linestyle='--',
                            color='g',
                            label='67.4% of standard deviation')
                plt.boxplot(error, positions=dates, showmeans=True)
                axes = plt.gca()
                axes.set_ylim([0, 1.5])
                plt.ylabel('error/STDEV')
                plt.xlabel('issue date (decade of year)')
                plt.legend()
                plt.draw()
                plt.savefig(filename)
                return None
            else:
                print 'shortterm evaluator is only valid for target leadtime [0,0]'
                return None

        elif output == 'importance':
            vec = self.model.feature_importances_
            importance = []
            data = []
            tailtimes = []

            j = 0
            for i, dataset in enumerate(self.datasets[1:]):
                length = self.lead_time2length([self.lead_times[i + 1]])
                vecpart = vec[j:j + length]
                tailtimes.append(
                    range(self.lead_times[i + 1][0],
                          self.lead_times[i + 1][1] + 1))
                data.append(dataset.datatype)
                importance.append(vecpart)
                j += length

            plt.figure(figsize=(10, 5))
            for i, vec in enumerate(importance):
                plt.plot(tailtimes[i], vec, label=data[i])
            plt.legend(loc='upper left')
            plt.yscale('log')
            plt.xlabel('tailtime')
            plt.ylabel('importance [-]')
            plt.draw()
            plt.savefig(filename)
            return None
    leadtimes = [[1, 3], [-4, -1], [-4, -1], [-4, -1], [1, 1]]

    # Select Model
    model_type = Earth(max_degree=10, smooth=True)
    #model_type= Lasso(alpha=0.05,normalize=True, max_iter=3000)
    #model_type = Regressor(
    #    layers=[
    #        Layer("Sigmoid",units=5),
    #        Layer("Linear", units=1)],
    #    learning_rate=0.1,
    #    n_iter=1000)

    # Set training interval
    startyear = DateFormat(1900, 1)
    endyear = DateFormat(2005, 36)
    training_daterange = DateFormat.decadal_daterange(startyear, endyear)

    # Set testing interval
    startyear = DateFormat(2006, 1)
    endyear = DateFormat(2015, 36)
    testing_daterange = DateFormat.decadal_daterange(startyear, endyear)
    newtesting_daterange = []
    for date in testing_daterange:
        if date.decade_of_year % 3 == 0:  # Select the last decade of each month as issue date
            newtesting_daterange.append(date)

    startyear = DateFormat(2006, 1)
    endyear = DateFormat(2010, 36)
    plotdaterange = DateFormat.decadal_daterange(startyear, endyear)

    # Creates forecasting model with selected parameters