def testSingleTS():
    print("------------------- Test # 1 (Single TS). ------------------------")
    p = 0.7
    N = 50
    M = 400
    timeSteps = N * M

    # train/test split
    trainProp = 0.9
    M1 = int(trainProp * M)
    M2 = M - M1

    trainPoints = N * M1
    testPoints = N * M2

    print("Generating data...")
    harmonicsTS = harmonicDataTest(timeSteps)
    trendTS = trendDataTest(timeSteps)
    (armaTS, armaMeanTS) = armaDataTest(timeSteps)

    meanTS = harmonicsTS + trendTS + armaMeanTS
    combinedTS = harmonicsTS + trendTS + armaTS

    # normalize the values to all lie within [-1, 1] -- helps with RMSE comparisons
    # can use the tsUtils.unnormalize() function to convert everything back to the original range at the end, if needed
    max1 = np.nanmax(combinedTS)
    min1 = np.nanmin(combinedTS)
    max2 = np.nanmax(meanTS)
    min2 = np.nanmin(meanTS)
    maxVal = np.max([max1, max2])
    minVal = np.min([min1, min2])

    combinedTS = tsUtils.normalize(combinedTS, maxVal, minVal)
    meanTS = tsUtils.normalize(meanTS, maxVal, minVal)

    # produce timestamps
    timestamps = np.arange('2017-09-10 20:30:00',
                           timeSteps,
                           dtype='datetime64[1m]')  # arbitrary start date

    # split the data
    trainDataMaster = combinedTS[0:trainPoints]  # the true realized values, needed for comparisons later
    meanTrainData = meanTS[0:trainPoints]  # only needed for various statistical comparisons later

    # randomly hide a fraction of the training data (entry-wise); consecutive
    # entries are additionally hidden below
    (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster), p)

    # now further hide consecutive entries for a very small fraction of entries in the eventual training matrix
    (trainData, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData), 0.9, int(M1 * 0.25), M1)

    # interpolating Nans with linear interpolation
    # trainData = tsUtils.nanInterpolateHelper(trainData)

    # test data and hidden truth
    testData = combinedTS[-1 * testPoints:]
    meanTestData = meanTS[-1 * testPoints:]  # only needed for various statistical comparisons

    # time stamps
    trainTimestamps = timestamps[0:trainPoints]
    testTimestamps = timestamps[-1 * testPoints:]

    # once the NaNs have been handled (interpolation is left commented out above),
    # pObservation should be set back to 1.0
    pObservation = 1.0

    # create pandas df
    key1 = 't1'
    trainMasterDF = pd.DataFrame(index=trainTimestamps,
                                 data={key1: trainDataMaster})  # needed for reference later
    trainDF = pd.DataFrame(index=trainTimestamps, data={key1: trainData})
    meanTrainDF = pd.DataFrame(index=trainTimestamps,
                               data={key1: meanTrainData})

    testDF = pd.DataFrame(index=testTimestamps, data={key1: testData})
    meanTestDF = pd.DataFrame(index=testTimestamps, data={key1: meanTestData})

    # train the model
    print("Training the model (imputing)...")
    print('SVD')
    nbrSingValuesToKeep = 5
    mod = SVDModel(key1,
                   nbrSingValuesToKeep,
                   N,
                   M1,
                   probObservation=pObservation,
                   svdMethod='numpy',
                   otherSeriesKeysArray=[],
                   includePastDataOnly=True)
    mod.fit(trainDF)
    imputedDf = mod.denoisedDF()

    print(" RMSE (training imputation vs mean) = %f" %
          tsUtils.rmse(meanTrainDF[key1].values, imputedDf[key1].values))
    print(" RMSE (training imputation vs obs)  = %f" %
          tsUtils.rmse(trainMasterDF[key1].values, imputedDf[key1].values))
    # To run the ALS algorithm instead, comment out the SVDModel block above
    # and uncomment the block below:
    # print('ALS')
    # mod = ALSModel(key1,
    #                nbrSingValuesToKeep,
    #                N,
    #                M1,
    #                probObservation=pObservation,
    #                otherSeriesKeysArray=[],
    #                includePastDataOnly=True)
    # mod.fit(trainDF)
    # imputedDf = mod.denoisedDF()  # imputed + denoised data

    print("Forecasting (#points = %d)..." % len(testDF))
    # test data is used for point-predictions
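    # Sliding-window forecasting: for each test index i we assemble the N-1
    # most recent observations as the feature vector. For the first N-1
    # predictions part of that window still lies in the training data, so we
    # copy the tail of trainMasterDF; afterwards the window comes entirely
    # from the test data.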
    forecastArray = []
    for i in range(0, len(testDF)):
        pastPoints = np.zeros(N - 1)  # need an N-1 length vector of past points
        j = 0
        if i < N - 1:  # the first prediction uses the end of the training data
            while j < N - 1 - i:
                pastPoints[j] = trainMasterDF[key1].values[len(trainDF) - (N - 1 - i) + j]
                j += 1

        if j < N - 1:  # use the new test data
            pastPoints[j:] = testDF[key1].values[i - (N - 1) + j:i]

        keyToSeriesDFNew = pd.DataFrame(data={key1: pastPoints})
        prediction = mod.predict(pd.DataFrame(data={}),
                                 keyToSeriesDFNew,
                                 bypassChecks=False)
        forecastArray.append(prediction)

    print(" RMSE (prediction vs mean) = %f" %
          tsUtils.rmse(meanTestDF[key1].values, forecastArray))
    print(" RMSE (prediction vs obs)  = %f" %
          tsUtils.rmse(testDF[key1].values, forecastArray))

    print("Plotting...")
    plt.plot(np.concatenate((trainMasterDF[key1].values, testDF[key1].values),
                            axis=0),
             color='gray',
             label='Observed')
    plt.plot(np.concatenate(
        (meanTrainDF[key1].values, meanTestDF[key1].values), axis=0),
             color='red',
             label='True Means')
    plt.plot(np.concatenate((imputedDf[key1].values, forecastArray), axis=0),
             color='blue',
             label='Forecasts')
    plt.axvline(x=len(trainDF),
                linewidth=1,
                color='black',
                label='Training End')
    legend = plt.legend(loc='upper left', shadow=True)
    plt.title('Single Time Series (ARMA + Periodic + Trend) - $p = %.2f$' % p)
    plt.show()
def testMultipleTS():

    print(
        "------------------- Test # 2 (Multiple TS). ------------------------")
    p = 1.0
    N = 50
    M = 400
    timeSteps = N * M

    # train/test split
    trainProp = 0.7
    M1 = int(trainProp * M)
    M2 = M - M1

    trainPoints = N * M1
    testPoints = N * M2

    key1 = 't1'
    key2 = 't2'
    key3 = 't3'
    otherkeys = [key2, key3]

    includePastDataOnly = True

    print("Generating data...")
    harmonicsTS = harmonicDataTest(timeSteps)
    trendTS = trendDataTest(timeSteps)
    (armaTS, armaMeanTS) = armaDataTest(timeSteps)

    meanTS = harmonicsTS + trendTS + armaMeanTS
    combinedTS = harmonicsTS + trendTS + armaTS

    combinedTS2 = (0.3 * combinedTS) + np.random.normal(
        0.0, 0.5, len(combinedTS))
    combinedTS3 = (-0.4 * combinedTS)

    # normalize the values to all lie within [-1, 1] -- helps with RMSE comparisons
    # can use the tsUtils.unnormalize() function to convert everything back to the original range at the end, if needed
    max1 = np.nanmax([combinedTS, combinedTS2, combinedTS3])
    min1 = np.nanmin([combinedTS, combinedTS2, combinedTS3])
    max2 = np.nanmax(meanTS)
    min2 = np.nanmin(meanTS)
    maxVal = np.max([max1, max2])
    minVal = np.min([min1, min2])

    combinedTS = tsUtils.normalize(combinedTS, maxVal, minVal)
    combinedTS2 = tsUtils.normalize(combinedTS2, maxVal, minVal)
    combinedTS3 = tsUtils.normalize(combinedTS3, maxVal, minVal)
    meanTS = tsUtils.normalize(meanTS, maxVal, minVal)

    # produce timestamps
    timestamps = np.arange('2017-09-10 20:30:00',
                           timeSteps,
                           dtype='datetime64[1m]')  # arbitrary start date

    # split the data
    trainDataMaster = combinedTS[0:trainPoints]  # the true realized values, needed for comparisons later
    trainDataMaster2 = combinedTS2[0:trainPoints]
    trainDataMaster3 = combinedTS3[0:trainPoints]

    meanTrainData = meanTS[0:trainPoints]  # only needed for various statistical comparisons later

    # randomly hide training data
    (trainData, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster), p)
    (trainData2, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster2), p)
    (trainData3, pObservation) = tsUtils.randomlyHideValues(copy.deepcopy(trainDataMaster3), p)

    # now further hide consecutive entries for a very small fraction of entries in the eventual training matrix
    (trainData, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData), 0.95, int(M1 * 0.25), M1)
    (trainData2, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData2), 0.95, int(M1 * 0.25), M1)
    (trainData3, pObservation) = tsUtils.randomlyHideConsecutiveEntries(
        copy.deepcopy(trainData3), 0.95, int(M1 * 0.25), M1)

    # once the NaNs have been handled (interpolation is left commented out below),
    # pObservation should be set back to 1.0
    pObservation = 1.0

    # interpolating Nans with linear interpolation
    #trainData = tsUtils.nanInterpolateHelper(trainData)
    #trainData2 = tsUtils.nanInterpolateHelper(trainData2)
    #trainData3 = tsUtils.nanInterpolateHelper(trainData3)

    # test data and hidden truth
    testData = combinedTS[-1 * testPoints:]
    testData2 = combinedTS2[-1 * testPoints:]
    testData3 = combinedTS3[-1 * testPoints:]

    meanTestData = meanTS[-1 * testPoints:]  # only needed for various statistical comparisons

    # time stamps
    trainTimestamps = timestamps[0:trainPoints]
    testTimestamps = timestamps[-1 * testPoints:]

    # create pandas df
    trainMasterDF = pd.DataFrame(index=trainTimestamps,
                                 data={
                                     key1: trainDataMaster,
                                     key2: trainDataMaster2,
                                     key3: trainDataMaster3
                                 })  # needed for reference later
    trainDF = pd.DataFrame(index=trainTimestamps,
                           data={
                               key1: trainData,
                               key2: trainData2,
                               key3: trainData3
                           })
    meanTrainDF = pd.DataFrame(index=trainTimestamps,
                               data={key1: meanTrainData})

    testDF = pd.DataFrame(index=testTimestamps,
                          data={
                              key1: testData,
                              key2: testData2,
                              key3: testData3
                          })
    meanTestDF = pd.DataFrame(index=testTimestamps, data={key1: meanTestData})

    # train the model
    print("Training the model (imputing)...")
    nbrSingValuesToKeep = 5
    mod = SVDModel(key1,
                   nbrSingValuesToKeep,
                   N,
                   M1,
                   probObservation=pObservation,
                   svdMethod='numpy',
                   otherSeriesKeysArray=otherkeys,
                   includePastDataOnly=includePastDataOnly)

    # uncomment below to run the ALS algorithm ; comment out the above line
    #mod = ALSModel(key1, nbrSingValuesToKeep, N, M1, probObservation=pObservation, otherSeriesKeysArray=otherkeys, includePastDataOnly=True)
    mod.fit(trainDF)

    # imputed + denoised data
    imputedDf = mod.denoisedDF()

    print(" RMSE (training imputation vs mean) = %f" %
          tsUtils.rmse(meanTrainDF[key1].values, imputedDf[key1].values))
    print(" RMSE (training imputation vs obs)  = %f" %
          tsUtils.rmse(trainMasterDF[key1].values, imputedDf[key1].values))

    print("Forecasting (#points = %d)..." % len(testDF))

    # test data is used for point-predictions
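    # Window sizes: the target series always contributes its past N-1 values.
    # When includePastDataOnly is True, the donor series are treated the same
    # way (N-1 past points each); otherwise each donor also contributes its
    # value at the current time step, for N points in total.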
    otherTSPoints = N
    if includePastDataOnly:
        otherTSPoints = N - 1
    forecastArray = []
    for i in range(0, len(testDF)):

        pastPointsPrediction = np.zeros(N - 1)  # for the series of interest, only the past N-1 points are used

        # first fill in the time series of interest
        j = 0
        if i < N - 1:  # the first prediction uses the end of the training data
            while j < N - 1 - i:
                pastPointsPrediction[j] = trainMasterDF[key1].values[len(trainDF) - (N - 1 - i) + j]
                j += 1

        if j < N - 1:  # use the new test data
            pastPointsPrediction[j:] = testDF[key1].values[i - (N - 1) + j:i]

        # now fill in the other series
        otherSeriesDataDict = {}
        for key in otherkeys:
            pastPointsOthers = np.zeros(otherTSPoints)  # an appropriately sized vector of past points for each donor series
            j = 0
            if i < N - 1:  # the first prediction uses the end of the training data
                while j < N - 1 - i:
                    pastPointsOthers[j] = trainMasterDF[key].values[len(trainDF) - (N - 1 - i) + j]
                    j += 1

            if j < otherTSPoints:  # use the new test data
                if includePastDataOnly:
                    pastPointsOthers[j:] = testDF[key].values[i - (N - 1) + j:i]
                else:
                    pastPointsOthers[j:] = testDF[key].values[i - (N - 1) + j:i + 1]

            otherSeriesDataDict.update({key: pastPointsOthers})

        otherKeysToSeriesDFNew = pd.DataFrame(data=otherSeriesDataDict)
        keyToSeriesDFNew = pd.DataFrame(data={key1: pastPointsPrediction})

        prediction = mod.predict(otherKeysToSeriesDFNew,
                                 keyToSeriesDFNew,
                                 bypassChecks=False)
        forecastArray.append(prediction)

    print(" RMSE (prediction vs mean) = %f" %
          tsUtils.rmse(meanTestDF[key1].values, forecastArray))
    print(" RMSE (prediction vs obs)  = %f" %
          tsUtils.rmse(testDF[key1].values, forecastArray))

    print("Plotting...")
    plt.plot(np.concatenate((trainMasterDF[key1].values, testDF[key1].values),
                            axis=0),
             color='gray',
             label='Observed')
    plt.plot(np.concatenate(
        (meanTrainDF[key1].values, meanTestDF[key1].values), axis=0),
             color='red',
             label='True Means')
    plt.plot(np.concatenate((imputedDf[key1].values, forecastArray), axis=0),
             color='blue',
             label='Forecasts')
    plt.axvline(x=len(trainDF),
                linewidth=1,
                color='black',
                label='Training End')
    legend = plt.legend(loc='upper left', shadow=True)
    plt.title('Multiple Time Series (ARMA + Periodic + Trend) - $p = %.2f$' % p)
    plt.show()
class RobustSyntheticControl(object):

    # seriesToPredictKey:       (string) the series of interest (key)
    # kSingularValuesToKeep:    (int) the number of singular values to retain
    # M:                        (int) the number of columns for the matrix
    # probObservation:          (float) the independent probability of observation of each entry in the matrix
    # modelType:                (string) 'svd' or 'als'. Default is 'svd'
    # svdMethod:                (string) the SVD method to use (optional)
    # otherSeriesKeysArray:     (array) an array of keys for other series which will be used to predict

    def __init__(self,
                 seriesToPredictKey,
                 kSingularValuesToKeep,
                 M,
                 probObservation=1.0,
                 modelType='svd',
                 svdMethod='numpy',
                 otherSeriesKeysArray=None):

        self.seriesToPredictKey = seriesToPredictKey
        # avoid a mutable default argument
        self.otherSeriesKeysArray = otherSeriesKeysArray if otherSeriesKeysArray is not None else []

        self.N = 1  # each series is on its own row
        self.M = M

        self.kSingularValues = kSingularValuesToKeep
        self.svdMethod = svdMethod

        self.p = probObservation

        if modelType == 'als':
            self.model = ALSModel(
                self.seriesToPredictKey,
                self.kSingularValues,
                self.N,
                self.M,
                probObservation=self.p,
                otherSeriesKeysArray=self.otherSeriesKeysArray,
                includePastDataOnly=False)

        else:  # 'svd' and any unrecognized modelType default to SVD
            self.model = SVDModel(
                self.seriesToPredictKey,
                self.kSingularValues,
                self.N,
                self.M,
                probObservation=self.p,
                svdMethod=self.svdMethod,
                otherSeriesKeysArray=self.otherSeriesKeysArray,
                includePastDataOnly=False)

        self.control = None  # these are the synthetic control weights

    # keyToSeriesDF: (pandas DataFrame) one column of values per series key.
    # Note that the keys provided in the constructor MUST all be present.
    # The values must all be numpy arrays of floats.
    def fit(self, keyToSeriesDF):

        self.model.fit(keyToSeriesDF)

    # otherKeysToSeriesDFNew:   (pandas DataFrame) must contain all the donor keys provided to the model;
    #                             all series/arrays MUST have length >= 1;
    #                             a prediction is produced for each row (time point) supplied
    def predict(self, otherKeysToSeriesDFNew):
        prediction = np.dot(
            self.model.weights,
            otherKeysToSeriesDFNew[self.otherSeriesKeysArray].T)
        return prediction

    # return the synthetic control weights
    def getControl(self):

        if self.model.weights is None:
            raise Exception(
                'Before calling getControl() you need to call the fit() method first.'
            )

        else:
            return self.model.weights
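
# A minimal usage sketch for the class above (illustrative only: the keys
# 'treated'/'donor1'/'donor2' and the pre/post-intervention DataFrames are
# hypothetical, not part of the library):
#
#   rsc = RobustSyntheticControl('treated',
#                                kSingularValuesToKeep=5,
#                                M=preInterventionDF.shape[0],
#                                probObservation=1.0,
#                                modelType='svd',
#                                otherSeriesKeysArray=['donor1', 'donor2'])
#   rsc.fit(preInterventionDF)   # learns the synthetic-control (donor) weights
#   counterfactual = rsc.predict(postInterventionDF[['donor1', 'donor2']])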

from sklearn.exceptions import NotFittedError  # raised by predict()/getControl() before fit()


class RobustSyntheticControl(object):

    # seriesToPredictKey:       (string) the series of interest (key)
    # kSingularValues:          (int) the number of singular values to retain
    # p:                        (float) the independent probability of observation of each entry in the matrix
    # modelType:                (string) 'svd' or 'als'. Default is 'svd'
    # svdMethod:                (string) the SVD method to use (optional)
    # otherSeriesKeysArray:     (array) an array of keys for other series which will be used to predict
    # Note: M (the number of matrix columns) is inferred from the training data in fit()/create_model().

    def __init__(
        self,
        seriesToPredictKey=None,
        kSingularValues=5,
        p=1.0,
        modelType="svd",
        svdMethod="numpy",
        otherSeriesKeysArray=None,
    ):
        self.seriesToPredictKey = seriesToPredictKey
        self.kSingularValues = kSingularValues
        self.p = p
        self.modelType = modelType
        self.svdMethod = svdMethod
        # avoid a mutable default argument
        self.otherSeriesKeysArray = otherSeriesKeysArray if otherSeriesKeysArray is not None else []

        self.N = 1  # each series is on its own row
        self.model = None
        self.control = None  # these are the synthetic control weights

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self, deep=True):
        return dict(
            seriesToPredictKey=self.seriesToPredictKey,
            kSingularValues=self.kSingularValues,
            p=self.p,
            modelType=self.modelType,
            svdMethod=self.svdMethod,
            otherSeriesKeysArray=self.otherSeriesKeysArray,
        )

    def create_model(self, X_train):
        M = len(X_train)
        if self.modelType == "als":
            self.model = ALSModel(
                self.seriesToPredictKey,
                self.kSingularValues,
                self.N,
                M,
                probObservation=self.p,
                otherSeriesKeysArray=self.otherSeriesKeysArray,
                includePastDataOnly=False,
            )
        else:  # default: SVD (also used for any unrecognized modelType)
            self.model = SVDModel(
                self.seriesToPredictKey,
                self.kSingularValues,
                self.N,
                M,
                probObservation=self.p,
                svdMethod=self.svdMethod,
                otherSeriesKeysArray=self.otherSeriesKeysArray,
                includePastDataOnly=False,
            )

    # X_train: (pandas DataFrame) one column of values per series key.
    # y_train is ignored but accepted for compatibility with some sklearn methods.
    # Note that the keys provided in the constructor MUST all be present.
    # The values must all be numpy arrays of floats.
    def fit(self, X_train, y_train=None):
        # The model must be (re)created here because M is inferred from the
        # training data (scikit-learn API compatibility).
        self.create_model(X_train)
        self.model.fit(X_train)
        return self  # scikit-learn convention: fit() returns the estimator

    # otherKeysToSeriesDFNew:   (pandas DataFrame) must contain all the donor keys provided to the model;
    #                             all series/arrays MUST have length >= 1;
    #                             a prediction is produced for each row (time point) supplied
    def predict(self, otherKeysToSeriesDFNew):
        if self.model is None:
            raise NotFittedError("Cannot call predict() before fit()")
        prediction = np.dot(
            self.model.weights,
            otherKeysToSeriesDFNew[self.otherSeriesKeysArray].T)
        return prediction

    def rmse(self, y_pred, y_true):
        return np.sqrt(np.mean((y_pred - y_true)**2))

    # Score = -RMSE (higher is better, following the sklearn convention)
    def score(self, X, y_true=None):
        if y_true is None:
            y_true = X[self.seriesToPredictKey]
        y_pred = self.predict(X)
        return -self.rmse(y_pred, y_true)

    # return the synthetic control weights
    def getControl(self):
        if self.model is None:
            raise NotFittedError(
                "Cannot get model weights before calling fit()")
        else:
            return self.model.weights
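
# A minimal usage sketch for the scikit-learn-compatible variant above
# (illustrative only: the keys and DataFrames are hypothetical):
#
#   rsc = RobustSyntheticControl(seriesToPredictKey='treated',
#                                kSingularValues=5,
#                                otherSeriesKeysArray=['donor1', 'donor2'])
#   rsc.fit(preInterventionDF)            # M is inferred from the data length
#   counterfactual = rsc.predict(postInterventionDF)
#   print(rsc.score(postInterventionDF))  # -RMSE against the observed series
#   rsc.get_params()                      # supports sklearn model selection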