Example no. 1
    def get(self, request, *args, **kwargs):
        n_steps = int(self.request.query_params.get('nsteps', 10))

        last_date = MultivarientData.objects.latest(
            'date').date + datetime.timedelta(days=30)

        data = read_frame(MultivarientData.objects.all())
        data['date'] = pd.to_datetime(data['date'])
        data = data.drop('id', axis=1)
        data = data.set_index('date')
        oildf = data['oil_price']
        date_index = pd.date_range(start=last_date, periods=n_steps, freq='M')
        df = pd.DataFrame()
        arima = SARIMAX(endog=oildf, order=(3, 0, 4), freq='M', seasonal_order=(0, 1, 1, 6), trend='t',
                        enforce_stationarity=False, enforce_invertibility=False).fit()
        df['oilpriceprediction'] = arima.predict(
            date_index.min(), date_index.max())
        arima = SARIMAX(endog=data['iron_price'], exog=data[['oil_price']], order=(1, 0, 0), freq='M',
                        seasonal_order=(0, 1, 1, 6), trend='t', enforce_stationarity=False,
                        enforce_invertibility=False).fit()

        df['ironpriceprediction'] = arima.predict(df.index.min(), df.index.max(),
                                                  exog=df[['oilpriceprediction']])
        df['date'] = df.index
        oil_data = df[['date', 'oilpriceprediction']].values.tolist()
        iron_data = df[['date', 'ironpriceprediction']].values.tolist()

        return Response({'oil_data': oil_data, 'iron_data': iron_data})
Example no. 2
def run_sarimax(language):
    create_predictions_folder()
    series = read_csv(os.path.join(DATA_FOLDER, language + CSV_FILE_SUFFIX),
                      header=0,
                      parse_dates=[0],
                      index_col=0,
                      squeeze=True,
                      date_parser=parser)

    data = series.values.tolist()
    train, test = data[:-12], data[-12:]
    model_fit = SARIMAX(train, order=(2, 1, 4),
                        seasonal_order=(1, 1, 1, 12)).fit()
    dates_list = get_future_date_list()
    print(len(dates_list))
    test_pred = model_fit.predict(len(train) + 1, len(data), dynamic=True)
    future_pred = model_fit.predict(len(data), len(data) + 59, dynamic=True)
    pyplot.figure()
    pyplot.title("Predictions based on SARIMAX model : " + language +
                 " repositories")
    pyplot.plot(series, label='Historical data')
    pyplot.plot(series.keys().tolist(),
                [None for i in range(len(train))] + test_pred.tolist(),
                label='Predictions - Test data')
    pyplot.plot(dates_list, future_pred, label='Predictions - 2019 to 2023')
    pyplot.legend()
    pyplot.savefig(
        os.path.join(PREDICTIONS_FOLDER,
                     language + "_predictions_SARIMAX.png"))

    rmse = RMSE(test, test_pred)
    print('SARIMAX RMSE: %.3f' % rmse + " for " + language + " repos test set")
    write_to_csv([str(date_)[:-3] for date_ in dates_list], future_pred,
                 language, SARIMAX_)
    return future_pred
Example no. 3
    def get(self, request, *args, **kwargs):
        first_date = str(MultivarientData.objects.earliest('date').date)
        last_date = str(MultivarientData.objects.latest(
            'date').date - datetime.timedelta(days=250))

        start_date = self.request.query_params.get('startdate', first_date)
        end_date = self.request.query_params.get('enddate', last_date)

        if end_date > last_date:
            end_date = last_date

        date_valid = MultivarientData.objects.exclude(
            date__gt=end_date).exclude(date__lt=start_date)
        if not date_valid:
            start_date = first_date
            end_date = last_date

        data = read_frame(MultivarientData.objects.all())
        data['date'] = pd.to_datetime(data['date'])
        data = data.drop('id', axis=1)
        data = data.set_index('date')

        startdate = dat.strptime(start_date, '%Y-%m-%d')
        enddate = dat.strptime(end_date, '%Y-%m-%d')

        nextmonth = enddate + relativedelta.relativedelta(months=1)
        train, test = data[startdate:nextmonth], data[nextmonth:]
        oiltrain = train['oil_price']
        arima = SARIMAX(endog=oiltrain, order=(3, 0, 4), freq='M', seasonal_order=(0, 1, 1, 6), trend='t',
                        enforce_stationarity=False, enforce_invertibility=False).fit()
        test['oilpriceprediction'] = arima.predict(
            test.index.min(), test.index.max())
        arima = SARIMAX(endog=train['iron_price'], exog=train[['oil_price']], order=(1, 0, 0), freq='M',
                        seasonal_order=(0, 1, 1, 6), trend='t', enforce_stationarity=False,
                        enforce_invertibility=False).fit()

        test['ironpriceprediction'] = arima.predict(test.index.min(), test.index.max(),
                                                    exog=test[['oilpriceprediction']])
        test['date'] = test.index.astype('str')

        ironactual_data = test[['date', 'iron_price']].values.tolist()
        ironpredicted_data = test[[
            'date', 'ironpriceprediction']].values.tolist()
        oilactual_data = test[['date', 'oil_price']].values.tolist()
        oilpredicted_data = test[[
            'date', 'oilpriceprediction']].values.tolist()
        ironmetrics = forecast_accuracy(
            test['iron_price'], test['ironpriceprediction'])
        oilmetrics = forecast_accuracy(
            test['oil_price'], test['oilpriceprediction'])
        return Response({
            'actual_irondata': ironactual_data, 
            'predicted_irondata': ironpredicted_data, 
            'actual_oildata': oilactual_data, 
            'predicted_oildata': oilpredicted_data, 
            'ironmape': ironmetrics.get('mape', 0)*100, 
            'oilmape': oilmetrics.get('mape', 0)*100}
        )
Example no. 4
def sarimax(train,test):
	train_pred=pd.DataFrame(data=None,index=train.index,columns=train.columns) # in sample predictions on train set
	test_pred=pd.DataFrame(data=None,index=test.index,columns=test.columns) # out of sample prediction on test set	
	for (i,train_day,test_day) in [(i, dp.split(train,nsplits=7)[i], dp.split(test,nsplits=7)[i]) for i in dp.split(train,nsplits=7)]: # for each day
		train_pred_day=pd.DataFrame(data=None,index=train_day.index,columns=train_day.columns) # in sample predictions on train set
		test_pred_day=pd.DataFrame(data=None,index=test_day.index,columns=test_day.columns) # out of sample prediction on test set
		for hour in train_day: # for each hour in a day
			train_day_hour=train_day[hour] # train samples for particular hour
			test_day_hour=test_day[hour] # test samples for particular hour
			model_train = SARIMAX(train_day_hour, order=(0,1,1),seasonal_order=(0,1,1,7),trend='c',measurement_error=True).fit() # train model
			model_test=SARIMAX(pd.concat([train_day_hour,test_day_hour]), order=(0,1,1),seasonal_order=(0,1,1,7),trend='c',measurement_error=True).filter(model_train.params) # workaround for rolling day ahead forecast
			train_pred_day[hour]=model_test.predict(start=0,end=len(train_day)-1) # predict in sample on train set
			test_pred_day[hour]=model_test.predict(start=len(train_day)) # predict out of sample on test set
		train_pred.iloc[i::7,:]=train_pred_day # fill corresponding rows with in sample predictions
		test_pred.iloc[i::7,:]=test_pred_day # fill corresponding rows with out of sample predictions
	return train_pred,test_pred
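
The per-hour loop above relies on statsmodels' `filter` to apply parameters fitted on the training window to the full series, so the test-period forecasts stay one-step-ahead without refitting. A minimal self-contained sketch of that pattern, on synthetic daily data with an assumed weekly seasonality, might look like this:

import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Synthetic daily series with a weekly pattern (purely illustrative).
idx = pd.date_range('2020-01-01', periods=140, freq='D')
series = pd.Series(np.sin(np.arange(140) * 2 * np.pi / 7) + np.random.normal(0, 0.1, 140), index=idx)
train, test = series[:105], series[105:]

# Fit on the training window only.
fitted = SARIMAX(train, order=(0, 1, 1), seasonal_order=(0, 1, 1, 7), trend='c').fit(disp=False)
# Re-apply the fitted parameters to the full series; no re-estimation happens here.
applied = SARIMAX(series, order=(0, 1, 1), seasonal_order=(0, 1, 1, 7), trend='c').filter(fitted.params)
# In-sample predictions for the training span, out-of-sample predictions for the test span.
train_pred = applied.predict(start=0, end=len(train) - 1)
test_pred = applied.predict(start=len(train), end=len(series) - 1)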
Example no. 5
def sarima_model(request):
    df = pd.read_csv('sales/data/IPN31152N.csv', index_col=0)
    df.index = pd.date_range(start='1972-01-01', end='2020-01-01', freq='M')
    train_df = df[df.index <= '2017-12-31']
    test_df = df[df.index > '2017-12-31']
    model1 = SARIMAX(train_df['IPN31152N'],
                     order=(3, 1, 3),
                     seasonal_order=(0, 1, 1, 12)).fit()
    pred = model1.predict(start=len(train_df),
                          end=len(train_df) + len(test_df) - 1)
    df_pred = pd.DataFrame(pred)
    df_pred.columns = ['IPN31152N']
    results = {
        'test': [[time_unix(test_df.index[i]), test_df.iloc[i]['IPN31152N']]
                 for i in range(0, len(test_df))],
        'predict':
        [[time_unix(df_pred.index[i]), df_pred.iloc[i]['IPN31152N']]
         for i in range(0, len(df_pred))]
    }
    re = {}
    re['2018'] = [
        round(
            measure_metric(test_df['IPN31152N'][:12].values,
                           df_pred['IPN31152N'][:12]) * 100, 2)
    ]
    re['2019'] = [
        round(
            measure_metric(test_df['IPN31152N'][-12:].values,
                           df_pred['IPN31152N'][-12:]) * 100, 2)
    ]
    context = {"data": json.dumps(results), 'mape': json.dumps(re)}
    return render(request, 'charts_model.html', context=context)
Example no. 6
def testTime(request):
    try:
        df = pd.read_csv('MyApp/data/IPN31152N.csv', index_col=0)
        df.index = pd.date_range(start='1972-01-01',
                                 end='2020-01-01',
                                 freq='M')
        train_df = df[df.index <= '2017-12-31']
        test_df = df[df.index > '2017-12-31']
        model1 = SARIMAX(train_df['IPN31152N'],
                         order=(3, 1, 1),
                         seasonal_order=(0, 1, 1, 12)).fit()
        pred = model1.predict(start=len(train_df),
                              end=len(train_df) + len(test_df) - 1)
        df_pred = pd.DataFrame(pred)
        df_pred.columns = ['IPN31152N']
        results = {
            'test':
            [[time_unix(test_df.index[i]), test_df.iloc[i]['IPN31152N']]
             for i in range(0,
                            len(test_df) - 1)],
            'predict':
            [[time_unix(df_pred.index[i]), df_pred.iloc[i]['IPN31152N']]
             for i in range(0,
                            len(df_pred) - 1)]
        }
        #context = {"data":json.dumps(results), "aa":"hihi","haha":"hahahaha"}
        return Response({'result': results})
    except ValueError as e:
        # return JsonResponse(e.args[0],status.HTTP_400_BAD_REQUEST)
        return Response(status=status.HTTP_400_BAD_REQUEST)
Example no. 7
def sarima(data, col, train, test, order_val, s_ord, tr, frequency):
    """
    data - Entire Dataframe
    col - Target value
    train - Train Data Frame
    test - Test Data Frame
    order_val - (p,d,q)
    s_ord - (P,D,Q,s)
    tr - str {'n', 'c', 't', 'ct'} or iterable, optional
    frequency - prefix for the saved plot filename
    """
    
    y_hat_avg = test.copy()
    fit1 = SARIMAX(train[col], order=order_val,seasonal_order=s_ord, trend=tr ).fit()
    y_hat_avg['SARIMA'] = fit1.predict(start=train.index[-1], end=test.index[-1], dynamic=True)

    print('Rmse= ', rmse(test[col], y_hat_avg['SARIMA']))
    #print(y_hat_avg)
    plt.figure(figsize=(16,8))
    plt.plot( train[col], label='Train')
    plt.plot(test[col], label='Test')
    plt.plot(y_hat_avg['SARIMA'], label='SARIMA')
    plt.legend(loc='best')
    plt.savefig(frequency+'sarima.png')
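
For reference, a hypothetical call of the sarima() helper above; the file name, column, split dates, and orders are illustrative and not taken from the original project:

import pandas as pd

# Hypothetical monthly dataset and train/test split (assumed, not from the source).
df = pd.read_csv('sales.csv', parse_dates=['date'], index_col='date')
train, test = df[:'2018-12-31'], df['2019-01-01':]

sarima(df, 'sales', train, test,
       order_val=(1, 1, 1), s_ord=(0, 1, 1, 12), tr='c', frequency='monthly_')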
Example no. 8
def sarima_models_for_27_zipcodes():
    #sarima orders after running grid search
    sarima_orders = [((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 1, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 1, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 1, 12)), ((1, 1, 1), (1, 1, 1, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 1), (1, 1, 0, 12)), ((1, 1, 0), (1, 1, 0, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 1, 12)),
                     ((1, 1, 0), (1, 1, 0, 12)), ((1, 1, 1), (1, 1, 0, 12)),
                     ((1, 1, 0), (1, 1, 0, 12))]

    #training models based on optimal sarima orders
    regions = zipcodes_top27()

    data = load_data_top_27()

    train, test = train_test_split(data, '2013-01-01', '2017-10-01')

    sarima_test_predictions = []
    sarima_models = []
    for i in range(len(regions)):
        model = SARIMAX(train.iloc[:, i],
                        order=sarima_orders[i][0],
                        seasonal_order=sarima_orders[i][1],
                        enforce_invertibility=False,
                        enforce_stationarity=False).fit()
        test_preds = model.predict(start=test.iloc[:, i].index[0],
                                   end=test.iloc[:, i].index[-1],
                                   typ='levels')
        sarima_test_predictions.append(test_preds)
        sarima_models.append(model)

    sns.set(font_scale=1)
    sns.set_style('white')
    pd.plotting.register_matplotlib_converters()
    fig, ax = plt.subplots(9, 3, figsize=(20, 18))
    i = 0
    for row in range(9):
        for col in range(3):
            err = round(
                np.sqrt(mse(test.iloc[:, i], sarima_test_predictions[i])), 0)
            test.iloc[:, i].plot(ax=ax[row][col],
                                 color='blue',
                                 label='Actual :' + str(regions[i]))
            sarima_test_predictions[i].plot(ax=ax[row][col],
                                            color='k',
                                            label='Preds, RMSE = ' + str(err))
            ax[row][col].legend(loc='upper left')
            i += 1

    return plt.show()
Example no. 9
 def arimax(self, gr, feat, param):
     # if no external features, no forecast result
     if self.ext is None:
         return pd.DataFrame(columns = ['ds', 'y'])
     # input monthly data
     df = self.df_m.copy()
     df = self.monthlyfeat(self.df_m, col=feat)
     df['y'] = self.valtogr(df) if gr else df['y']
     # clean data - drop null from growth calculation, fill 0 when no external data
     df = df.iloc[len([x for x in df['y'] if pd.isnull(x)]):, :]
     df = df.fillna(0).reset_index(drop=True)
     # prepare data
     x = df['y'].values
     ex = df.iloc[:, 2:].values
     # fit model1 with external
     m1 = SARIMAX(x, exog=ex, order=(param['p'], param['d'], param['q']), initialization='approximate_diffuse')
     m1 = m1.fit(disp = False)
     # prepare external data
     df_pred = pd.DataFrame(columns = ['ds', 'y'])
     for i in self.dt_m:
         df_pred = df_pred.append({'ds' : i} , ignore_index=True)
         df_pred = self.monthlyfeat(df_pred, col=feat)
         if np.isnan(list(df_pred.iloc[-1, 2:].values)).any():
             df_pred = df_pred.iloc[:-1, :]
             break
     # forecast model1
     ex_pred = df_pred.iloc[:, 2:].values
     r1 = m1.predict(start=df.index[-1] + 1, end=df.index[-1] + ex_pred.shape[0], exog=ex_pred)
     # model2 (used when there is no external features in future prediction)
     if len(r1) < self.fcst_pr:
         # fit model2 without external
         m2 = SARIMAX(x, order=(param['p'], param['d'], param['q']), initialization='approximate_diffuse')
         m2 = m2.fit(disp = False)
         # forecast model2
         r2 = m2.predict(start=df.index[-1] + ex_pred.shape[0] + 1, end=df.index[-1] + self.fcst_pr)
     else:
         r2 = []
     # summarize result
     r = list(r1) + list(r2)
     r = pd.DataFrame(zip(self.dt_m, r), columns =['ds', 'y'])
     r['y'] = self.grtoval(r, self.df_m) if gr else r['y']
     return self.correctzero(r)
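
The arimax() method above falls back to a second, exogenous-free model for the part of the horizon where future external features are unknown. A minimal stand-alone sketch of that two-model pattern, with made-up data and orders, could read:

import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

y = np.random.normal(size=48)               # target history (illustrative)
ex = np.random.normal(size=(48, 1))         # exogenous history
ex_future = np.random.normal(size=(3, 1))   # exog only known for 3 of the 6 future steps
horizon = 6

# Model 1: uses the exogenous regressor for the steps it covers.
m1 = SARIMAX(y, exog=ex, order=(1, 0, 1), initialization='approximate_diffuse').fit(disp=False)
r1 = m1.predict(start=len(y), end=len(y) + ex_future.shape[0] - 1, exog=ex_future)

# Model 2: plain SARIMAX for the remaining steps without exogenous data.
m2 = SARIMAX(y, order=(1, 0, 1), initialization='approximate_diffuse').fit(disp=False)
r2 = m2.predict(start=len(y) + ex_future.shape[0], end=len(y) + horizon - 1)

forecast = np.concatenate([r1, r2])         # combined 6-step forecast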
Example no. 10
def walk_forward_validation_single_run(model, train, test):
    fitted = SARIMAX(train.values,
                     order=(model.ar, model.d, model.ma),
                     seasonal_order=(model.s_ar, model.s_d, model.s_ma,
                                     model.s_period),
                     trend='c').fit()
    predicted_vals = fitted.predict(
        model.d, train.shape[0] - model.d + test.shape[0] -
        1)  # typ arg only exists for ARIMA, not SARIMAX model
    rmsse = get_rmsse(train, test, predicted_vals[-test.shape[0]:])
    return rmsse
Example no. 11
def meta_grid_search(ts,
                     TEST_SIZE=0.2,
                     model_kws={},
                     verbose=True,
                     return_kws=False):
    import pmdarima as pm
    from statsmodels.tsa.statespace.sarimax import SARIMAX

    ## Train Test Split
    idx_split = get_train_test_split_index(ts, TEST_SIZE=TEST_SIZE)

    ts_train = ts.iloc[:idx_split].copy()
    ts_test = ts.iloc[idx_split:].copy()

    ## Combine Default kwargs and model_kws
    model_kwargs = dict(start_p=0,
                        start_q=0,
                        start_P=0,
                        start_Q=0,
                        max_p=5,
                        max_q=6,
                        max_P=5,
                        max_Q=5,
                        max_D=3,
                        suppress_warnings=True,
                        stepwise=False,
                        trace=False,
                        m=6,
                        seasonal=True,
                        with_intercept=True,
                        stationary=False)

    for k, v in model_kws.items():
        model_kwargs[k] = v

    if verbose:
        print("pm.auto_arima args:")
        print(model_kwargs)

    model = pm.auto_arima(ts_train, **model_kwargs)
    display(model.summary())

    model_sarimax = SARIMAX(ts_train, **model.get_params()).fit()

    preds = model_sarimax.predict(ts_test.index[0], ts_test.index[-1])
    res = get_model_metrics(ts_test, preds, ts_train)
    display(res)

    return model_sarimax
Example no. 12
 def arima(self, gr, param):
     # input monthly data
     df = self.df_m.copy()
     df['y'] = self.valtogr(df) if gr else df['y']
     df = df.dropna().reset_index(drop=True)
     # prepare training data
     x = df['y'].values
     # fit model
     m = SARIMAX(x, order=(param['p'], param['d'], param['q']), initialization='approximate_diffuse')
     m = m.fit(disp = False)
     # forecast
     r = m.predict(start=df.index[-1] + 1, end=df.index[-1] + self.fcst_pr)
     r = pd.DataFrame(zip(self.dt_m, r), columns =['ds', 'y'])
     r['y'] = self.grtoval(r, self.df_m) if gr else r['y']
     return self.correctzero(r)
Example no. 13
def pipeline(data, cfg):
    if cfg['autoencoder']:
        # encoder, cfg = pre_training(data=avocado_data, cfg=cfg)
        autoencoder = Autoencoder(data, cfg)
        autoencoder.train()
        autoencoder.test()
    else:
        autoencoder = None
    # Extract data
    train_x, train_y, train_f = data.get_train_sequence()
    plt.figure()
    plt.plot(data.data)
    plt.show()

    # Fit model
    mc_model = MonteCarloNetwork(data, autoencoder, cfg)
    mc_model.train(train_x, train_y, train_f)

    # model = train_model(train_x, train_y, cfg)

    # Fit seasonal arima
    # sarimax = Sarimax(data.get, cfg)

    test_x, test_y, test_f = data.get_test_sequence()

    # Forecast on the last proportion of the data set
    mse_test, pred_test = monte_carlo_dropout(mc_model, test_x, test_y, test_f)

    model_es = ExponentialSmoothing(data.train)
    model_es = model_es.fit()
    pred_es = model_es.predict(start=data.test.index[cfg['sequence_length']],
                               end=data.test.index[-1])
    model_arima = SARIMAX(data.train, order=(3, 1, 0))
    model_arima = model_arima.fit()
    pred_arima = model_arima.predict(
        start=data.test.index[cfg['sequence_length']], end=data.test.index[-1])

    pred_es = np.asarray(pred_es).reshape(test_y.shape)
    pred_arima = np.asarray(pred_arima).reshape(test_y.shape)
    print('======= Test Statistics =======')
    statistics(test_x, test_y, mse_test, pred_test)
    print('Exponential Smoothing:', mean_squared_error(test_y, pred_es))
    print('SARIMAX:', mean_squared_error(test_y, pred_arima))
    plt.figure()
    plt.plot(data.data)
    plt.show()
    plot_airpassengers(data.data, pred_test, mse_test, pred_es, pred_arima)
Example no. 14
    def get(self, request, *args, **kwargs):
        first_date = str(UnivarientData.objects.earliest('date').date)
        last_date = str(UnivarientData.objects.latest(
            'date').date - datetime.timedelta(days=250))

        start_date = self.request.query_params.get('startdate', first_date)
        end_date = self.request.query_params.get('enddate', last_date)

        if end_date > last_date:
            end_date = last_date

        date_valid = UnivarientData.objects.exclude(
            date__gt=end_date).exclude(date__lt=start_date)

        if not date_valid:
            start_date = first_date
            end_date = last_date

        data = read_frame(UnivarientData.objects.all())
        data['date'] = pd.to_datetime(data['date'])
        data = data.drop('id', axis=1)
        data = data.set_index('date')

        startdate = dat.strptime(start_date, '%Y-%m-%d')
        enddate = dat.strptime(end_date, '%Y-%m-%d')

        nextmonth = enddate + relativedelta.relativedelta(months=1)

        train, test = data[startdate:nextmonth], data[nextmonth:]

        arima = SARIMAX(train, order=(1, 0, 2), freq='M', seasonal_order=(
            1, 1, 2, 6), trend='t', enforce_stationarity=False, enforce_invertibility=False).fit()
        predict = arima.predict(test.index.min(), test.index.max())
        predictdata = pd.DataFrame(
            predict, index=test.index, columns=['predictprice'])
        metrics = forecast_accuracy(predictdata.values, test.values)

        predictdata['actual'] = test.values
        predictdata['date'] = predictdata.index.astype('str')

        actual_data = predictdata[['date', 'actual']].values.tolist()
        predicted_data = predictdata[['date', 'predictprice']].values.tolist()

        return Response({'actual_data': actual_data, 'predicted_data': predicted_data, 'mape': metrics.get('mape', 0)*100})
Example no. 15
class Sarimax:
    def __init__(self, df, cfg):
        self.series = df[cfg['target_feature']]
        self.model = SARIMAX(self.series,
                             order=(3, 1, 0),
                             seasonal_order=(0, 0, 0, 12))

    def fit_model(self):
        # Fit model
        self.model = self.model.fit(disp=0)
        print(self.model.summary())

    def plot_autocorrelation(self):
        # Plot auto correlation
        autocorrelation_plot(self.series)
        plt.show()

    def predict_arima(self, series):
        return self.model.predict(series)
Example no. 16
    def get(self, request, *args, **kwargs):
        n_steps = int(self.request.query_params.get('nsteps', 10))

        last_date = UnivarientData.objects.latest(
            'date').date + datetime.timedelta(days=30)

        data = read_frame(UnivarientData.objects.all())
        data['date'] = pd.to_datetime(data['date'])
        data = data.drop('id', axis=1)
        data = data.set_index('date')
        arima = SARIMAX(data, order=(1, 0, 2), freq='M', seasonal_order=(1, 2, 1, 6),
                        enforce_stationarity=False, enforce_invertibility=False, ).fit()
        date_index = pd.date_range(start=last_date, periods=n_steps, freq='M')
        data = pd.DataFrame()
        data['prediction'] = arima.predict(date_index.min(), date_index.max())
        data['date'] = date_index
        predicted_data = data[['date', 'prediction']].values.tolist()
        return Response({'predicted_data': predicted_data})
Example no. 17
def Auto_Arima(df,dirloc,filename):
    import itertools
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    
    p=d=q=range(0,3)
    pdq = list(itertools.product(p,d,q))
    seas_decomp=[]
    for x in pdq:
        x1=(x[0],x[1],x[2],12)
        seas_decomp.append(x1)
    print("Computating AIC of Different Sesonal ARIMA.....\n")
    arima_order=[]
    seas_order=[]
    aic_val=[]
    
    for params in pdq:
        for seas_par in seas_decomp:
            mod = SARIMAX(df,order=params,seasonal_order=seas_par,enforce_stationarity=False, enforce_invertibility=False,freq="MS").fit()
            arima_order.append(params)
            seas_order.append(seas_par)
            aic_val.append(round(mod.aic,2))
            print("SARIMA: {} X {} | AIC = {}".format(params,seas_par,round(mod.aic,2)))
            
    results = pd.DataFrame({"ARIMA Order":arima_order,"Seasonal Order":seas_order,"AIC Value":aic_val}) 
    results_sorted = results.sort_values(by="AIC Value",ascending=True)
    results_sorted=results_sorted.reset_index(drop=True)
    print("Selected SARIMA Order:",results_sorted.head(2))
    
    final_model = SARIMAX(df,order=results_sorted["ARIMA Order"][0],seasonal_order=results_sorted["Seasonal Order"][0],enforce_stationarity=False, enforce_invertibility=False,freq="MS").fit()
    print("Final Model Result Summary {}".format(final_model.summary()))
    print(results_sorted["ARIMA Order"][0])
    print(results_sorted["Seasonal Order"][0])
    predictions = final_model.predict(start=dt.datetime.strptime("2020-06-01","%Y-%m-%d"),end=dt.datetime.strptime("2020-12-01","%Y-%m-%d"))
    print("Average Monthly WTI Crude Oil Spot Price from June to Dec 2020:")
    print(predictions)
    with open(os.path.join(dirloc[:-5],outputfile),"a") as f:
         f.write("Simulation Result of SARIMA....\n")
         f.write(str(results_sorted))
         f.write("\n")
         f.write(str(predictions))
    return results_sorted
Example no. 18
# Prepare the data
X = timeseriesgenerator
y = 

from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# ========================= SARIMAX =========================
from statsmodels.tsa.statespace.sarimax import SARIMAX

mod = SARIMAX(data['ln_wpi'], trend='c', order=(1,1,(1,0,0,1)))
mod = mod.fit(X_train)
print(mod.summary())

pred = mod.predict(X_test)
plt.plot(X, y)
plt.plot(X_test, pred)




# ========================= XGBoost =========================
from xgboost import XGBRegressor 

xgb = XGBRegressor()
xgb.fit(X_train_scaled, y_train)
pred = xgb.predict(X_test)

plt.plot()
Example no. 19
def run_app():
    image = Image.open('Air_pol.JPG')

    st.image(image, use_column_width=True)
    no2 = Image.open('no2.JPG')
    st.sidebar.image(no2, use_column_width=True)

    df2 = df.copy()
    df2 = df2['Nitrogen_dioxide']
    train = df2[0:-30]
    test = df2[-30:]

    add_selectbox = st.sidebar.selectbox(
        "Select Forecasting Model",
        ("Simple Moving Average", "LSTM", "Triple Exponential Smoothing",
         "Seasonal ARIMA", "Gradient Boosting Regressor",
         "ML Model Comparison Table"))
    st.sidebar.info(
        'This application is developed by Siddhesh D. Munagekar to forecast Nitrogen dioxide concentration in air using multiple forecasting techniques'
    )

    if add_selectbox == 'Simple Moving Average':
        df1 = df.Nitrogen_dioxide.copy()
        df1 = pd.DataFrame(df1)
        df1['SMA_20'] = df1.Nitrogen_dioxide.rolling(20, min_periods=1).mean()
        df1['SMA_10'] = df1.Nitrogen_dioxide.rolling(10, min_periods=1).mean()
        df1['SMA_3'] = df1.Nitrogen_dioxide.rolling(3, min_periods=1).mean()
        fig = plt.figure()

        df1.plot(figsize=(25, 15))
        plt.xlabel('Date', fontsize=20)
        plt.ylabel('Nitrogen dioxide', fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.title("Simple Moving  Average for 20, 10 and 3 days", fontsize=30)
        plt.legend(
            labels=['Temperature', '20-days SMA', '10-days SMA', '3-days SMA'],
            fontsize=22)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'],
                                          df1['SMA_20'])
        st.write("MAE for 20 days is {:,.2f}".format(mae))
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'],
                                          df1['SMA_10'])
        st.write("MAE for 10 days is {:,.2f}".format(mae))
        mae = metrics.mean_absolute_error(df['Nitrogen_dioxide'], df1['SMA_3'])
        st.write("MAE for 3 days is {:,.2f}".format(mae))

    if add_selectbox == 'Triple Exponential Smoothing':

        train = pd.DataFrame(train)
        test = pd.DataFrame(test)
        pred = test.copy()
        fit1 = ExponentialSmoothing(np.asarray(train['Nitrogen_dioxide']),
                                    trend='add',
                                    seasonal_periods=7,
                                    seasonal='add').fit()

        pred['Holt_Winter'] = fit1.forecast(len(test))
        # Calculate KPI's
        mae = metrics.mean_absolute_error(test.Nitrogen_dioxide,
                                          pred.Holt_Winter)

        # Plot
        plt.figure(figsize=(16, 8))
        plt.plot(train['Nitrogen_dioxide'], label='Train')
        plt.plot(test['Nitrogen_dioxide'], label='Test')
        plt.plot(pred['Holt_Winter'],
                 label='Holt_Winter (MAE={:.2f})'.format(mae))
        plt.title("Triple Exponential smoothing", fontsize=30)
        plt.xlabel('Date', fontsize=20)
        plt.ylabel('Nitrogen dioxide', fontsize=20)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)
        st.write("MAE for 30 days is {:,.2f}".format(mae))

    ##Seasonal_Arima
    if add_selectbox == 'Seasonal ARIMA':

        df3 = df.copy()
        #train = df3[0:-30]
        test = df3[-30:]

        model = SARIMAX(df3['Nitrogen_dioxide'],
                        order=(0, 1, 0),
                        seasonal_order=(2, 1, 0, 30),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        results = model.fit()

        df3['predicted_test'] = results.predict(start=360,
                                                end=390,
                                                dynamic=True)

        seasonal_forecast = pd.DataFrame(results.forecast(len(test)))
        seasonal_forecast = seasonal_forecast.rename(
            {0: 'Seasonal forecast for 30 periods'}, axis=1)

        plt.figure(figsize=(16, 8))
        seasonal_forecast.plot(figsize=(25, 10), color='green')
        df3['Nitrogen_dioxide'].plot(figsize=(20, 10))
        df3['predicted_test'].plot(figsize=(20, 10))
        plt.legend(fontsize=19)
        plt.ylabel("Nitrogen_dioxide", fontsize=20)
        plt.xlabel('Date', fontsize=20)
        plt.title("Seasonal Arima", fontsize=30)
        plt.legend(fontsize=19)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.grid()
        plt.show()
        st.pyplot(use_column_width=True)

        # Calculate KPI
        mae = metrics.mean_absolute_error(df3.Nitrogen_dioxide[360:],
                                          df3.predicted_test[360:])

        st.write("MAE of Seasonal Arima is  {:.2f}".format(mae))

    if add_selectbox == 'ML Model Comparison Table':

        acc_table = {
            'Model': [
                'Linear Regression', 'Decision Tree', 'Random_forest',
                'Gradient_Boosting'
            ],
            'Train_score': [0.59, 0.70, 0.91, 0.83],
            'Test_score': [0.49, 0.46, 0.40, 0.50],
            'MAE_train': [4265.36, 3713.86, 2220.76, 2958.29],
            'MAE_test': [3053.96, 3116.57, 3161.29, 2726.36]
        }
        acc_table = pd.DataFrame(acc_table)
        acc_table = acc_table.sort_values(
            by='Test_score', ascending=False).reset_index(drop=True)
        st.table(acc_table)

    #Gradient Boosting
    if add_selectbox == 'Gradient Boosting Regressor':

        df55 = df.Nitrogen_dioxide.copy()
        df55 = pd.DataFrame(df55)
        dfML = pd.DataFrame()
        for i in range(7, 0, -1):
            dfML[['t-' + str(i)]] = df55.shift(i)

        dfML['t'] = df55.values
        df_ML = dfML[7:]
        # Split Data into dependent(target) and independent(features) variables

        df_ML22 = df_ML.values
        # Lagged variables (features) and original time series data (target)
        X2 = df_ML22[:, 0:
                     -1]  # slice all rows and start with column 0 and go up to but not including the last column
        y2 = df_ML22[:,
                     -1]  # slice all rows and last column, essentially separating out 't' column

        traintarget_size = int(len(y2) * 0.70)
        train_target, test_target = y2[:traintarget_size], y2[
            traintarget_size:len(y2)]
        trainfeature_size = int(len(X2) * 0.70)
        train_feature, test_feature = X2[:trainfeature_size], X2[
            trainfeature_size:len(X2)]

        gbr = GradientBoostingRegressor(max_features=3,
                                        max_depth=2,
                                        learning_rate=0.1,
                                        n_estimators=100,
                                        subsample=0.8,
                                        random_state=50)

        gbr.fit(train_feature, train_target)

        gbr_train_70_30 = gbr.score(train_feature, train_target)
        gbr_test_70_30 = gbr.score(test_feature, test_target)

        plot_test_pred = gbr.predict(test_feature)
        plot_test_pred = pd.DataFrame(plot_test_pred)
        plot_test_pred = plot_test_pred.rename({0: 'Predicted_test'}, axis=1)

        plot_test_target = pd.DataFrame(test_target)
        plot_test_target = plot_test_target.rename({0: 'Actual_test'}, axis=1)
        gbr_test_plot = pd.concat([plot_test_target, plot_test_pred], axis=1)

        gbr_test_plot.plot(
            title='Gradient boosting Actual vs Predicted test of last 116 days'
        )
        plt.grid()
        st.pyplot(use_column_width=True)
        st.write("Gradient boosting training score {:.2f}".format(
            round(gbr_train_70_30, 2)))
        st.write("Gradient boosting test score {:.2f}".format(
            round(gbr_test_70_30, 2)))

        if st.checkbox("Visualize for last 10 days"):
            gbr_test_plot[106:].plot(title=' GBR Plot of last 10 days')
            st.pyplot(use_column_width=True)

    if add_selectbox == 'LSTM':

        data = df.copy()
        data = data.iloc[:, 7].values
        data = data.reshape(-1, 1)
        data = data.astype('float32')

        # Scaling the data
        scalar = MinMaxScaler()
        data = scalar.fit_transform(data)

        train_lstm = data[:-30, :]
        test_lstm = data[-30:, :]

        # Building the 2D array for supervised learning
        def create_dataset(sequence, time_step):
            dataX = []
            dataY = []
            for i in range(len(sequence) - time_step - 1):
                a = sequence[i:(i + time_step), 0]
                dataX.append(a)
                dataY.append(sequence[i + time_step, 0])
            return np.array(dataX), np.array(dataY)

        time_step = 1
        # Apply the 2D array function to train and test datasets
        train_X, train_Y = create_dataset(train_lstm, time_step)
        test_X, test_Y = create_dataset(test_lstm, time_step)

        train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
        test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))

        # Build the LSTM Model
        model = Sequential()
        # Adding the input layer and LSTM layer
        model.add(
            LSTM(50,
                 activation='relu',
                 input_shape=(1, time_step),
                 return_sequences=True))
        model.add(LSTM(50, return_sequences=True))
        model.add(LSTM(50))
        model.add(Dropout(0.15))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        model.fit(train_X, train_Y, batch_size=4, epochs=50, verbose=2)

        # Make predictions

        train_predict = model.predict(train_X)
        test_predict = model.predict(test_X)
        # inverting predictions
        train_predict = scalar.inverse_transform(train_predict)
        train_Y = scalar.inverse_transform([train_Y])
        test_predict = scalar.inverse_transform(test_predict)
        test_Y = scalar.inverse_transform([test_Y])
        # calculate mean absolute error
        train_score = mean_absolute_error(train_Y[0], train_predict[:, 0])

        test_score = mean_absolute_error(test_Y[0], test_predict[:, 0])

        # LSTM plot
        train_plot = np.empty_like(
            data)  # create an array with the same shape as provided
        train_plot[:, :] = np.nan
        train_plot[time_step:len(train_predict) + time_step, :] = train_predict
        # shifting test predictions for plotting
        test_plot = np.empty_like(data)
        test_plot[:, :] = np.nan
        test_plot[len(train_predict) + (time_step * 2) + 1:len(data) -
                  1, :] = test_predict
        # plot baseline and predictions
        plt.figure(figsize=(16, 8))

        plt.plot(train_plot)
        plt.plot(test_plot, color='green')
        plt.plot(scalar.inverse_transform(data), color='orange')
        plt.title(
            "Long Short Term Memory Network with train ,test and forecast",
            fontsize=20)
        plt.ylabel("Nitrogen_dioxide", fontsize=20)
        plt.legend(labels=['Train plot', 'Test set', 'LSTM forecast'],
                   fontsize=19)
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        plt.grid()

        plt.show()
        st.pyplot(use_column_width=True)

        st.write('Train Score: %.3f MAE' % (train_score))
        st.write('Test Score: %.3f MAE' % (test_score))

        if st.checkbox('Visualize forecasted chart for 10 future days'):
            test_predict = scalar.fit_transform(test_predict)
            time_step = 10
            x_input = test_predict[(len(test_predict) - time_step):].reshape(
                1, -1)
            # Converting it to list
            temp_input = list(x_input)
            # Arranging list vertically
            temp_input = temp_input[0].tolist()

            # demonstrate prediction for next 10 days

            lst_output = []
            future_day = 10
            n_steps = 10
            i = 0
            # Forecast next 10 days output
            while (i < future_day):

                if (len(temp_input) > 10):

                    x_input = np.array(temp_input[1:])
                    print("{} day input {}".format(i, x_input))
                    x_input = x_input.reshape(1, -1)
                    # Converting to 3d array for lstm
                    x_input = x_input.reshape(1, n_steps, 1)
                    # print(x_input)
                    ypred = model.predict(x_input, verbose=0)
                    print("{} day predicted output {}".format(i, ypred))
                    # adding predicted output  to temp_input list
                    temp_input.extend(ypred[0].tolist())
                    temp_input = temp_input[1:]

                    # print(temp_input)
                    lst_output.extend(ypred.tolist())
                    i = i + 1
                else:
                    x_input = x_input.reshape((n_steps, 1, 1))
                    ypred = model.predict(x_input, verbose=0)
                    print("Predicted y of 0 day", ypred[0])
                    # Adding ypred value to temp_input (previous input)
                    temp_input.extend(ypred[0].tolist())
                    print(len(temp_input))
                    lst_output.extend(ypred.tolist())
                    i = i + 1
                # print(lst_output)

            previous_days1 = np.arange(len(data) - n_steps, len(data))
            predicted_future1 = np.arange(len(data), len(data) + future_day)
            lst_output = lst_output[:future_day]
            outputlist = data.tolist()
            outputlist.extend(lst_output)
            #data[len(data) - n_steps:]

            plt.plot(
                np.append(previous_days1, predicted_future1),
                scalar.inverse_transform(outputlist[len(data) - n_steps:]))
            plt.plot(predicted_future1, scalar.inverse_transform(lst_output))
            plt.title("Forecast for 10 future days", fontsize=20)
            plt.legend(fontsize=19)
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=8)
            plt.ylabel("Nitrogen dioxide")
            plt.show()
            st.pyplot(use_column_width=True)
Example no. 20
def main(dataset):
    def plot_cf(ts, field):
        """NOTE: I did NOT write this function. It was taken from:
        http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/
        """
        lag_acf = acf(field, nlags=20)
        lag_pacf = pacf(field, nlags=20)
        #Plot ACF:
        plt.subplot(121)
        plt.plot(lag_acf)
        plt.axhline(y=0, linestyle='--', color='gray')
        plt.axhline(y=-1.96 / np.sqrt(len(df_diff)),
                    linestyle='--',
                    color='gray')
        plt.axhline(y=1.96 / np.sqrt(len(df_diff)),
                    linestyle='--',
                    color='gray')
        plt.title('Autocorrelation Function')
        #Plot PACF:
        plt.subplot(122)
        plt.plot(lag_pacf)
        plt.axhline(y=0, linestyle='--', color='gray')
        plt.axhline(y=-1.96 / np.sqrt(len(ts)), linestyle='--', color='gray')
        plt.axhline(y=1.96 / np.sqrt(len(ts)), linestyle='--', color='gray')
        plt.title('Partial Autocorrelation Function')
        plt.show()

    if dataset == 1:
        df = pd.read_csv('q1_train.csv')
        df.Date = pd.to_datetime(df.Date)
        df.set_index("Date", inplace=True)

        ### Differenced Signal ###
        df_diff = df - df.shift()
        df_diff.dropna(inplace=True)
        df_diff_2 = df - df.shift(52)
        df_diff_2.dropna(inplace=True)
        df_diff_3 = df_diff - df_diff.shift(52)
        df_diff_3.dropna(inplace=True)

        p, q = 0, 2
        arma_fit = SARIMAX(df, order=(p, 1, q),
                           seasonal_order=(1, 1, 1, 52)).fit(disp=-1)
        prediction = arma_fit.predict(start=525, end=525 + 103, dynamic=True)

        ### Plots the original data with the prediction ###
        plt.plot(df, color='blue')
        plt.plot(prediction, color='red')
        plt.title("Original Data with Predictions for Two Years")
        plt.show()

        # Put the predictions into a .txt file
        txt = open("Q1_Daniel_March_24196320.txt", 'w')
        txt.write('"x"')
        for line in prediction:
            txt.write("\n" + str(line))

    elif dataset == 2:
        df = pd.read_csv('q2_train.csv')
        df.Date = pd.to_datetime(df.Date)
        df.set_index("Date", inplace=True)

        df_diff = df - df.shift()
        df_diff.dropna(inplace=True)
        df_diff_2 = df - df.shift(52)
        df_diff_2.dropna(inplace=True)
        df_diff_3 = df_diff - df_diff.shift(52)
        df_diff_3.dropna(inplace=True)
        # plot_cf(df_diff,df_diff.activity)

        p, q = 0, 2
        arma_fit = SARIMAX(df, order=(p, 1, q),
                           seasonal_order=(1, 1, 1, 52)).fit(disp=-1)
        prediction = arma_fit.predict(start=525, end=525 + 103, dynamic=True)

        ## Plots the original data with the prediction ###
        plt.plot(df, color='blue')
        plt.plot(prediction, color='red')
        plt.title("Original Data with Predictions for Two Years")
        plt.show()

        # Put the predictions into a .txt file
        txt = open("Q2_Daniel_March_24196320.txt", 'w')
        txt.write('"x"')
        for line in prediction:
            txt.write("\n" + str(line))

    if dataset == 3:
        df = pd.read_csv('q3_train.csv')
        df.Date = pd.to_datetime(df.Date)
        df.set_index("Date", inplace=True)

        ### Differenced Signal ###
        df_diff = df - df.shift()
        df_diff.dropna(inplace=True)
        df_diff_2 = df - df.shift(52)
        df_diff_2.dropna(inplace=True)
        df_diff_3 = df_diff - df_diff.shift(52)
        df_diff_3.dropna(inplace=True)

        p, q = 0, 2
        arma_fit = SARIMAX(df, order=(p, 1, q),
                           seasonal_order=(1, 1, 1, 52)).fit(disp=-1)
        prediction = arma_fit.predict(start=525, end=525 + 103, dynamic=True)

        ### Plots the original data with the prediction ###
        plt.plot(df, color='blue')
        plt.plot(prediction, color='red')
        plt.title("Original Data with Predictions for Two Years")
        plt.show()

        # Put the predictions into a .txt file
        txt = open("Q3_Daniel_March_24196320.txt", 'w')
        txt.write('"x"')
        for line in prediction:
            txt.write("\n" + str(line))

    if dataset == 4:
        df = pd.read_csv('q4_train.csv')
        df.Date = pd.to_datetime(df.Date)
        df.set_index("Date", inplace=True)

        ### Differenced Signal ###
        df_diff = df - df.shift()
        df_diff.dropna(inplace=True)
        df_diff_2 = df - df.shift(52)
        df_diff_2.dropna(inplace=True)
        df_diff_3 = df_diff - df_diff.shift(52)
        df_diff_3.dropna(inplace=True)

        p, q = 0, 2
        arma_fit = SARIMAX(df, order=(p, 1, q),
                           seasonal_order=(1, 1, 1, 52)).fit(disp=-1)
        prediction = arma_fit.predict(start=525, end=525 + 103, dynamic=True)

        ### Plots the original data with the prediction ###
        plt.plot(df, color='blue')
        plt.plot(prediction, color='red')
        plt.title("Original Data with Predictions for Two Years")
        plt.show()

        # Put the predictions into a .txt file
        txt = open("Q4_Daniel_March_24196320.txt", 'w')
        txt.write('"x"')
        for line in prediction:
            txt.write("\n" + str(line))

    elif dataset == 5:
        df = pd.read_csv('q5_train.csv')
        df.Date = pd.to_datetime(df.Date)
        df.set_index("Date", inplace=True)

        ### Differenced Signal ###
        df_diff = df - df.shift()
        df_diff.dropna(inplace=True)
        df_diff_2 = df - df.shift(52)
        df_diff_2.dropna(inplace=True)
        df_diff_3 = df_diff - df_diff.shift(52)
        df_diff_3.dropna(inplace=True)
        # plot_cf(df_diff,df_diff.activity)

        p, q = 0, 2
        arma_fit = SARIMAX(df, order=(p, 1, q),
                           seasonal_order=(1, 1, 1, 52)).fit(disp=-1)
        prediction = arma_fit.predict(start=525, end=525 + 103, dynamic=True)

        ### Plots the original data with the prediction ###
        plt.plot(df, color='blue')
        plt.plot(prediction, color='red')
        plt.title("Original Data with Predictions for Two Years")
        plt.show()

        # Put the predictions into a .txt file
        txt = open("Q5_Daniel_March_24196320.txt", 'w')
        txt.write('"x"')
        for line in prediction:
            txt.write("\n" + str(line))
Example no. 21
def arima():
    failedMonths = 0 #Records if any months could not be successfully trained on (pred is zero)

    full_df=pd.read_csv('../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_mode.csv', infer_datetime_format=True, parse_dates=True)
    full_df['originalCases'] = full_df['num_cases'] #preserve original case values as additional feature

    by_state=full_df['sub_region_1'].unique()
    

    #shift all states data by offset and concatenate in order to prevent bleeding into other states' numbers
    offset = 14
    full_dataframe=pd.DataFrame()
    for region in by_state:
        temp=full_df.loc[(full_df['sub_region_1']==region)]
        temp=temp.loc[(temp['date']<'2020-11-20')]
        #Shift CDC data by offset value
        cdc_dataframe=temp['num_cases'].shift(periods=offset,fill_value=0)
        mobility_dataframe=temp.drop(columns=['date', 'num_cases'])
        all_states=pd.concat([cdc_dataframe, mobility_dataframe],axis=1)
        all_states=all_states.loc[(all_states['num_cases']>0)] #remove rows with zero cases
        full_dataframe=full_dataframe.append(all_states)

    #Build new full data array
    #mobility_dataframe_truc = mobility_dataframe.drop(columns=['date'])
    #full_dataframe = pd.concat([cdc_dataframe_truc, mobility_dataframe_truc], axis=1)
    #full_dataframe['originalCases'] = cdc_dataframe['newAndPnew'] #preserve original case values as additional feature
    #full_dataframe_noDate = full_dataframe.drop(columns=['submission_date'])
    #full_dataframe_noDate = full_dataframe_noDate.loc[(full_dataframe_noDate['newAndPnew']!=0)] #remove rows with zero cases

    #Find length of the shortest state dataframe
    minLength = np.inf
    for region in by_state:
        state_data=full_dataframe.loc[(full_dataframe['sub_region_1']==region)]
        length = state_data.shape[0]
        if length < minLength:
            minLength = length

    stride = 10 #trains a new model every {stride} days
    percentErrors = []
    for t in range(3):#(minLength-90)//stride):
        #Linear Mobility Data
        linearTrainX = []
        linearTrainy = []
        linearTestX = []
        linearTesty = []

        #Logarithmic Mobility Data
        logTrainX = []
        logTrainy = []
        logTestX = []
        logTesty = []

        MLPTrainX = []

        for region in by_state[:3]:
            state_data=full_dataframe.loc[(full_dataframe['sub_region_1']==region)].drop(columns=['sub_region_1', 'grocery_and_pharmacy_percent_change_from_baseline'])
            #Convert data to numpy
            linearData = state_data.to_numpy()
            logData = np.log(state_data+1-np.min(state_data.to_numpy())).to_numpy()

            timeTrain = np.arange(1,61).reshape(-1, 1)
            timeTest = np.arange(61,91).reshape(-1, 1)
        
            #Linear Mobility Data
            linearTrainX.append(linearData[t*stride:t*stride+60,1:])
            linearTrainy.append(linearData[t*stride:t*stride+60,:1])
            linearTestX.append(linearData[t*stride+60:t*stride+90,1:])
            linearTesty.append(linearData[t*stride+60:t*stride+90,:1])

            #Logarithmic Mobility Data
            logTrainX.append(logData[t*stride:t*stride+60,1:])
            logTrainy.append(logData[t*stride:t*stride+60,:1])
            logTestX.append(logData[t*stride+60:t*stride+90,1:])
            logTesty.append(logData[t*stride+60:t*stride+90,:1])

            
            MLPTrainXState = []
            for i,feature in enumerate(linearData[t*stride:t*stride+60,1:].T):
                #print("Feature:", i)
                #fit ARIMA
                #Perform grid search to determine ARIMA Order
                #stepwise_fit = auto_arima(feature, start_p = 1, start_q = 1, 
                #                max_p = 3, max_q = 3, m = 7, 
                #                start_P = 0, seasonal = True, 
                #                d = None, D = 1, trace = True, 
                #                error_action ='ignore',   # we don't want to know if an order does not work 
                #                suppress_warnings = True,  # we don't want convergence warnings 
                #                stepwise = True)           # set to stepwise 
                #stepwise_fit.summary() 
                #print("===============================================================================================")
                
                predictArima =[]
                arimaOrders = [(1,0,0),(1,0,1),(3,0,0),(1,0,0),(0,1,1),(1,0,0),(2,0,0)]
                seasonalOrders = [(2, 1, 0, 7), (2, 1, 0, 7), (1, 1, 0, 7), (1, 1, 0, 7),(0,1,1,7),(0,1,1,7),(2, 1, 0, 7)]

                model = SARIMAX(feature,  
                        order = arimaOrders[i],  
                        seasonal_order =seasonalOrders[i],
                        initialization='approximate_diffuse') 
        
                result = model.fit(disp=False) 
                if showPlot >=2 :
                    visualize_ARIMA(result, timeTrain, linearTrainX[:,i], timeTest, linearTestX[:,i])

                predictArima.append(result.predict(61, 90, typ = 'levels'))
                predictArima = np.mean(predictArima, axis=0)
                MLPTrainXState.append(predictArima)
            MLPTrainX.append(np.array(MLPTrainXState).T)
        MLPTrainX = np.array(MLPTrainX).reshape(-1,6)
        linearTrainX = np.array(linearTrainX).reshape(-1,6)
        linearTrainy = np.array(linearTrainy).reshape(-1,1)
        linearTesty = np.array(linearTesty).reshape(-1,1)

        #Use "Last known case value" as bias
        #(I completely made this up but it improved accuracy by ~5%)
        #bias1 = np.ones((30,1))#*linearTrainy[0]
        #bias2 = np.ones((30,1))#*linearTrainy[30]
        bias = np.ones((linearTrainX.shape[0],1))#np.vstack((bias1, bias2))
        linearTrainX = np.hstack((linearTrainX, bias))

        bias3 = np.ones((MLPTrainX.shape[0],1))#*linearTrainy[-1]
        MLPTrainX = np.hstack((MLPTrainX, bias3))
        
        failCounter = 0
        maxFail = 4
        while failCounter < maxFail: #Retrain if prediction is zero
            model = Sequential()
            #model.add(BatchNormalization())
            model.add(Dense(10, input_dim=7, activation='relu'))
            #model.add(Dropout(0.15))
            model.add(Dense(30, activation='relu'))
            #model.add(Dropout(0.15))
            model.add(Dense(1, activation='relu'))

            model.compile(optimizer='adam',loss='mean_squared_error', metrics=['accuracy'])
            model.fit(linearTrainX, linearTrainy, epochs=100, verbose=0)

            y_pred = model.predict(MLPTrainX)
            if np.sum(y_pred==0) < 0.1 * MLPTrainX.shape[0]:
                break
            print("Prediction is zero. Retraining...")
            failCounter += 1
            if failCounter == maxFail:
                failedMonths += 1
                percentError = 1
                print("Could not train model on this data")
        if failCounter != maxFail:
            error = y_pred-linearTesty
            percentError = np.abs(error/linearTesty).T
            percentErrorsByState = []
            print(percentError.shape)
            for i in range(len(by_state)):
                percentErrorsByState.append(percentError[i*30:(i+1)*30])
            percentErrorsByState = np.array(percentErrorsByState).reshape(51)
            print("Loss:", np.mean(percentError))
            #print("Percent Error:",percentError)
            percentErrors.append(percentErrorsByState)

        if showPlot >= 1 or np.mean(percentError) > 0.4:
            plt.plot(timeTrain, linearTrainy[0:60], label="Past")
            plt.plot(timeTest, linearTesty[0:30], label="True Future")
            plt.plot(timeTest, y_pred[0:30], label="Predicted Future")
            plt.plot(timeTest, MLPTrainX[0:30,-2], label="Predicted ARIMA (case only)")
            plt.legend()
            plt.show()
    print(np.array(percentErrors).shape)
    print("Failed Months:", failedMonths)
    print(np.mean(percentErrors, axis=1))
    plt.plot(np.mean(percentErrors, axis=1).flatten())
    plt.show()
    return
Example no. 22
def sarima_detect(train_set,
                  test_set,
                  shoptype,
                  categories,
                  thresh=1,
                  tol=0.4,
                  order=(0, 0, 0),
                  seasonal_order=(0, 1, 0, 12)):
    global outputs
    outputs = output_dir + shoptype + '\\'
    dicky_test = pd.DataFrame()
    if not os.path.exists(outputs + '\\broken'):
        os.makedirs(outputs + '\\broken')
    if not os.path.exists(outputs + '\\good'):
        os.makedirs(outputs + '\\good')

    for cat in categories:
        x = np.log(train_set[cat] + 1)
        y = np.log(test_set[cat] + 1)
        ax = plt.gca()
        x.plot(title=cat, colormap='jet')
        y.plot()
        x.rolling(6).mean().plot()
        #    x.interpolate(inplace = True)
        x.index = x.index.to_timestamp()
        #    result_mul = seasonal_decompose(x, model='addtive')
        #    deseasonalized = x / result_mul.seasonal
        #    results_AR = model.fit(disp=-1)
        #    x_log_diff = x - x.shift()
        #    x_log_diff.dropna(inplace=True)
        #    plt.plot(results_AR.fittedvalues, color='red')
        #    plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-x_log_diff)**2))
        #    deseasonalized.plot()
        sarima_mod = SARIMAX(x,
                             trend='n',
                             order=order,
                             seasonal_order=seasonal_order,
                             enforce_stationarity=False).fit()
        #    print(sarima_mod.summary())
        forecast = sarima_mod.predict('2018-07-01', '2019-04-01')
        forecast.plot()
        #    D_data = x.diff().dropna()
        #    D_data.columns = [u'sales diff']

        #    D_data.plot()
        #y.rolling(6).std().plot()
        ax.legend(["2017-2018", "2019", "rolling", "predicted"])
        y.index = y.index.to_timestamp()
        a = y.corr(forecast)
        diff = y - forecast
        b = diff.std()
        c = "{:.2%}".format(abs(diff[0]) / y[0])
        d = abs(y[0] - x[17]) / abs(forecast[0] - x[17])
        dftest = adfuller(x, autolag='AIC')
        dfoutput = pd.Series(dftest[0:4],
                             index=[
                                 'Test Statistic', 'p-value', '#Lags Used',
                                 'Number of Observations Used'
                             ])
        for key, value in dftest[4].items():
            dfoutput['Critical Value (%s)' % key] = value
    #    print(dfoutput)

        dfoutput['CAT'] = cat
        dicky_test = dicky_test.append(dfoutput.transpose(), ignore_index=True)
        dicky_test['if_unitroot'] = 0
        dicky_test.loc[
            dicky_test['Critical Value (10%)'] < dicky_test['Test Statistic'],
            'if_unitroot'] = 1
        dicky_test.loc[dicky_test.CAT == cat, 'AIC'] = sarima_mod.aic
        dicky_test.loc[dicky_test.CAT == cat, 'BIC'] = sarima_mod.bic
        dicky_test.loc[dicky_test.CAT == cat, 'corr'] = a
        dicky_test.loc[dicky_test.CAT == cat, 'diff_std'] = b
        dicky_test.loc[dicky_test.CAT == cat, 'diff_fore201901'] = c
        dicky_test.loc[dicky_test.CAT == cat, 'diff_18-19'] = d
        if (x.max() * (1 + tol) < y.max()) or (x.min() > y.min() * (1 + tol)):
            dicky_test.loc[dicky_test.CAT == cat, 'extremum'] = 1
        else:
            dicky_test.loc[dicky_test.CAT == cat, 'extremum'] = 0

        if (x.max() * (1 + tol) < y.max()) or (x.min() > y.min() *
                                               (1 + tol)) or (d > thresh):
            dicky_test.loc[dicky_test.CAT == cat, 'TAG'] = 1
            plt.savefig(outputs + 'broken\\' + cat + '.png')
        else:
            dicky_test.loc[dicky_test.CAT == cat, 'TAG'] = 0
            plt.savefig(outputs + 'good\\' + cat + '.png')

        plt.show()

    return dicky_test
Esempio n. 23
0
def baseline():
    showPlot = False
    np.set_printoptions(precision=3, suppress=True)

    mobility_dataframe = pd.read_csv('google_baseline_test.csv',
                                     infer_datetime_format=True,
                                     parse_dates=True)
    cdc_dataframe = pd.read_csv('cdc_baseline_test_movingAvg.csv',
                                infer_datetime_format=True,
                                parse_dates=True)

    #=========================FIND BEST OFFSET========================================

    bestLinearCorr = 0
    bestLogCorr = 0
    bestLinearOffset = -1
    bestLogOffset = -1
    bestLinearData = 0
    bestLogData = 0

    correlationScores = []
    correlationLogScores = []

    for offset in range(100):
        #Shift CDC data by offset value
        cdc_dataframe_truc = cdc_dataframe.shift(periods=offset, fill_value=0)

        #Build new full data array
        mobility_dataframe_truc = mobility_dataframe.drop(columns=['date'])
        full_dataframe = pd.concat(
            [cdc_dataframe_truc, mobility_dataframe_truc], axis=1)
        full_dataframe['originalCases'] = cdc_dataframe[
            'newAndPnew']  #preserve original case values as additional feature
        full_dataframe_noDate = full_dataframe.drop(
            columns=['submission_date'])
        full_dataframe_noDate = full_dataframe_noDate.loc[(
            full_dataframe_noDate['newAndPnew'] !=
            0)]  #remove rows with zero cases

        #Compute linear and logarithmic correlations
        linearCorr = full_dataframe_noDate.corr()
        linearCorr = linearCorr.to_numpy()[
            0, 1:]  #Take only correlations between 'cases' and mobility data

        logData = np.log(full_dataframe_noDate + 1 -
                         np.min(full_dataframe_noDate.to_numpy()))
        logCorr = logData.corr()
        logCorr = logCorr.to_numpy()[
            0, 1:]  #Take only correlations between 'cases' and mobility data

        print("Offset:", offset, "Correlation:    ", linearCorr)
        print("           Log Correlation:", logCorr)

        #Save best values
        if np.linalg.norm(linearCorr) > np.linalg.norm(bestLinearCorr):
            bestLinearCorr = linearCorr
            bestLinearOffset = offset
            bestLinearData = full_dataframe_noDate

        if np.linalg.norm(logCorr) > np.linalg.norm(bestLogCorr):
            bestLogCorr = logCorr
            bestLogOffset = offset
            bestLogData = logData

        correlationScores.append(np.linalg.norm(linearCorr))
        correlationLogScores.append(np.linalg.norm(logCorr))

    if showPlot:
        plt.plot(correlationScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Linear correlation vs. data offset")
        plt.show()
        plt.plot(correlationLogScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Logarithmic correlation vs. data offset")
        plt.show()

        #Plot data correlations
        #sns.pairplot(bestLinearData[['newAndPnew','retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline', 'parks_percent_change_from_baseline', 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline','originalCases']], diag_kind='kde')
        #plt.show()

        #sns.pairplot(bestLogData[['newAndPnew','retail_and_recreation_percent_change_from_baseline', 'grocery_and_pharmacy_percent_change_from_baseline', 'parks_percent_change_from_baseline', 'workplaces_percent_change_from_baseline', 'residential_percent_change_from_baseline','originalCases']], diag_kind='kde')
        #plt.show()

    print("Best Full Correlation:", bestLinearCorr)
    print("Best Full Correlation Norm:", np.linalg.norm(bestLinearCorr))
    print("Best Full Offset:", bestLinearOffset)

    print("Best Log Correlation:", bestLogCorr)
    print("Best Log Correlation Norm:", np.linalg.norm(bestLogCorr))
    print("Best Log Offset:", bestLogOffset)

    #=========================BEGIN MODEL FITTING========================================

    linearMSE = []
    logMSEAdj = []
    linearCasesMSE = []
    logCasesMSE = []
    logisticMSE = []
    dataNoise = []
    arimaMSE = []
    gaussMSE = []

    #Convert data to numpy
    linearCasesOnly = bestLinearData['originalCases'].to_numpy()
    logCasesOnly = np.log(linearCasesOnly + 1)
    bestLinearData = bestLinearData.to_numpy()
    bestLogData = bestLogData.to_numpy()

    stride = 3  #trains a new model every {stride} days
    maxEpoch = 100

    for t in range(
        (min(bestLinearData.shape[0], bestLogData.shape[0]) - 90) // stride):
        print("Training model:", t)

        #Linear Mobility Data
        linearTrainX = bestLinearData[t * stride:t * stride + 60, 1:]
        linearTrainy = bestLinearData[t * stride:t * stride + 60, :1]
        linearTestX = bestLinearData[t * stride + 60:t * stride + 90, 1:]
        linearTesty = bestLinearData[t * stride + 60:t * stride + 90, :1]

        #Logarithmic Mobility Data
        logTrainX = bestLogData[t * stride:t * stride + 60, 1:]
        logTrainy = bestLogData[t * stride:t * stride + 60, :1]
        logTestX = bestLogData[t * stride + 60:t * stride + 90, 1:]
        logTesty = bestLogData[t * stride + 60:t * stride + 90, :1]

        #Cases-only data
        linearCasesTrainX = linearCasesOnly[t * stride:t * stride + 60]
        logCasesTrainX = logCasesOnly[t * stride:t * stride + 60]
        linearCasesTestX = linearCasesOnly[t * stride + 60:t * stride + 90]
        logCasesTestX = logCasesOnly[t * stride + 60:t * stride + 90]

        timeTrain = np.arange(1, 61).reshape(-1, 1)
        timeTest = np.arange(61, 91).reshape(-1, 1)

        #Uncomment to add time data to mobility dataset
        #linearTrainX = np.hstack((linearTrainX, timeTrain))
        #logTrainX = np.hstack((logTrainX, timeTrain))
        #linearTestX = np.hstack((linearTestX, timeTest))
        #logTestX = np.hstack((logTestX, timeTest))

        #fit linear model
        linear_model = RidgeCV(cv=3).fit(linearTrainX, linearTrainy)

        predict = linear_model.predict(linearTestX)
        linearMSE.append(np.abs(predict - linearTesty) / linearTesty)

        #fit log model
        linear_model = RidgeCV(cv=3).fit(logTrainX, logTrainy)

        predict = linear_model.predict(logTestX)
        predictAdj = np.exp(predict) - 1 + np.min(
            full_dataframe_noDate.to_numpy(
            ))  #convert from log back to raw case number
        logMSEAdj.append(np.abs(predictAdj - linearTesty) / linearTesty)

        #fit linear cases only model
        cases_model = RidgeCV(cv=3).fit(timeTrain, linearCasesTrainX)
        if showPlot:
            visualize_cases(cases_model, timeTrain, linearCasesTrainX,
                            timeTest, linearCasesTestX)

        predict = cases_model.predict(timeTest)
        linearCasesMSE.append(
            np.abs(predict - linearCasesTestX) / linearCasesTestX)

        #fit log cases only model
        cases_model = RidgeCV(cv=3).fit(np.log(timeTrain), logCasesTrainX)
        if showPlot:
            visualize_cases(cases_model, np.log(timeTrain), logCasesTrainX,
                            np.log(timeTest), logCasesTestX)

        predict = cases_model.predict(np.log(timeTest))
        predictAdj = np.exp(
            predict) - 1  #convert from log back to raw case number
        logCasesMSE.append(
            np.abs(predictAdj - linearCasesTestX) / linearCasesTestX)

        #fit logistic model
        logistic_model, cov = optimize.curve_fit(
            logisticDerivative,
            timeTrain.reshape(linearCasesTrainX.shape),
            linearCasesTrainX,
            p0=[4 * np.max(linearCasesTrainX), 60, 1 / 30],
            maxfev=10000,
            bounds=(np.array([1, 0, 0]), np.array([20000, np.Inf, np.Inf])))
        if showPlot:
            visualize_logistic(logistic_model, timeTrain, linearCasesTrainX,
                               timeTest, linearCasesTestX)

        predictLogistic = logisticDerivative(
            timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
            logistic_model[1], logistic_model[2])
        logisticMSE.append(
            np.abs(predictLogistic - linearCasesTestX) / linearCasesTestX)

        predict = logisticDerivative(
            timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
            logistic_model[1], logistic_model[2])
        dataNoise.append(
            np.mean(np.abs(predict - linearCasesTrainX) / linearCasesTrainX))

        #fit stacking regressor
        #note: estimators must be (name, estimator) pairs, and a regressor (not a
        #classifier) is used for 'rf' since the target is a continuous case count
        #(RandomForestRegressor is assumed to be imported with the other sklearn models)
        estimators = [('lr', RidgeCV()),
                      ('svr', LinearSVR(random_state=42)),
                      ('rf', RandomForestRegressor(n_estimators=10,
                                                   random_state=42))]
        reg = StackingRegressor(estimators=estimators,
                                final_estimator=GaussianProcessRegressor(
                                    kernel=DotProduct() + WhiteKernel(),
                                    random_state=0))
        stacking_model = reg.fit(timeTrain, linearCasesTrainX)
        if showPlot:
            visualize_cases(stacking_model, timeTrain, linearCasesTrainX,
                            timeTest, linearCasesTestX)

        predict = stacking_model.predict(timeTest)
        linearCasesMSE.append(
            np.abs(predict - linearCasesTestX) / linearCasesTestX)

        #fit ARIMA
        #Perform grid search to determine ARIMA Order
        #stepwise_fit = auto_arima(linearCasesTrainX, start_p = 1, start_q = 1,
        #                  max_p = 3, max_q = 3, m = 7,
        #                  start_P = 0, seasonal = True,
        #                  d = None, D = 1, trace = True,
        #                  error_action ='ignore',   # we don't want to know if an order does not work
        #                  suppress_warnings = True,  # we don't want convergence warnings
        #                  stepwise = True)           # set to stepwise
        #stepwise_fit.summary()

        model = SARIMAX(linearCasesTrainX,
                        order=(2, 0, 0),
                        seasonal_order=(2, 1, 0, 7))

        result = model.fit(disp=False)
        if showPlot:
            visualize_ARIMA(result, timeTrain, linearCasesTrainX, timeTest,
                            linearCasesTestX)

        predictArima = result.predict(61, 90, typ='levels')
        arimaMSE.append(
            np.abs(predictArima - linearCasesTestX) / linearCasesTestX)

        #Evaluate other models to use as input to gaussian process
        arima1 = SARIMAX(linearCasesTrainX,
                         order=(2, 0, 0),
                         seasonal_order=(2, 1, 0, 7)).fit(disp=False)
        arima2 = SARIMAX(linearCasesTrainX,
                         order=(2, 0, 0),
                         seasonal_order=(2, 1, 1, 7)).fit(disp=False)
        arima3 = SARIMAX(linearCasesTrainX,
                         order=(1, 1, 0),
                         seasonal_order=(1, 1, 1, 7)).fit(disp=False)
        arima4 = SARIMAX(linearCasesTrainX,
                         order=(0, 1, 1),
                         seasonal_order=(1, 1, 1, 7)).fit(disp=False)
        arima5 = SARIMAX(linearCasesTrainX,
                         order=(0, 1, 1),
                         seasonal_order=(2, 1, 0, 7)).fit(disp=False)

        predictLog = cases_model.predict(np.log(timeTrain))  #Log model
        predictAdj = np.exp(
            predictLog) - 1  #convert from log back to raw case number
        predictLogistic = logisticDerivative(
            timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
            logistic_model[1], logistic_model[2])  #logistic model
        predictArima1 = arima1.predict(1, 60, typ='levels')
        predictArima2 = arima2.predict(1, 60, typ='levels')
        predictArima3 = arima3.predict(1, 60, typ='levels')
        predictArima4 = arima4.predict(1, 60, typ='levels')
        predictArima5 = arima5.predict(1, 60, typ='levels')

        testLog = cases_model.predict(np.log(timeTest))  #Log model
        testAdj = np.exp(
            testLog) - 1  #convert from log back to raw case number
        testLogistic = logisticDerivative(
            timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
            logistic_model[1], logistic_model[2])  #logistic model
        testArima1 = arima1.predict(61, 90, typ='levels')
        testArima2 = arima2.predict(61, 90, typ='levels')
        testArima3 = arima3.predict(61, 90, typ='levels')
        testArima4 = arima4.predict(61, 90, typ='levels')
        testArima5 = arima5.predict(61, 90, typ='levels')

        #fit gaussian process meta-learner
        gaussTrain = np.array([
            predictLogistic, predictArima1, predictArima2, predictArima3,
            predictArima4, predictArima5
        ]).T
        gaussTest = np.array([
            testLogistic, testArima1, testArima2, testArima3, testArima4,
            testArima5
        ]).T
        reg = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                       random_state=0)
        stacking_model = reg.fit(gaussTrain, linearCasesTrainX)
        predictTrain = stacking_model.predict(gaussTrain)
        predictTest = stacking_model.predict(gaussTest)
        if showPlot:
            visualize_gauss(
                np.hstack((predictTrain, predictTest)).T, timeTrain,
                linearCasesTrainX, timeTest, linearCasesTestX)

        gaussMSE.append(
            np.abs(predictTest - linearCasesTestX) / linearCasesTestX)

    #Plot proof-of-concept graph
    if True:
        plt.plot(np.array(linearMSE).mean(axis=0),
                 label='Mobility (linear, non-temporal)')
        plt.plot(np.array(logMSEAdj).mean(axis=0),
                 label='Mobility (logarithmic, non-temporal)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()

    #Plot baseline graph
    #plt.plot(np.array(linearCasesMSE).mean(axis=0), label='Cases (linear, temporal)') #Don't plot because performance is terrible
    plt.plot(np.array(logCasesMSE).mean(axis=0),
             label='Cases (logarithmic temporal)')
    plt.plot(np.array(logisticMSE).mean(axis=0),
             label='Cases (logistic temporal)')
    plt.plot(np.array(arimaMSE).mean(axis=0), label='Cases (ARIMA)')
    plt.plot(np.array(gaussMSE).mean(axis=0),
             label='Cases (Gaussian Process meta)')
    plt.xlabel("Days in advance to predict")
    plt.ylabel("Percent deviation from true value")
    plt.legend(loc="upper left")
    plt.show()

    print("Average logistic Test error:", np.mean(dataNoise))
def ts_crossvalidation(X,
                       param_grid,
                       y=None,
                       cv=5,
                       model="ARIMA",
                       ignore_warnings=True):
    """
    Function to perform time series cross validation using nested cross validation
    Only ARIMA and SVR models are supported right now
    :params
        X - Prediction variable with time series
        param_grid - dictionary with name of the parameter and list of options
        y - Target variable for SVR
        cv - number of folds to use in each validation
        model - model on which cross validation will be performed
        ignore_warnings - Option to silence warnings
    """
    if ignore_warnings:
        # Ignore all warnings for Nested Cross Validations
        warnings.filterwarnings("ignore")
    # Time series cross validation initialization
    tscv = TimeSeriesSplit(n_splits=cv)
    # Initialization of results
    grid_search_result = pd.DataFrame()
    # Getting possible combinations of parameters
    grid_list = param_grid_product(param_grid)
    if model == "ARIMA":
        # -----
        # ARIMA Model Cross validation start
        # -----
        assert 'order' in param_grid, "No order parameters for ARIMA"
        # Loop over possible combinations of parameters
        for grid_dict in grid_list:
            order = grid_dict['order']
            # Set defaults for ARIMA parameters
            param_arima = {
                'seasonal_order': (0, 0, 0, 0),
                'freq': None,
                'enforce_stationarity': True,
                'enforce_invertibility': True
            }
            # Update ARIMA parameters if they were specified
            for key in param_arima:
                if key in grid_dict:
                    param_arima[key] = grid_dict[key]
            # Initialize error lists
            rmse_list = []
            aic_list = []
            # Nested cross validation run per each configuration
            for train_index, test_index in tscv.split(X):
                # Train and test initialization for specific nested crossvalidation step
                X_train, X_test = X[train_index], X[test_index]
                # Model initialization and training
                try:
                    model = SARIMAX(
                        X_train,
                        freq=param_arima['freq'],
                        order=order,
                        seasonal_order=param_arima['seasonal_order'],
                        enforce_stationarity=param_arima[
                            'enforce_stationarity'],
                        enforce_invertibility=param_arima[
                            'enforce_invertibility']).fit(disp=0)
                    # Model test for crossvalidation step
                    pred = model.predict(X_test.index[0], X_test.index[-1])
                    error = np.sqrt(mean_squared_error(X_test, pred))
                    # Save results for crossvalidation step
                    rmse_list.append(error)
                    aic_list.append(model.aic)
                except:
                    # If error continue to next model evaluation
                    continue
            # Consolidate metrics for parameter configuration using mean
            try:
                total_error = np.mean(rmse_list)
                total_aic = np.mean(aic_list)
            except:
                # If error continue to next parameter configuration
                continue
            # Save results on main DataFrame
            to_append = pd.DataFrame([{
                'name':
                'ARIMA{}x{}'.format(order, param_arima['seasonal_order']),
                'AIC':
                total_aic,
                'RMSE':
                total_error
            }])
            grid_search_result = grid_search_result.append(to_append,
                                                           sort=False,
                                                           ignore_index=True)

    elif model == "SVR":
        # -----
        # SVR Model Cross validation start
        # -----
        assert y is not None, "No target variable samples (y) for SVR"
        for grid_dict in grid_list:
            param_svr = {
                'C': 1,
                'kernel': 'rbf',
                'gamma': 'scale',
            }
            for key in param_svr:
                if key in grid_dict:
                    param_svr[key] = grid_dict[key]
            # Initialize confidence lists and model
            confidence = None
            conf_list = []
            svr_rbf = SVR(kernel=param_svr['kernel'],
                          C=param_svr['C'],
                          gamma=param_svr['gamma'])
            for train_index, test_index in tscv.split(X):
                # Train and test initialization for specific nested crossvalidation step
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                try:
                    # Model training
                    svr_rbf.fit(X_train, y_train)
                    # Model test for nested crossvalidation step
                    svm_confidence = svr_rbf.score(X_test, y_test)
                    # Accuracy must be a valid number, otherwise the model failed to converge
                    assert (0 <= svm_confidence <= 1)
                    conf_list.append(svm_confidence)
                except:
                    continue
            # Consolidate confidence for parameter configuration using mean
            try:
                confidence = np.mean(conf_list)
            except:
                continue
            # Save results on main DataFrame
            to_append = pd.DataFrame([{
                'kernel': param_svr['kernel'],
                'gamma': param_svr['gamma'],
                'C': param_svr['C'],
                'confidence': confidence
            }])
            grid_search_result = grid_search_result.append(to_append,
                                                           sort=False,
                                                           ignore_index=True)

    return grid_search_result
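

# NOTE: param_grid_product() is called above but not shown in this excerpt; it
# presumably expands a dict of parameter lists into a list of dicts, e.g. (a sketch):
def param_grid_product_sketch(param_grid):
    keys = list(param_grid)
    return [dict(zip(keys, values))
            for values in itertools.product(*(param_grid[k] for k in keys))]


# A minimal usage sketch for ts_crossvalidation() on a small synthetic series; the
# parameter grid below is illustrative only:
_demo_series = pd.Series(50 + 10 * np.sin(np.arange(72) * 2 * np.pi / 12)
                         + np.random.rand(72))
_demo_grid = {'order': [(1, 0, 0), (1, 1, 1)],
              'seasonal_order': [(0, 1, 1, 12)]}
_demo_results = ts_crossvalidation(_demo_series, _demo_grid, cv=3, model="ARIMA")
print(_demo_results.sort_values('RMSE'))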
def TIME_SERIES_ALGO(df, bool_stat):
    dict_rmse = dict()

    bool_log, df_log = log_transformation(df)
    col = df.columns[0]
    # 1.. NAIVE APPROACH
    # IN THIS APPROACH WE ASSIGN THE MOST RECENT TRAINING VALUE TO THE TEST DATAFRAME

    try:
        train, test = train_test_split(df)

        y_prd = np.asarray([train.iloc[train.shape[0] - 1].values[0]] *
                           (test.shape[0]))

        rs_naive = sqrt(mean_squared_error(test[col].values, y_prd))
        print(rs_naive)
        dict_rmse["naive"] = rs_naive
        insert_into_database("NAIVE", rs_naive, "{}")

        if bool_log:
            # PERFORM SAME ABOVE THING FOR LOG TRANSFORMED DATA
            train, test = train_test_split(df_log)

            y_prd = np.asarray([train.iloc[train.shape[0] - 1].values[0]] *
                               (test.shape[0]))

            y_prd = np.exp(y_prd)

            rs_naive_log = sqrt(mean_squared_error(test[col].values, y_prd))
            print(rs_naive_log)
            dict_rmse["naive_log"] = rs_naive_log
            insert_into_database("NAIVE", rs_naive_log, "{}")

    except Exception as e:
        insert_into_database("NAIVE", None, e)
        print(("error in modelling in naive approach,{}".format(e)))

    # 2..SIMPLE AVERAGE
    try:

        train, test = train_test_split(df)
        mean_forecast = train[col].mean()
        y_prd = np.asarray([mean_forecast] * test.shape[0])
        rs_mean = sqrt(mean_squared_error(test[col].values, y_prd))
        dict_rmse["simple_avg"] = rs_mean
        insert_into_database("SIMPLE_AVG", rs_mean, "{}")

        if bool_log:
            train, test = train_test_split(df_log)
            mean_forecast = train[col].mean()
            y_prd = np.asarray([mean_forecast] * test.shape[0])

            y_prd = np.exp(y_prd)

            rs_mean = sqrt(mean_squared_error(test[col].values, y_prd))
            dict_rmse["simple_avg_log"] = rs_mean
            insert_into_database("SIMPLE_AVG", rs_mean, "{}")

    except Exception as e:
        insert_into_database("SIMPLE_AVG", None, e)
        print(("error in moving average,{}".format(e)))

    # 3..MOVING AVERAGE

    # IN PROGRESS HAVE TO MODIFY IT...
    try:
        train, test = train_test_split(df)
        for i in range(25, 90):
            # The rolling mean returns a value for each row; we only want the last row's mean because that is the only one used to forecast
            mean_moving = train[col].rolling(i).mean().iloc[train.shape[0] - 1]
            print(mean_moving)
            y_prd = np.asarray([mean_moving] * test.shape[0])
            rs_moving = sqrt(mean_squared_error(test[col].values, y_prd))
            insert_into_database("MVG_AVG", rs_moving, "{}")

    except Exception as e:
        insert_into_database("MVG_AVG", None, e)
        print(("error in moving average,{}".format(e)))
    try:

        if bool_log:
            for i in range(25, 90):
                train, test = train_test_split(df_log)

                # print(type(train[col].rolling(i).mean()))
                mean_moving = train[col].rolling(i).mean().iloc[train.shape[0] - 1]

                y_prd = np.array([mean_moving] * test.shape[0])
                print(y_prd)
                y_prd = np.exp(y_prd)

                rs_moving_log = sqrt(
                    mean_squared_error(test[col].values, y_prd))
                insert_into_database("MVG_AVERAGE", rs_moving_log, "{}")

    except Exception as e:
        insert_into_database("MVG_AVERAGE", None, e)
        print(("error in log moving average model, {}".format(e)))

    # 4.. SIMPLE EXPONENTIAL SMOOTHING
    try:
        train, test = train_test_split(df)
        fit2 = SimpleExpSmoothing(train[col]).fit(smoothing_level=0.6,
                                                  optimized=False)
        # print(test.index[0])
        # print(test.index[test.shape[0]-1])
        y_prd = fit2.forecast(len(test))
        print(y_prd)

        rs_simple = sqrt(mean_squared_error(test.values, y_prd))
        dict_rmse["simple"] = rs_simple
        insert_into_database("SIMPLE_EXP", rs_simple, "{}")

    except Exception as e:
        print(("error is simple exp without log,{}".format(e)))
        insert_into_database("SIMPLE_EXP", None, e)

    try:
        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = SimpleExpSmoothing(train[col]).fit(smoothing_level=0.6,
                                                      optimized=False)
            y_prd = fit2.forecast(len(test))
            y_prd = np.exp(y_prd)
            rs_simple = sqrt(mean_squared_error(test.values, y_prd))
            dict_rmse["simple_log"] = rs_simple
            insert_into_database("SIMPLE_EXP", rs_simple, "{}")

    except Exception as e:
        insert_into_database("SIMPLE_EXP", None, e)
        print(("simple exponential smoothing log,{}".format(e)))

    # HOLT LINEAR METHOD FOR FORECASTING
    try:
        train, test = train_test_split(df)
        fit2 = Holt(train[col], exponential=True, damped=False).fit()
        y_prd = fit2.predict(test.index.values[0],
                             test.index.values[test.shape[0] - 1])
        rs_hotl = sqrt(mean_squared_error(test[col].values, y_prd))
        dict_rmse["rs_hotl"] = rs_hotl
        insert_into_database("HOLT_LINEAR", rs_hotl, "{}")

        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = Holt(train[col], exponential=True, damped=False).fit()
            y_prd = fit2.predict(test.index.values[0],
                                 test.index.values[test.shape[0] - 1])
            y_prd = np.exp(y_prd)
            rs_hotl_log = sqrt(mean_squared_error(test[col].values, y_prd))
            dict_rmse["rs_hotl_log"] = rs_hotl_log
            insert_into_database("HOLT_LINEAR", rs_hotl_log, "{}")

    except Exception as e:
        insert_into_database("HOLT_LINEAR", None, e)
        print((
            "error in HOLT linear forecasting without damping,{}".format(e)))

    try:
        train, test = train_test_split(df)
        fit2 = Holt(train[col], exponential=True, damped=True).fit()
        y_prd = fit2.predict(test.index.values[0],
                             test.index.values[test.shape[0] - 1])
        rs_holtld = sqrt(mean_squared_error(test[col].values, y_prd))
        dict_rmse["rs_holtld"] = rs_holtld
        insert_into_database("HOLT_LINEAR", rs_holtld, "{}")

        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = Holt(train[col], exponential=True, damped=True).fit()
            y_prd = fit2.predict(test.index.values[0],
                                 test.index.values[test.shape[0] - 1])
            y_prd = np.exp(y_prd)
            rs_holtld = sqrt(mean_squared_error(test[col].values, y_prd))
            dict_rmse["rs_holtld"] = rs_holtld
            insert_into_database("HOLT_LINEAR", rs_holtld, "{}")

    except Exception as e:
        print(("error in HOLT linear smoothing  damped,{}".format(e)))
        insert_into_database("HOLT_LINEAR", None, e)

    # HOLT WINTERS FORECASTING..
    try:
        train, test = train_test_split(df)
        # print("fmmf")
        fit2 = ExponentialSmoothing(test[col],
                                    trend="mul",
                                    seasonal="mul",
                                    seasonal_periods=12).fit()
        y_prd = fit2.predict(test.index.values[0],
                             test.index.values[test.shape[0] - 1])
        rs_hlw = sqrt(mean_squared_error(test[col].values, y_prd))
        print(rs_hlw)
        dict_rmse["rs_hlw"] = rs_hlw
        insert_into_database("HOLT_WINTER", rs_hlw, "{}")

        if bool_log:
            train, test = train_test_split(df_log)
            fit2 = ExponentialSmoothing(train[col],
                                        trend="add",
                                        seasonal="add",
                                        seasonal_periods=12).fit()
            y_prd = fit2.predict(test.index.values[0],
                                 test.index.values[test.shape[0] - 1])
            y_prd = np.exp(y_prd)
            rs_hlw_log = sqrt(mean_squared_error(test[col].values, y_prd))
            print(rs_hlw_log)
            dict_rmse["rs_hlw_log"] = rs_hlw_log
            insert_into_database("HOLT_WINTER", rs_hlw_log, "{}")

    except Exception as e:
        print(("error in HOLT winter forecasting,{}".format(e)))
        insert_into_database("HOLT_WINTER", None, e)
    # ARIMA MODEL....

    # try:
    #     rs = test_stationary(df, col)
    #     if rs:
    #
    #         # Here we decide the order of diffrencing the Time Series
    #         df_diff = df - df.shift()
    #         df_diff.dropna(inplace=True)
    #         rs = test_stationary(df_diff, col)
    #         if rs:
    #             df_diff = df_diff - df_diff.shift()
    #
    #     df_diff.dropna(inplace=True)
    #
    #     train, test = train_test_split(df_diff)
    #
    #     """ The acf and pacf plots are
    #         used to calculate the parameters for AR
    #         AND MA MODELS"""
    #
    #     ar_list = get_params_p(train)
    #     ma_list = get_params_q(train)
    #
    #     for i in ma_list:
    #         for j in ar_list:
    #             try:
    #                 model = ARIMA(train, order=(j, 0, i)).fit()
    #                 y_prd = model.predict(start=test.index.values[0], end=test.index.values[test.shape[0] - 1])
    #
    #                 rs = sqrt(mean_squared_error(test[col].values, y_prd))
    #                 insert_into_database("ARIMA", rs, "{}")
    #             except Exception as e:
    #
    #                 print(("error while training arima,{}".format(e)))
    #                 insert_into_database("ARIMA", None, e)
    # except Exception as e:
    #
    #     print(("error in arima model,{}".format(e)))
    #     insert_into_database("ARIMA", None, e)

    # .. SARIMAX
    try:
        train, test = train_test_split(df)
        p = d = q = list(range(0, 2))
        non_seas = list(itertools.product(p, d, q))
        lis = [1, 3, 6, 12, 24, 56]

        for i in lis:
            sea_so = [(x[0], x[1], x[2], i)
                      for x in list(itertools.product(p, d, q))]

            for j in non_seas:
                for k in sea_so:
                    try:
                        model = SARIMAX(train,
                                        order=j,
                                        seasonal_order=k,
                                        enforce_stationarity=False,
                                        enforce_invertibility=False).fit()
                        y_prd = model.predict(
                            start=test.index.values[0],
                            end=test.index.values[test.shape[0] - 1])

                        rs = sqrt(mean_squared_error(test.values, y_prd))

                        print(rs)
                        insert_into_database("SARIMAX", rs, "{}")
                    except Exception as e:
                        print(("error while training the SARIMAX MODELS,{}".
                               format(e)))
                        insert_into_database("SARIMAX", None, e)

    except Exception as e:
        print(("error in seasonal_arima,{}".format(e)))
        insert_into_database("SARIMAX", None, e)

    # ..AUTO_ARIMA..

    try:
        train, test = train_test_split(df)
        model = auto_arima(train,
                           start_p=1,
                           start_q=1,
                           start_P=1,
                           start_Q=1,
                           max_p=5,
                           max_q=5,
                           max_P=5,
                           max_Q=1,
                           d=1,
                           D=1,
                           seasonal=True)
        model = model.fit(train)
        y_prd = model.predict(n_periods=len(test))
        rs = sqrt(mean_squared_error(test.values, y_prd))
        print("results in auto_Arima", rs)
        dict_rmse["auto_arima"] = rs
        insert_into_database("AUTO_ARIMA", rs, "{}")

    except Exception as e:

        print("error in auto_Arima,{}".format(e))
        insert_into_database("Auto_arima", None, e)
def baseline(showPlot):
    np.set_printoptions(precision=3, suppress=True)

    full_df = pd.read_csv(
        '../data/COVID-19_Combined_Mobility_And_Infection_Data_Moving_Avg_updated_lin_int.csv',
        infer_datetime_format=True,
        parse_dates=True)

    #=========================FIND BEST OFFSET========================================

    by_state = full_df['sub_region_1'].unique()
    bestLinearCorr = 0
    bestLogCorr = 0
    bestLinearOffset = -1
    bestLogOffset = -1
    bestLinearData = 0
    bestLogData = 0
    #min_all_states_lin_dim=100
    #min_all_states_log_dim=100

    correlationScores = []
    correlationLogScores = []

    for offset in range(30):
        #shift each state's data by offset and concatenate, to prevent bleeding into other states' numbers
        full_dataframe = pd.DataFrame()
        min_dim = 100
        for region in by_state:
            temp = full_df.loc[(full_df['sub_region_1'] == region)]
            temp = temp.loc[(temp['date'] > '2020-05-01')]
            #Shift CDC data by offset value
            cdc_dataframe = temp['num_cases'].shift(periods=offset,
                                                    fill_value=0)
            mobility_dataframe = temp.drop(
                columns=['date', 'sub_region_1', 'num_cases'])
            all_states = pd.concat([cdc_dataframe, mobility_dataframe], axis=1)
            all_states = all_states.loc[(all_states['num_cases'] >
                                         0)]  #remove rows with zero cases
            full_dataframe = full_dataframe.append(all_states)
            '''if(all_states.shape[0]<min_dim):
                min_dim=all_states.shape[0]'''

        #Compute linear and logarithmic correlations
        linearCorr = full_dataframe.corr()
        linearCorr = linearCorr.to_numpy()[
            0, 1:]  #Take only correlations between 'cases' and mobility data

        logData = np.log(full_dataframe + 1 -
                         np.min(full_dataframe.to_numpy()))
        logCorr = logData.corr()
        logCorr = logCorr.to_numpy()[
            0, 1:]  #Take only correlations between 'cases' and mobility data

        #print("Offset:", offset, "Min_state_dim:    ", min_dim)
        #print("           Log Correlation:", logCorr)

        #Save best values
        if np.linalg.norm(linearCorr) > np.linalg.norm(bestLinearCorr):
            bestLinearCorr = linearCorr
            bestLinearOffset = offset
            min_all_states_lin_dim = min_dim
            #bestLinearData = full_dataframe

        if np.linalg.norm(logCorr) > np.linalg.norm(bestLogCorr):
            bestLogCorr = logCorr
            bestLogOffset = offset
            min_all_states_log_dim = min_dim
            #bestLogData = logData

        correlationScores.append(np.linalg.norm(linearCorr))
        correlationLogScores.append(np.linalg.norm(logCorr))

    if showPlot:
        plt.plot(correlationScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Linear correlation vs. data offset")
        plt.show()
        plt.plot(correlationLogScores)
        plt.xlabel("Cases offset (days)")
        plt.ylabel("Norm of correlation vector")
        plt.title("Logarithmic correlation vs. data offset")
        plt.show()

    print("Best Full Correlation:", bestLinearCorr)
    print("Best Full Correlation Norm:", np.linalg.norm(bestLinearCorr))
    print("Best Full Offset:", bestLinearOffset)

    print("Best Log Correlation:", bestLogCorr)
    print("Best Log Correlation Norm:", np.linalg.norm(bestLogCorr))
    print("Best Log Offset:", bestLogOffset)

    #num_models=(min(min_all_states_lin_dim, min_all_states_log_dim)-111)//3

    linearMSE_by_state = []
    logMSEAdj_by_state = []
    linearCasesMSE_by_state = []
    logCasesMSE_by_state = []
    logisticMSE_by_state = []
    dataNoise_by_state = []
    arimaMSE_by_state = []
    gaussMSE_by_state = []
    for s in range(len(by_state)):

        #=========================BEGIN MODEL FITTING========================================

        #Get the data for that state and shift it
        bestLinearData = pd.DataFrame()
        bestLogDf = pd.DataFrame()
        temp = full_df.loc[(full_df['sub_region_1'] == by_state[s])]
        temp = temp.loc[(temp['date'] < '2020-11-30')]
        #Shift CDC data by offset value
        cdc_lin_dataframe = temp['num_cases'].shift(periods=bestLinearOffset,
                                                    fill_value=0)
        mobility_lin_dataframe = temp.drop(
            columns=['date', 'sub_region_1', 'num_cases'])
        all_lin_states = pd.concat([cdc_lin_dataframe, mobility_lin_dataframe],
                                   axis=1)
        all_lin_states = all_lin_states.loc[(all_lin_states['num_cases'] >
                                             0)]  #remove rows with zero cases
        bestLinearData = bestLinearData.append(all_lin_states)
        #Shift CDC data by offset value
        cdc_log_dataframe = temp['num_cases'].shift(periods=bestLogOffset,
                                                    fill_value=0)
        mobility_log_dataframe = temp.drop(
            columns=['date', 'sub_region_1', 'num_cases'])
        all_log_states = pd.concat([cdc_log_dataframe, mobility_log_dataframe],
                                   axis=1)
        all_log_states = all_log_states.loc[(all_log_states['num_cases'] >
                                             0)]  #remove rows with zero cases
        bestLogDf = bestLogDf.append(all_log_states)
        bestLogData = np.log(bestLogDf + 1 - np.min(bestLogDf.to_numpy()))

        linearMSE = []
        logMSEAdj = []
        linearCasesMSE = []
        logCasesMSE = []
        logisticMSE = []
        dataNoise = []
        arimaMSE = []
        gaussMSE = []

        #Convert data to numpy
        linearCasesOnly = bestLinearData['num_cases'].to_numpy()
        logCasesOnly = np.log(linearCasesOnly + 1)
        bestLinearData = bestLinearData.to_numpy()
        bestLogData = bestLogData.to_numpy()

        stride = 3  #trains a new model every {stride} days
        maxEpoch = 100

        for t in range(
            (min(bestLinearData.shape[0], bestLogData.shape[0]) - 111) //
                stride):
            #print("Size of training:", range((min(bestLinearData.shape[0], bestLogData.shape[0])-111)//stride))
            print("Training model:", t)
            print("State:", by_state[s])

            #Linear Mobility Data
            linearTrainX = bestLinearData[t * stride:t * stride + 60, 1:]
            linearTrainy = bestLinearData[t * stride:t * stride + 60, :1]
            linearTestX = bestLinearData[t * stride + 60:t * stride + 111, 1:]
            linearTesty = bestLinearData[t * stride + 60:t * stride + 111, :1]

            #Logarithmic Mobility Data
            logTrainX = bestLogData[t * stride:t * stride + 60, 1:]
            logTrainy = bestLogData[t * stride:t * stride + 60, :1]
            logTestX = bestLogData[t * stride + 60:t * stride + 111, 1:]
            logTesty = bestLogData[t * stride + 60:t * stride + 111, :1]

            #Cases-only data
            linearCasesTrainX = linearCasesOnly[t * stride:t * stride + 60]
            logCasesTrainX = logCasesOnly[t * stride:t * stride + 60]
            linearCasesTestX = linearCasesOnly[t * stride + 60:t * stride +
                                               111]
            logCasesTestX = logCasesOnly[t * stride + 60:t * stride + 111]

            timeTrain = np.arange(1, 61).reshape(-1, 1)
            timeTest = np.arange(61, 112).reshape(-1, 1)

            #Uncomment to add time data to mobility dataset
            #linearTrainX = np.hstack((linearTrainX, timeTrain))
            #logTrainX = np.hstack((logTrainX, timeTrain))
            #linearTestX = np.hstack((linearTestX, timeTest))
            #logTestX = np.hstack((logTestX, timeTest))

            #fit linear model
            linear_model = RidgeCV(cv=3).fit(linearTrainX, linearTrainy)

            predict = linear_model.predict(linearTestX)
            linearMSE.append(np.abs(predict - linearTesty) / linearTesty)

            #fit log model
            linear_model = RidgeCV(cv=3).fit(logTrainX, logTrainy)

            predict = linear_model.predict(logTestX)
            predictAdj = np.exp(predict) - 1 + np.min(full_dataframe.to_numpy(
            ))  #convert from log back to raw case number
            logMSEAdj.append(np.abs(predictAdj - linearTesty) / linearTesty)

            #fit linear cases only model
            cases_model = RidgeCV(cv=3).fit(timeTrain, linearCasesTrainX)
            if False:
                visualize_cases(cases_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)

            predict = cases_model.predict(timeTest)
            linearCasesMSE.append(
                np.abs(predict - linearCasesTestX) / linearCasesTestX)

            #fit log cases only model
            cases_model = RidgeCV(cv=3).fit(np.log(timeTrain), logCasesTrainX)
            if False:
                visualize_cases(cases_model, np.log(timeTrain), logCasesTrainX,
                                np.log(timeTest), logCasesTestX)

            predict = cases_model.predict(np.log(timeTest))
            predictAdj = np.exp(
                predict) - 1  #convert from log back to raw case number
            logCasesMSE.append(
                np.abs(predictAdj - linearCasesTestX) / linearCasesTestX)

            #fit logistic model
            logistic_model, cov = optimize.curve_fit(
                logisticDerivative,
                timeTrain.reshape(linearCasesTrainX.shape),
                linearCasesTrainX,
                p0=[4 * np.max(linearCasesTrainX), 60, 1 / 30],
                maxfev=10000,
                bounds=(np.array([1, 0, 0]), np.array([20000, np.Inf,
                                                       np.Inf])))
            if False:
                visualize_logistic(logistic_model, timeTrain,
                                   linearCasesTrainX, timeTest,
                                   linearCasesTestX)

            predictLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            logisticMSE.append(
                np.abs(predictLogistic - linearCasesTestX) / linearCasesTestX)

            predict = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])
            dataNoise.append(
                np.mean(
                    np.abs(predict - linearCasesTrainX) / linearCasesTrainX))

            #fit stacking regressor
            #note: estimators must be (name, estimator) pairs, and a regressor (not a
            #classifier) is used for 'rf' since the target is a continuous case count
            estimators = [('lr', RidgeCV()),
                          ('svr', LinearSVR(random_state=42)),
                          ('rf', RandomForestRegressor(n_estimators=10,
                                                       random_state=42))]
            reg = StackingRegressor(estimators=estimators,
                                    final_estimator=GaussianProcessRegressor(
                                        kernel=DotProduct() + WhiteKernel(),
                                        random_state=0))
            stacking_model = reg.fit(timeTrain, linearCasesTrainX)
            if False:
                visualize_cases(stacking_model, timeTrain, linearCasesTrainX,
                                timeTest, linearCasesTestX)

            predict = stacking_model.predict(timeTest)
            linearCasesMSE.append(
                np.abs(predict - linearCasesTestX) / linearCasesTestX)

            #fit ARIMA
            #Perform grid search to determine ARIMA Order
            '''stepwise_fit = auto_arima(linearCasesTrainX, start_p = 1, start_q = 1, 
                            max_p = 3, max_q = 3, m = 7, 
                            start_P = 0, seasonal = True, 
                            d = None, D = 1, trace = True, 
                            error_action ='ignore',   # we don't want to know if an order does not work 
                            suppress_warnings = True,  # we don't want convergence warnings 
                            stepwise = True)           # set to stepwise 
            stepwise_fit.summary()'''

            model = SARIMAX(linearCasesTrainX,
                            initialization='approximate_diffuse',
                            order=(2, 0, 0),
                            seasonal_order=(2, 1, 0, 7))

            result = model.fit(disp=False)
            if True:
                visualize_ARIMA(result, timeTrain, linearCasesTrainX, timeTest,
                                linearCasesTestX)

            predictArima = result.predict(61, 111, typ='levels')
            arimaMSE.append(
                np.abs(predictArima - linearCasesTestX) / linearCasesTestX)

            #Evaluate other models to use as input to gaussian process
            arima1 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(2, 0, 0),
                             seasonal_order=(2, 1, 0, 7)).fit(disp=False)
            arima2 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(2, 0, 0),
                             seasonal_order=(2, 1, 1, 7)).fit(disp=False)
            arima3 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(1, 1, 0),
                             seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima4 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(0, 1, 1),
                             seasonal_order=(1, 1, 1, 7)).fit(disp=False)
            arima5 = SARIMAX(linearCasesTrainX,
                             initialization='approximate_diffuse',
                             order=(0, 1, 1),
                             seasonal_order=(2, 1, 0, 7)).fit(disp=False)

            predictLog = cases_model.predict(np.log(timeTrain))  #Log model
            predictAdj = np.exp(
                predictLog) - 1  #convert from log back to raw case number
            predictLogistic = logisticDerivative(
                timeTrain.reshape(linearCasesTrainX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])  #logistic model
            predictArima1 = arima1.predict(1, 60, typ='levels')
            predictArima2 = arima2.predict(1, 60, typ='levels')
            predictArima3 = arima3.predict(1, 60, typ='levels')
            predictArima4 = arima4.predict(1, 60, typ='levels')
            predictArima5 = arima5.predict(1, 60, typ='levels')

            testLog = cases_model.predict(np.log(timeTest))  #Log model
            testAdj = np.exp(
                testLog) - 1  #convert from log back to raw case number
            testLogistic = logisticDerivative(
                timeTest.reshape(linearCasesTestX.shape), logistic_model[0],
                logistic_model[1], logistic_model[2])  #logistic model
            testArima1 = arima1.predict(61, 111, typ='levels')
            testArima2 = arima2.predict(61, 111, typ='levels')
            testArima3 = arima3.predict(61, 111, typ='levels')
            testArima4 = arima4.predict(61, 111, typ='levels')
            testArima5 = arima5.predict(61, 111, typ='levels')

            #fit gaussian process meta-learner
            gaussTrain = np.array([
                predictLogistic, predictArima1, predictArima2, predictArima3,
                predictArima4, predictArima5
            ]).T
            gaussTest = np.array([
                testLogistic, testArima1, testArima2, testArima3, testArima4,
                testArima5
            ]).T
            reg = GaussianProcessRegressor(kernel=DotProduct() + WhiteKernel(),
                                           random_state=0)
            stacking_model = reg.fit(gaussTrain, linearCasesTrainX)
            predictTrain = stacking_model.predict(gaussTrain)
            predictTest = stacking_model.predict(gaussTest)
            if False:
                visualize_gauss(
                    np.hstack((predictTrain, predictTest)).T, timeTrain,
                    linearCasesTrainX, timeTest, linearCasesTestX)

            gaussMSE.append(
                np.abs(predictTest - linearCasesTestX) / linearCasesTestX)

        #Append to state totals
        linearMSE_by_state.append(
            np.reshape(np.array(linearMSE).mean(axis=0), (51)))
        logMSEAdj_by_state.append(
            np.reshape(np.array(logMSEAdj).mean(axis=0), (51)))
        linearCasesMSE_by_state.append(
            np.reshape(np.array(linearCasesMSE).mean(axis=0), (51)))
        logCasesMSE_by_state.append(
            np.reshape(np.array(logCasesMSE).mean(axis=0), (51)))
        logisticMSE_by_state.append(
            np.reshape(np.array(logisticMSE).mean(axis=0), (51)))
        dataNoise_by_state.append(np.mean(dataNoise))
        arimaMSE_by_state.append(
            np.reshape(np.array(arimaMSE).mean(axis=0), (51)))
        gaussMSE_by_state.append(
            np.reshape(np.array(gaussMSE).mean(axis=0), (51)))
        print("Average logistic Test error:", np.mean(dataNoise))

    #Plot proof-of-concept graph
    if showPlot:
        plt.plot(np.array(linearMSE_by_state).mean(axis=0),
                 label='Mobility (linear, non-temporal)')
        plt.plot(np.array(logMSEAdj_by_state).mean(axis=0),
                 label='Mobility (logarithmic, non-temporal)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()

        #Plot baseline graph
        #plt.plot(np.array(linearCasesMSE_by_state).mean(axis=0), label='Cases (linear, temporal)') #Don't plot because performance is terrible
        plt.plot(np.array(logCasesMSE_by_state).mean(axis=0),
                 label='Cases (logarithmic temporal)')
        plt.plot(np.array(logisticMSE_by_state).mean(axis=0),
                 label='Cases (logistic temporal)')
        plt.plot(np.array(arimaMSE_by_state).mean(axis=0),
                 label='Cases (ARIMA)')
        plt.plot(np.array(gaussMSE_by_state).mean(axis=0),
                 label='Cases (Gaussian Process meta)')
        plt.xlabel("Days in advance to predict")
        plt.ylabel("Percent deviation from true value")
        plt.legend(loc="upper left")
        plt.show()
    print("Average logistic test error:", np.mean(dataNoise_by_state))
Esempio n. 27
0
# best_model, models = best_sarima_model(train_data=log_transformed_train_data,p=range(3),q=range(3),P=range(3),Q=range(3))
# preds_best = np.exp(best_model.predict(start='2019-01-01', dynamic=True, typ='levels'))
# print(f'MAPE{np.round(mean_abs_pct_error(log_transformed_test_data,preds_best),2)}')

agile_model = SARIMAX(endog=log_transformed_train_data,
                      order=(1, 1, 2),
                      seasonal_order=(1, 1, 2, 52),
                      enforce_invertibility=False).fit()
agile_model.summary()

# just to deactivate PyCharm warnings regarding NumPy types
# noinspection PyTypeChecker
agile_model_pred = np.exp(
    agile_model.predict(start=test_first_date,
                        end=test_last_date,
                        dynamic=True,
                        typ='levels'))

print(f'MAPE {np.round(mean_abs_pct_error(test_data,agile_model_pred),2)}%')
# print(f'MAE:{np.round(mean_absolute_error(test_data,agile_model_pred),2)}')

# noinspection PyTypeChecker
agile_model_forecast = np.exp(agile_model.forecast(steps=2))
print(agile_model_forecast)
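
# NOTE: mean_abs_pct_error() is used above but not defined in this excerpt; a minimal
# sketch of the usual MAPE formula it presumably implements (result in percent):
def mean_abs_pct_error_sketch(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100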


def plot_prediciton(training_data, agile_model, agile_model_pred,
                    original_data):
    model_data = training_data.values[1:].reshape(-1) - agile_model.resid[1:]
    model_data = pd.concat((model_data, agile_model_pred))
    plt.figure(figsize=(16, 6))
Esempio n. 28
0
model = SARIMAX(series,
                order=(1, 1, 1),
                seasonal_order=(1, 1, 1, 52),
                trend='n',
                enforce_stationarity=False,
                enforce_invertibility=False).fit()

print("________________________")
print("MODEL SUMMARY")
print(model.summary().tables[1])

# Nice way to check residuals follow a Gaussian distribution
model.plot_diagnostics(figsize=(15, 12))
plt.show()
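
# A sketch of complementary quantitative residual checks (both methods exist on the
# fitted SARIMAX results object): Jarque-Bera for normality, Ljung-Box for leftover
# autocorrelation.
print(model.test_normality(method='jarquebera'))
print(model.test_serial_correlation(method='ljungbox'))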

train_pred = model.predict()
train_pred_cpy = train_pred.copy()
print(train_pred_cpy)
print(type(train_pred_cpy))
print(type(series))

cdf_index = a_organic[0:train_size].index
#print(cdf_index)
#print(type(cdf_index))

#________________________________________________
#Comparing the fit with the training data
#________________________________________________

#I need to create here a data frame from the series
compare_frame = {
Esempio n. 29
0
# In[162]:

loss_per_epoch = model.history.history['loss']
plt.plot(range(len(loss_per_epoch)), loss_per_epoch)

# In[163]:

first_eval_batch = scaled_train[-30:]

# In[164]:

first_eval_batch = first_eval_batch.reshape((1, n_input, n_features))

# In[165]:

model.predict(first_eval_batch)

# In[166]:

scaled_test[0]

# In[167]:

test_predictions = []

first_eval_batch = scaled_train[-n_input:]
current_batch = first_eval_batch.reshape((1, n_input, n_features))

# In[168]:

np.append(current_batch[:, 1:, :], [[[99]]], axis=1)
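
# A minimal sketch of the rolling one-step forecast loop that the slicing above is
# building toward, assuming `model`, `scaled_train`, `scaled_test`, `n_input` and
# `n_features` are defined as in the earlier cells:
test_predictions = []
current_batch = scaled_train[-n_input:].reshape((1, n_input, n_features))
for _ in range(len(scaled_test)):
    current_pred = model.predict(current_batch)[0]  #one-step-ahead prediction
    test_predictions.append(current_pred)
    #drop the oldest timestep and append the new prediction as the latest one
    current_batch = np.append(current_batch[:, 1:, :], [[current_pred]], axis=1)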
Esempio n. 30
0
#fifth, run SARIMA train_model with the order determined by auto_arima
# ----------
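# A minimal sketch of how the (0, 0, 1)x(2, 0, 0, 7) order used below could have been
# found with pmdarima's auto_arima (the search ranges here are illustrative only):
from pmdarima import auto_arima
stepwise_fit = auto_arima(train['total'], start_p=0, start_q=0,
                          max_p=3, max_q=3, m=7, seasonal=True,
                          trace=True, error_action='ignore',
                          suppress_warnings=True, stepwise=True)
print(stepwise_fit.summary())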
train_model = SARIMAX(train['total'],
                      exog=train[['holiday']],
                      order=(0, 0, 1),
                      seasonal_order=(2, 0, 0, 7),
                      enforce_invertibility=False).fit()
print(train_model.summary())
# enforce_invertibility=True would constrain the MA coefficients to stay below 1; it is disabled here just to avoid a ValueError

#sixth, test predictions vs test set
# ----------
start = len(train)
end = len(train) + len(test) - 1

predictions = train_model.predict(
    start, end, exog=test[['holiday']]).rename('SARIMAX predictions vs test')

test['total'].plot(legend=True)
predictions.plot(legend=True)
plt.show()

#seventh, evaluate the model on rmse error
# ----------
error = rmse(test['total'], predictions)
std = test['total'].std()
error_result = error / std * 100
print('rmse error is the following percentage out of standard dev: ')
print(error_result)

# eight, run forecast into the future with full dataset
# ----------