Example 1
def load_pindex_u(db_interface, index_name):
    t = time.time()
    meta_table = index_name + "_meta"
    meta_inf = db_interface.query_table(meta_table,
                                        columns_queried=['T', 'T0', 'k', 'gamma', 'var_direct_method', 'k_var', 'T_var',
                                                         'soft_thresholding', 'start_time', 'aggregation_method',
                                                         'agg_interval', 'persist_l','col_to_row_ratio', 'L','last_TS_fullSVD','last_TS_inc',
                                                              'last_TS_seen', 'p' ,'time_series_table_name', 'indexed_column','time_column'])
    
    T, T0, k, gamma, direct_var, k_var, T_var, SSVT, start_time, aggregation_method, agg_interval, persist_l, col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex, p = meta_inf[0][:-3]
    L_m = db_interface.query_table(index_name + "_m", ['L'], 'modelno =0')[0][0]
    
    time_series_table_name, value_column, time_column = meta_inf[0][-3:]
    last = get_bound_time(db_interface, time_series_table_name, time_column ,'max')
    value_columns = value_column.split(',')
    # ------------------------------------------------------
    # temp fix
    gamma = float(gamma)
    if not isinstance(start_time, (int, np.integer)):
        start_time = pd.to_datetime(start_time)
    if not isinstance(last, (int, np.integer)):
        last = pd.to_datetime(last)
    agg_interval = float(agg_interval)
    # ------------------------------------------------------
    no_ts = len(value_columns)
    last_index = (index_ts_mapper(start_time, agg_interval, last) + 1)
    if last_index - MUpdateIndex//no_ts <= 5*L_m:
        print(L, last_index, MUpdateIndex)
        print('nothing major to update')
        return False
    # fill in missing values only when the series is fully observed (p >= 1.0)
    fill_in_missing = p >= 1.0
    TSPD = TSPI(interface=db_interface, index_name=index_name, schema=None, T=T, T0=T0, rank=k, gamma=gamma,
                direct_var=direct_var, rank_var=k_var, T_var=T_var, SSVT=SSVT, start_time=start_time,
                aggregation_method=aggregation_method, agg_interval=agg_interval, time_series_table_name=time_series_table_name, 
                time_column = time_column, value_column = value_columns ,persist_L = persist_l,col_to_row_ratio = col_to_row_ratio, fill_in_missing = fill_in_missing, p =p)
    
    model_no = int(max((last_index*no_ts - 1) / (T / 2) - 1, 0))
    last_model_no = int(max((MUpdateIndex - 1) / (T / 2) - 1, 0))
    model_start = last_model_no*T/2
    print(model_no, last_model_no, ReconIndex, model_start, last_index)
    
    new_points_ratio = (last_index*no_ts - ReconIndex)/(ReconIndex - model_start)
    print(new_points_ratio)
    
    if new_points_ratio < gamma and model_no <= last_model_no and (last_index*no_ts)%(T//2) != 0:
        print('marginal update')
        start = (MUpdateIndex)//TSPD.no_ts
        end = (TimeSeriesIndex - 1)//TSPD.no_ts
    else:
        print('big update')
        start = max((TimeSeriesIndex - T)//TSPD.no_ts,0)
        end = (TimeSeriesIndex - 1)//TSPD.no_ts
    # initialize the TSMM sub-model manager for the time series
    TSPD.ts_model = TSMM(TSPD.k, TSPD.T, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                         model_table_name=index_name, SSVT=TSPD.SSVT, L=L, persist_L = TSPD.persist_L, no_ts = TSPD.no_ts, fill_in_missing = fill_in_missing, p =p)
    TSPD.ts_model.ReconIndex, TSPD.ts_model.MUpdateIndex, TSPD.ts_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex

    # load variance models if any
    if TSPD.k_var != 0:
        col_to_row_ratio, L, ReconIndex, MUpdateIndex, TimeSeriesIndex = db_interface.query_table(meta_table,
                                                                                                  columns_queried=[
                                                                                                      'col_to_row_ratio_var',
                                                                                                      'L_var',
                                                                                                      'last_TS_fullSVD_var',
                                                                                                      'last_TS_inc_var',
                                                                                                      'last_TS_seen_var'])[0]

        TSPD.var_model = TSMM(TSPD.k_var, TSPD.T_var, TSPD.gamma, TSPD.T0, col_to_row_ratio=col_to_row_ratio,
                              model_table_name=index_name + "_variance", SSVT=TSPD.SSVT, L=L, persist_L =TSPD.persist_L, no_ts = TSPD.no_ts, fill_in_missing = fill_in_missing, p =p)
        TSPD.var_model.ReconIndex, TSPD.var_model.MUpdateIndex, TSPD.var_model.TimeSeriesIndex = ReconIndex, MUpdateIndex, TimeSeriesIndex

    print('loading meta_model time', time.time()-t)
    # load sub-model information
    TSPD._load_models_from_db(TSPD.ts_model)
    print('loading sub models time', time.time()-t)
    if end >= start:
        start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
        end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end)
        TSPD.ts_model.TimeSeries = TSPD._get_range(start_point, end_point)
        print('loading time series time', time.time()-t)
        print(start, end, start_point,end_point)
    # query variance models table
    if TSPD.k_var != 0:
        TSPD._load_models_from_db(TSPD.var_model)

        # load the last T points of the variance time series (squared observations if not direct_var)
        if TSPD.direct_var:
            end_var = (TSPD.var_model.TimeSeriesIndex - 1)//TSPD.no_ts
            start = max(start -1,0)
            TT = min(end_var-start+1, TSPD.var_model.T//TSPD.no_ts)
            if (end_var-start+1) - TT >0:
                start +=  (end_var-start+1) - TT 
            mean = np.zeros([TT,TSPD.no_ts])
            print(mean.shape, start, end_var, TSPD.var_model.T )
            start_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, start)
            end_point = index_ts_inv_mapper(TSPD.start_time, TSPD.agg_interval, end_var)
            print(start, end_var, start_point,end_point,TT)
            if end_var != start:
                for ts_n, value_column in enumerate(TSPD.value_column):
                    mean[:,ts_n] = get_prediction_range(index_name, TSPD.time_series_table_name, value_column,db_interface, start_point, end_point, uq=False)
                TSPD.var_model.TimeSeries = TSPD.ts_model.TimeSeries[:len(mean),:] - mean
        else:
            TSPD.var_model.TimeSeries = (TSPD.ts_model.TimeSeries) ** 2
    print('loading time series variance time', time.time()-t)
    return TSPD
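
A minimal usage sketch of the loader above, assuming `db_interface` is a connected db_class object and 'pindex1' is the name of an existing index whose '_meta' and '_m' tables are populated (both names are hypothetical). The function returns False when there is nothing substantial to update, otherwise a restored TSPI object:

# hypothetical: `db_interface` and the index name 'pindex1' must already exist
tspd = load_pindex_u(db_interface, 'pindex1')
if tspd is False:
    print('index is up to date; nothing major to update')
else:
    # tspd is a TSPI object with its TSMM mean (and variance) models restored
    print(tspd.ts_model.MUpdateIndex, tspd.ts_model.TimeSeriesIndex)
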
Example 2
def _get_forecast_range(index_name, table_name, value_column, index_col, interface, t1, t2, MUpdateIndex, L, k, T, last_model, interval, start_ts, last_TS_seen, no_ts, value_index, direct_var=False, variance=False, averaging='average', projected=False, p=1.0):
    """
    Return the forecasted values of value_column in the time range t1 to t2 using index_name.
    ----------
    Parameters
    ----------
    index_name: string 
        name of the PINDEX used to query the prediction

    table_name: string
        name of the time series table in the database

    value_column: string
        name of the column that contains the time series values

    index_col: string  
        name of column that contains time series index/timestamp

    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class
    
    t1: (int or timestamp)
        index or timestamp indicating the start of the queried range 
    
    t2: (int or timestamp)
        index or timestamp indicating the end of the queried range  
    
    L: (int)
        Model parameter determining the number of rows in each matrix in a sub model. 
    
    k: (int )
        Model parameter determining the number of retained singular values in each matrix in a sub model. 
    
    T: (int )
        Model parameter determining the number of datapoints in each matrix in a sub model.
    
    last_model: (int )
        The index of the last sub model

    averaging: string, optional (default 'average')
        Coefficients used when forecasting; 'average' means use the average of all sub-models' coefficients.
    ----------
    Returns
    ----------
    prediction: array, shape [t2 - t1 + 1]
        forecasted values of the time series in the range [t1, t2] using index_name
    """
    ############### EDITS ##################
    #1- Replace last_ts with the last time stamp seen 
    ########################################
    # get coefficients
    coeffs = np.array(interface.get_coeff(index_name + '_c_view', averaging))
    coeffs_ts = coeffs[-no_ts:]
    coeffs = coeffs[:-no_ts]
    no_coeff = len(coeffs)
 
    if not direct_var or not variance:
            if projected:
                if last_model != 0:
                    q_model = last_model- 1
                else:
                    q_model = last_model
                U = interface.get_U_row(index_name + '_u', [0, 2 * L], [q_model, q_model], k,
                                             return_modelno=False,return_weights_decom=True)[:-1,k:]
                no_coeff = U.shape[0]
                projection_matrix = np.dot(U,U.T)
            
            agg_interval = float(interval)
            if not isinstance(start_ts, (int, np.integer)):
                start_ts = pd.Timestamp(start_ts)
            # if the queried range extends beyond the data we have so far, cap it at the last point seen
            last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
            if not isinstance(last_TS_seen, (int, np.integer)):
                last_TS_seen = index_ts_mapper(start_ts, agg_interval, last_TS_seen)
            last_TS_seen+=1
            print(t1,t2, last_TS_seen)
            
            t1_ = min(t1, last_TS_seen)
            t2_ = min(t2, last_TS_seen)
            end = index_ts_inv_mapper(start_ts, agg_interval, t1_ - 1 )
            start = index_ts_inv_mapper(start_ts, agg_interval, t1_ - no_coeff  )
            print(start, end)
            obs = interface.get_time_series(table_name, start, end, start_ts = start_ts,  value_column=value_column, index_column= index_col, Desc=False, interval = agg_interval, aggregation_method =  averaging)
            output = np.zeros([t2 - t1_ + 1 ])
            obs = np.array(obs)[-no_coeff:,0]
            print(len(obs[:]), no_coeff)
            # fill missing observations: zero-fill and rescale by p when p < 1, otherwise forward/backward fill
            if p <1:
                obs = np.array(pd.DataFrame(obs).fillna(value = 0).values[:,0])
                obs /= p
            else:
                obs = np.array(pd.DataFrame(obs).fillna(method = 'ffill').values[:,0])
                obs = np.array(pd.DataFrame(obs).fillna(method = 'bfill').values[:,0])
            if variance:
                obs = obs **2
            observations = np.zeros([t2 - t1_ + 1 + no_coeff])
            observations[:no_coeff] = obs
            
            for i in range(0, t2 + 1 - t1_): 
                    if i  < len(obs):
                        if projected:
                            output[i] = np.dot(coeffs.T, np.dot(projection_matrix, observations[i:i + no_coeff]))+coeffs_ts[value_index]
                        else:
                            output[i] = np.dot(coeffs.T,  observations[i:i + no_coeff])+coeffs_ts[value_index]
                    else:
                        output[i] = np.dot(coeffs.T,  observations[i:i + no_coeff])+coeffs_ts[value_index]
                    if i+no_coeff >= len(obs):
                        observations[i+no_coeff] = output[i]

            return output[-(t2 - t1 + 1):]
            
    # the forecast should always start at the last point
    t1_ = MUpdateIndex//no_ts 
    output = np.zeros([t2 - t1_ + 1 + no_coeff])
    output[:no_coeff] = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1_ - no_coeff, t1_ - 1, L,k,T,last_model,value_index, no_ts)
    for i in range(0, t2 + 1 - t1_):
        output[i + no_coeff] = np.dot(coeffs.T, output[i:i + no_coeff])+coeffs_ts[value_index]
    return output[-(t2 - t1 + 1):]
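
The forecasting path above is an autoregressive roll-forward: each new point is the dot product of the learned coefficients with the previous no_coeff values (observed or imputed where available, previously forecasted otherwise), plus a per-series bias term (coeffs_ts[value_index]). Below is a self-contained sketch of that recursion, with made-up coefficients and observations standing in for what the index's coefficient view and time series table would return:

import numpy as np

coeffs = np.array([0.5, 0.3, 0.1])   # made-up AR coefficients (oldest lag first)
bias = 0.2                           # made-up per-series bias (coeffs_ts[value_index])
obs = np.array([1.0, 1.2, 1.1])      # last no_coeff observed (or imputed) values
no_coeff = len(coeffs)
horizon = 4                          # number of future points to forecast

# buffer: the last no_coeff known values followed by the forecasts
buf = np.zeros(horizon + no_coeff)
buf[:no_coeff] = obs
for i in range(horizon):
    buf[i + no_coeff] = np.dot(coeffs, buf[i:i + no_coeff]) + bias

forecast = buf[no_coeff:]
print(forecast)
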
Example 3
def get_prediction_range(index_name, table_name, value_column, interface, t1, t2, uq=True, uq_method='Gaussian', c=95., projected=False):

    """
    Return an array of N (N = t2 - t1 + 1) predicted values, along with the confidence interval, for value_column
    in the time range t1 to t2 using index_name, by calling either _get_forecast_range or _get_imputation_range.
    ----------
    Parameters
    ----------
    index_name: string 
        name of the PINDEX used to query the prediction

    table_name: string 
        name of the time series table in the database

    value_column: string
        name of the column that contains the time series values

    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class
    
    t1: (int or timestamp)
        index or timestamp indicating the start of the queried range 
    
    t2: (int or timestamp)
        index or timestamp indicating the end of the queried range 
    
    uq: boolean optional (default=True)
        if True, return the upper and lower bound of the c% confidence interval

    uq_method: string optional (default='Gaussian') options: {'Gaussian', 'Chebyshev'}
        Uncertainty quantification method used to estimate the confidence interval

    c: float optional (default 95.)    
        confidence level for uncertainty quantification, 0<c<100
    ----------
    Returns
    ----------
    prediction array, shape [t2 - t1 + 1]
        Values of the predicted points of the time series in the time interval t1 to t2

    deviation array, shape [t2 - t1 + 1]
        The deviation from the mean to get the desired confidence level
    """
    # query pindex parameters


    T,T_var, L, k,k_var, L_var, last_model, MUpdateIndex,var_direct, interval, start_ts, last_TS_seen, last_TS_seen_var, index_col, value_columns, MUpdateIndex_var, p = interface.query_table( index_name+'_meta',['T','T_var', 'L', 'k','k_var','L_var', 'no_submodels', 'last_TS_inc', 'var_direct_method', 'agg_interval','start_time', "last_TS_seen", "last_TS_seen_var", "time_column","indexed_column",'last_TS_inc_var','p'])[0]
    last_model -= 1
    value_columns = value_columns.split(',')
    no_ts = len(value_columns)

    try: value_index = value_columns.index(value_column)
    except ValueError: raise Exception('The value column %s selected is not indexed by the chosen pindex' % value_column)
    
    if not isinstance(t1, (int, np.integer)):
        t1 = pd.to_datetime(t1)
        t2 = pd.to_datetime(t2)
        start_ts = pd.to_datetime(start_ts)
    
    
    interval = float(interval)
    t1 = index_ts_mapper(start_ts, interval, t1)
    t2 = index_ts_mapper(start_ts, interval, t2)
    
    if MUpdateIndex == 0:
        last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
        obs = interface.get_time_series(table_name, start_ts, last_TS_seen, start_ts = start_ts,  value_column=value_column, index_column= index_col, Desc=False, interval = interval, aggregation_method = 'average')
        if uq: return np.mean(obs)*np.ones(t2-t1+1), np.zeros(t2-t1+1)
        else: return np.mean(obs)*np.ones(t2-t1+1)


    # check uq variables
    if uq:

        if c < 0 or c >= 100:
            raise Exception('confidence level c must satisfy 0 <= c < 100')

        if uq_method == 'Chebyshev':
            alpha = 1./(np.sqrt(1-c/100))
        elif uq_method == 'Gaussian':
            alpha = norm.ppf(1/2 + c/200)
        else:
            raise Exception('uq_method option is not recognized,  available options are: "Gaussian" or "Chebyshev"')
            
    # if all points are in the future, use _get_forecast_range 
    if t1 > (MUpdateIndex - 1)//no_ts:
        print('forecasting')
        if not uq: return _get_forecast_range(index_name,table_name, value_column, index_col, interface, t1,t2, MUpdateIndex,L,k,T,last_model,interval, start_ts, last_TS_seen,no_ts,value_index, projected = projected, p = p)
        
        else:
            prediction = _get_forecast_range(index_name,table_name, value_column, index_col, interface, t1,t2, MUpdateIndex,L,k,T,last_model,interval, start_ts, last_TS_seen,no_ts,value_index, projected = projected, p = p)
            var = _get_forecast_range(index_name+'_variance',table_name, value_column, index_col, interface, t1,t2, MUpdateIndex_var, L,k_var,T_var,last_model,interval, start_ts, last_TS_seen_var, no_ts,value_index,variance = True, direct_var =var_direct,  projected = projected,p = p)
            # if the second model is used for the second moment, subtract the squared mean to estimate the variance
            if not var_direct:
                var = var - (prediction)**2
            var *= (var>0) 
            
            return prediction, alpha*np.sqrt(var)
    
    # if all points are in the past, use get_imputation_range
    elif t2 <=  (MUpdateIndex - 1)//no_ts:    
        if not uq: return _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1,t2,L,k,T,last_model, value_index, no_ts,p = p)
        else:
            prediction = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1,t2,L,k,T,last_model, value_index, no_ts,p = p)
            if (MUpdateIndex_var-1)//no_ts >= t2:
                var = _get_imputation_range(index_name+'_variance',table_name, value_column, index_col, interface, t1,t2, L_var,k_var,T_var,last_model, value_index, no_ts,p = p)
            else:
                imputations_var = _get_imputation_range(index_name+'_variance', table_name, value_column, index_col, interface, t1,(MUpdateIndex_var-1)//no_ts,L_var,k_var,T_var,last_model, value_index, no_ts,p = p)
                forecast_var = _get_forecast_range(index_name+'_variance',table_name, value_column, index_col, interface,MUpdateIndex_var//no_ts ,t2, MUpdateIndex_var,L_var,k_var,T_var,last_model,interval, start_ts,last_TS_seen, no_ts,value_index,variance = True, direct_var =var_direct,projected = projected,p = p)
                var = np.array(list(imputations_var)+list(forecast_var))
            # if the second model is used for the second moment, subtract the squared mean to estimate the variance
            if not var_direct:
                var = var - (prediction)**2
            var *= (var>0) 
            return prediction, alpha*np.sqrt(var)
    
    # if points are in both the future and in the past, use both        
    else:
        imputations = _get_imputation_range(index_name, table_name, value_column, index_col, interface, t1,(MUpdateIndex-1)//no_ts,L,k,T,last_model,value_index, no_ts,p = p)
        forecast = _get_forecast_range(index_name,table_name, value_column, index_col, interface,(MUpdateIndex)//no_ts ,t2, MUpdateIndex,L,k,T,last_model,interval, start_ts,last_TS_seen, no_ts,value_index,projected = projected,p = p)
        if not uq: return list(imputations)+list(forecast)
        else:
            imputations_var = _get_imputation_range(index_name+'_variance', table_name, value_column, index_col, interface, t1,(MUpdateIndex_var-1)//no_ts,L_var,k_var,T_var,last_model, value_index, no_ts,p = p)
            forecast_var = _get_forecast_range(index_name+'_variance',table_name, value_column, index_col, interface,MUpdateIndex_var//no_ts ,t2, MUpdateIndex_var,L_var,k_var,T_var,last_model,interval, start_ts,last_TS_seen, no_ts,value_index,variance = True, direct_var =var_direct,projected = projected,p = p)
            if not var_direct:
                forecast_var = forecast_var - (forecast)**2
                imputations_var = imputations_var - (imputations)**2
            imputations_var *= (imputations_var>0)
            forecast_var *= (forecast_var>0)
            return np.array(list(imputations)+list(forecast)), np.array(list(alpha*np.sqrt(imputations_var)) + list(alpha*np.sqrt(forecast_var)))
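
A minimal usage sketch, assuming a pindex named 'pindex1' built over a table 'mixturets' with an indexed value column 'ts' and a connected db_class object `interface` (all names hypothetical):

import numpy as np

pred, dev = get_prediction_range('pindex1', 'mixturets', 'ts', interface,
                                 t1=100, t2=110, uq=True,
                                 uq_method='Gaussian', c=95.)
upper = np.array(pred) + dev   # upper bound of the 95% confidence interval
lower = np.array(pred) - dev   # lower bound
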
Example 4
def get_prediction(index_name, table_name, value_column, interface, t, uq=True, uq_method='Gaussian', c=95, projected=False):
    """
    Return the predicted value along with the confidence interval for value_column at time t using index_name,
    by calling either _get_forecast_range or _get_imputation.
    ----------
    Parameters
    ----------
    index_name: string 
        name of the PINDEX used to query the prediction

    table_name: string
        name of the time series table in the database

    value_column: string
        name of the column that contains the time series values

    interface: db_class object
        object used to communicate with the DB. see ../database/db_class for the abstract class
    
    t: (int or timestamp)
        index or timestamp indicating the queried time. 
    
    uq: boolean optional (default=True)
        if True, return the upper and lower bound of the c% confidence interval

    uq_method: string optional (default='Gaussian') options: {'Gaussian', 'Chebyshev'}
        Uncertainty quantification method used to estimate the confidence interval

    c: float optional (default 95.)    
        confidence level for uncertainty quantification, 0<c<100
    ----------
    Returns
    ----------
    prediction float
        Value of the time series at time t
    
    deviation float
        The deviation from the mean to get the desired confidence level 
    
    """
    # query pindex parameters
    
    T,T_var, L, k,k_var, L_var, last_model, MUpdateIndex,var_direct, interval, start_ts, last_TS_seen, last_TS_seen_var, index_col, value_columns, MUpdateIndex_var, p = interface.query_table( index_name+'_meta',['T','T_var', 'L', 'k','k_var','L_var', 'no_submodels', 'last_TS_inc', 'var_direct_method', 'agg_interval','start_time', "last_TS_seen", "last_TS_seen_var", "time_column","indexed_column",'last_TS_inc_var','p'])[0]
    ############ Fix queried values ####################
    last_model -= 1
    value_columns = value_columns.split(',')
    no_ts = len(value_columns)
    
    if not isinstance(t, (int, np.integer)):
        t = pd.to_datetime(t)
        start_ts = pd.to_datetime(start_ts)
    interval = float(interval)
    ###################################################
    # Check 1: value column is indexed

    try: value_index = value_columns.index(value_column)
    except: raise Exception('The value column selected is not indexed by the chosen pindex')
    
    # if the model is not fit, return the average
    if MUpdateIndex == 0:
        last_TS_seen = get_bound_time(interface, table_name, index_col, 'max')
        obs = interface.get_time_series(table_name, start_ts, last_TS_seen, start_ts = start_ts,  value_column=value_column, index_column= index_col, Desc=False, interval = interval, aggregation_method = 'average')
        if uq: return np.mean(obs), 0
        else: return np.mean(obs)

    t = index_ts_mapper(start_ts, interval, t)
    if uq:
        
        if uq_method == 'Chebyshev':
            alpha = 1./(np.sqrt(1-c/100))
        
        elif uq_method == 'Gaussian':
            alpha = norm.ppf(1/2 + c/200)
        
        else:
            raise Exception('uq_method option is not recognized,  available options are: "Gaussian" or "Chebyshev"')

    if t > (MUpdateIndex - 1)//no_ts:
        if not uq: return _get_forecast_range(index_name,table_name, value_column, index_col, interface,t, t, MUpdateIndex,L,k,T,last_model, interval, start_ts, last_TS_seen,no_ts,value_index, projected = projected,p = p)[-1]
        else:
            prediction = _get_forecast_range(index_name,table_name, value_column, index_col, interface,t, t, MUpdateIndex,L,k,T,last_model, interval, start_ts,last_TS_seen, no_ts,value_index, projected = projected,p = p)[-1]
            var = _get_forecast_range(index_name+'_variance',table_name, value_column, index_col, interface,t, t, MUpdateIndex_var,L_var,k_var,T_var,last_model,interval, start_ts,last_TS_seen_var,no_ts,value_index,  projected = projected, variance = True, direct_var =var_direct,p = p)[-1]
            
            if not var_direct:
                var = var - (prediction)**2
            var *= (var>0)
            return prediction, alpha*np.sqrt(var)

    else:
        if not uq: return _get_imputation(index_name, table_name, value_column, index_col, interface, t,L,k,T,last_model,no_ts,value_index,p = p)
        else:
            prediction = _get_imputation(index_name, table_name, value_column, index_col, interface, t,L,k,T,last_model, no_ts,value_index,p = p)
            if t > (MUpdateIndex_var - 1)//no_ts: var =  _get_forecast_range(index_name+'_variance',table_name, value_column, index_col, interface,t, t, MUpdateIndex_var,L_var,k_var,T_var,last_model,interval, start_ts,last_TS_seen_var,no_ts,value_index,  projected = projected, variance = True, direct_var =var_direct, p = p)[-1]
            else: var = _get_imputation(index_name+'_variance',table_name, value_column, index_col, interface, t, L_var,k_var,T_var,last_model, no_ts,value_index,p = p)
            
            if not var_direct:
                var = var - (prediction)**2
            var *= (var>0)
            return prediction, alpha*np.sqrt(var)
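
A single-point sketch in the same spirit, with the same hypothetical index, table, and column names as above. With uq_method='Gaussian', the returned deviation is alpha * sqrt(var), where alpha = norm.ppf(1/2 + c/200), roughly 1.96 for c = 95:

from scipy.stats import norm

value, dev = get_prediction('pindex1', 'mixturets', 'ts', interface,
                            t=105, uq=True, uq_method='Gaussian', c=95)
print(value - dev, value + dev)   # approximate 95% confidence interval

alpha = norm.ppf(1/2 + 95/200)    # the Gaussian multiplier used above, ~1.96
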