Example #1
import numpy as np
import pandas as pd
from dateutil.relativedelta import MO, TH


def make_time_features(ts, index=None, epoch=None, epoch_span=None):
    """Project datetimes into vector space for use in machine learning models

    Outputs:

    - projection onto the unit circle of
      - second of day
      - day of week
      - day of year
    - seconds since `epoch`, normalized by `epoch_span`
    - binary workday indicator (i.e., Monday-Friday except major US holidays)

    :param ts: timestamp(s) to process
    :type ts: datetime.datetime or iterable thereof
    :param index: index of ts (e.g., if from a larger dataframe)
    :param epoch: start of time reckoning
    :param epoch_span: length of time reckoning
    :rtype: pd.DataFrame
    :returns: various projections of datetimes into vector space
    """
    # input validation
    try:
        if len(ts) == 1:
            _singleton = True
        elif len(ts) > 1:
            _singleton = False
        elif len(ts) < 1:
            raise ValueError("must pass non-empty iterable of timestamps")
    except TypeError:
        return make_time_features([ts], index=index, epoch=epoch, epoch_span=epoch_span)

    if not isinstance(ts, pd.DatetimeIndex):
        ts = pd.Series(0, index=ts).index
    if not isinstance(ts, pd.DatetimeIndex):
        raise ValueError("must pass non-empty iterable of timestamps")

    if index is None:
        index = pd.RangeIndex(len(ts))
    if epoch is None:
        epoch = min(ts)
    if epoch_span is None:
        epoch_span = float((max(ts) - epoch).total_seconds())

    time_features = {}
    start = min(ts)
    end = max(ts)

    # Major US holidays
    NewYearsDay = pd.tseries.holiday.Holiday("New Years Day", month=1, day=1)
    MemorialDay = pd.tseries.holiday.Holiday("Memorial Day", month=5, day=31, offset=pd.DateOffset(weekday=MO(-1)))  # last Monday of May
    IndependenceDay = pd.tseries.holiday.Holiday("Independence Day", month=7, day=4)
    LaborDay = pd.tseries.holiday.Holiday("Labor Day", month=9, day=1, offset=pd.DateOffset(weekday=MO(1)))
    ThanksgivingDay = pd.tseries.holiday.Holiday(
        "Thanksgiving Day", month=11, day=1, offset=pd.DateOffset(weekday=TH(4))
    )
    ChristmasDay = pd.tseries.holiday.Holiday("Christmas Day", month=12, day=25)
    holidays = (
        NewYearsDay.dates(start.date(), end.date()).tolist()
        + MemorialDay.dates(start.date(), end.date()).tolist()
        + IndependenceDay.dates(start.date(), end.date()).tolist()
        + LaborDay.dates(start.date(), end.date()).tolist()
        + ThanksgivingDay.dates(start.date(), end.date()).tolist()
        + ChristmasDay.dates(start.date(), end.date()).tolist()
    )
    holidays = set([h.date() for h in holidays])

    # projections onto unit circle
    time_features["day_cos"] = np.cos((ts.hour * 3600 + ts.minute * 60 + ts.second) * 2 * np.pi / 86400.0)
    time_features["day_sin"] = np.sin((ts.hour * 3600 + ts.minute * 60 + ts.second) * 2 * np.pi / 86400.0)
    time_features["week_cos"] = np.cos(ts.dayofweek * 2 * np.pi / 7.0)
    time_features["week_sin"] = np.sin(ts.dayofweek * 2 * np.pi / 7.0)
    time_features["year_cos"] = np.cos(ts.dayofyear * 2 * np.pi / 365.0)
    time_features["year_sin"] = np.sin(ts.dayofyear * 2 * np.pi / 365.0)
    # linear march through time
    time_features["epoch"] = (ts - epoch).total_seconds() / epoch_span
    # workday indicator
    time_features["workday"] = [int(weekday < 5 and date not in holidays) for weekday, date in zip(ts.weekday, ts.date)]

    if _singleton:
        return {k: v[0] for k, v in time_features.items()}
    else:
        return pd.DataFrame(time_features, index=index)
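
A brief usage sketch (timestamps hypothetical; assumes the imports above):

stamps = pd.date_range("2021-07-01 00:00", periods=3, freq="H")
print(make_time_features(stamps)[["day_sin", "day_cos", "workday"]])

# A single datetime comes back as a plain dict; epoch and epoch_span are
# supplied explicitly because a singleton spans zero seconds on its own.
print(make_time_features(pd.Timestamp("2021-07-04 12:00"),
                         epoch=pd.Timestamp("2021-01-01"),
                         epoch_span=365 * 86400.0))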
Example #2
                    '/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/otras_variables/mejores_var_'
                    + pp + '_hum_2m.csv')
            else:
                recoleccion_minim_1 = pd.read_csv(
                    '/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/otras_variables/mejores_var_'
                    + pp + varia_1)

        base_h = []
        estacion_sintmp = []
        estacion_sintmp_col = []
        dist_menor_3 = []
        dist_menor_10 = []
        fecha_min_rec = []
        estaciones_aut = pd.DataFrame({'cod': [], 'inicio': [], 'fin': []})

        step = pd.DateOffset(hours=1)

        for j in recoleccion_minim_1.cod_1:  #[21206990.0]:#
            print(j)

            min_2 = []
            max_2 = []

            os.chdir(
                '/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/datos_ideam/validados_col_col/'
            )
            valores = os.listdir()
            if 'v_' + str(j)[0:-2] + varia_1 not in valores:
                estacion_sintmp.append('v_' + str(j)[0:-2] + varia_1)
                continue
            base_validada = pd.read_csv('v_' + str(j)[0:-2] + varia_1)
Example #3
    def evaluate_prediction(self, start_date=None, end_date=None, nshares = None, months = 6):
        
        # Default start date is one year before end of data
        # Default end date is end date of data
        if start_date is None:
            start_date = self.max_date - pd.DateOffset(months = months)
            if start_date < self.min_date:
                start_date = self.min_date + 0.5*(self.max_date-self.min_date)
        if end_date is None:
            end_date = self.max_date
            
        start_date, end_date = self.handle_dates(start_date, end_date)
        
        # Training data starts self.training_years years before start date and goes up to start date
        train = self.stock[(self.stock['Date'] < start_date.date()) & 
                           (self.stock['Date'] > (start_date - pd.DateOffset(years=self.training_years)).date())]
        
        # Testing data is specified in the range
        test = self.stock[(self.stock['Date'] >= start_date.date()) & (self.stock['Date'] <= end_date.date())]
        
        # Create and train the model
        model = self.create_model()
        model.fit(train)
        
        # Make a future dataframe and predictions
        future = model.make_future_dataframe(periods = 365, freq='D')
        future = model.predict(future)
        
        # Merge predictions with the known values
        test = pd.merge(test, future, on = 'ds', how = 'inner')

        train = pd.merge(train, future, on = 'ds', how = 'inner')
        
        # Calculate the differences between consecutive measurements
        test['pred_diff'] = test['yhat'].diff()
        test['real_diff'] = test['y'].diff()
        
        # Correct is when we predicted the correct direction
        test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1
        
        # Accuracy when we predict increase and decrease
        increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
        decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])

        # Calculate mean absolute error
        test_errors = abs(test['y'] - test['yhat'])
        test_mean_error = np.mean(test_errors)

        train_errors = abs(train['y'] - train['yhat'])
        train_mean_error = np.mean(train_errors)

        # Calculate percentage of time actual value within prediction range
        test['in_range'] = False

        for i in test.index:
            if (test.loc[i, 'y'] < test.loc[i, 'yhat_upper']) & (test.loc[i, 'y'] > test.loc[i, 'yhat_lower']):
                test.loc[i, 'in_range'] = True

        in_range_accuracy = 100 * np.mean(test['in_range'])

        if not nshares:

            # Date range of predictions
            print('\nPrediction Range: {} to {}.'.format(start_date.date(),
                end_date.date()))

            # Final prediction vs actual value
            print('\nPredicted price on {} = ${:.2f}.'.format(max(future['ds']).date(), future.loc[future.index[-1], 'yhat']))
            print('Actual price on    {} = ${:.2f}.\n'.format(max(test['ds']).date(), test.loc[test.index[-1], 'y']))

            print('Average Absolute Error on Training Data = ${:.2f}.'.format(train_mean_error))
            print('Average Absolute Error on Testing  Data = ${:.2f}.\n'.format(test_mean_error))

            # Direction accuracy
            print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy))
            print('When the model predicted a  decrease, the price decreased  {:.2f}% of the time.\n'.format(decrease_accuracy))

            print('The actual value was within the {:d}% confidence interval {:.2f}% of the time.'.format(int(100 * model.interval_width), in_range_accuracy))


             # Reset the plot
            self.reset_plot()
            
            # Set up the plot
            fig, ax = plt.subplots(1, 1)

            # Plot the actual values
            ax.plot(train['ds'], train['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations')
            ax.plot(test['ds'], test['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations')
            
            # Plot the predicted values
            ax.plot(future['ds'], future['yhat'], 'navy', linewidth = 2.4, label = 'Predicted');

            # Plot the uncertainty interval as ribbon
            ax.fill_between(future['ds'].dt.to_pydatetime(), future['yhat_upper'], future['yhat_lower'], alpha = 0.6, 
                           facecolor = 'gold', edgecolor = 'k', linewidth = 1.4, label = 'Confidence Interval')

            # Put a vertical line at the start of predictions
            plt.vlines(x=min(test['ds']).date(), ymin=min(future['yhat_lower']), ymax=max(future['yhat_upper']), colors = 'r',
                       linestyles='dashed', label = 'Prediction Start')

            # Plot formatting
            plt.legend(loc = 2, prop={'size': 8}); plt.xlabel('Date'); plt.ylabel('Price $');
            plt.grid(linewidth=0.6, alpha = 0.6)
                       
            plt.title('{} Model Evaluation from {} to {}.'.format(self.symbol,
                start_date.date(), end_date.date()));
            plt.show();

        
        # If a number of shares is specified, play the game
        elif nshares:
            
            # Only playing the stocks when we predict the stock will increase
            test_pred_increase = test[test['pred_diff'] > 0]
            
            test_pred_increase.reset_index(inplace=True)
            prediction_profit = []
            
            # Iterate through all the predictions and calculate profit from playing
            for i, correct in enumerate(test_pred_increase['correct']):
                
                # If we predicted up and the price goes up, we gain the difference
                if correct == 1:
                    prediction_profit.append(nshares * test_pred_increase.loc[i, 'real_diff'])
                # If we predicted up and the price goes down, we lose the difference
                else:
                    prediction_profit.append(nshares * test_pred_increase.loc[i, 'real_diff'])
            
            test_pred_increase['pred_profit'] = prediction_profit
            
            # Put the profit into the test dataframe
            test = pd.merge(test, test_pred_increase[['ds', 'pred_profit']], on = 'ds', how = 'left')
            test.loc[0, 'pred_profit'] = 0
        
            # Profit for either method at all dates
            test['pred_profit'] = test['pred_profit'].cumsum().ffill()
            test['hold_profit'] = nshares * (test['y'] - float(test.loc[0, 'y']))
            
            # Display information
            print('You played the stock market in {} from {} to {} with {} shares.\n'.format(
                self.symbol, start_date.date(), end_date.date(), nshares))
            
            print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy))
            print('When the model predicted a  decrease, the price decreased  {:.2f}% of the time.\n'.format(decrease_accuracy))

            # Display some friendly information about the perils of playing the stock market
            print('The total profit using the Prophet model = ${:.2f}.'.format(np.sum(prediction_profit)))
            print('The Buy and Hold strategy profit =         ${:.2f}.'.format(float(test.loc[test.index[-1], 'hold_profit'])))
            print('\nThanks for playing the stock market!\n')
            
           
            
            # Plot the predicted and actual profits over time
            self.reset_plot()
            
            # Final profit and final smart used for locating text
            final_profit = test.loc[test.index[-1], 'pred_profit']
            final_smart = test.loc[test.index[-1], 'hold_profit']

            # text location
            last_date = test.loc[test.index[-1], 'ds']
            text_location = (last_date - pd.DateOffset(months = 1)).date()

            plt.style.use('dark_background')

            # Plot smart profits
            plt.plot(test['ds'], test['hold_profit'], 'b',
                     linewidth = 1.8, label = 'Buy and Hold Strategy') 

            # Plot prediction profits
            plt.plot(test['ds'], test['pred_profit'], 
                     color = 'g' if final_profit > 0 else 'r',
                     linewidth = 1.8, label = 'Prediction Strategy')

            # Display final values on graph
            plt.text(x = text_location, 
                     y =  final_profit + (final_profit / 40),
                     s = '$%d' % final_profit,
                    color = 'g' if final_profit > 0 else 'r',
                    size = 18)
            
            plt.text(x = text_location, 
                     y =  final_smart + (final_smart / 40),
                     s = '$%d' % final_smart,
                    color = 'g' if final_smart > 0 else 'r',
                    size = 18);

            # Plot formatting
            plt.ylabel('Profit  (US $)'); plt.xlabel('Date'); 
            plt.title('Predicted versus Buy and Hold Profits');
            plt.legend(loc = 2, prop={'size': 10});
            plt.grid(alpha=0.2); 
            plt.show()
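
A toy check of the profit bookkeeping above (hypothetical moves, 10 shares); losses are handled automatically because real_diff is negative on down days:

import numpy as np
real_diff = np.array([2.0, -1.0, 3.0])  # price moves on days the model said "up"
print(np.sum(10 * real_diff))  # 40.0: +20 - 10 + 30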
Example #4
from typing import Dict, List, Optional, Union

import pandas as pd


def add_missing_row(
    df: pd.DataFrame,
    id_cols: List[str],
    reference_col: str,
    complete_index: Optional[Union[Dict[str, str], List[str]]] = None,
    method: Optional[str] = None,
    cols_to_keep: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Add missing rows to a df based on a reference column

    ---

    ### Parameters

    *mandatory:*
    - `id_cols` (*list of str*): names of the columns used to create each group
    - `reference_col` (*str*): name of the column used to identify missing rows

    *optional:*
    - `complete_index` (*list* or *dict*): a list of values (e.g. `[A, B, C]`) used to add missing rows.
      It can also be a dict declaring a date range.
      By default, all values of `reference_col` are used.
    - `method` (*str*): by default all missing rows are added. The possible values are:
        - `"between"`: add missing rows having their value between the min and max values of each group,
        - `"between_and_after"`: add missing rows having their value bigger than the min value of each group,
        - `"between_and_before"`: add missing rows having their value smaller than the max value of each group.
    - `cols_to_keep` (*list of str*): names of other columns to keep, linked to the `reference_col`.

    ---

    ### Example

    **Input**

    YEAR | MONTH | NAME
    :---:|:---:|:--:
    2017|1|A
    2017|2|A
    2017|3|A
    2017|1|B
    2017|3|B

    ```cson
    add_missing_row:
      id_cols: ['NAME']
      reference_col: 'MONTH'
    ```

    **Output**

    YEAR | MONTH | NAME
    :---:|:---:|:--:
    2017|1|A
    2017|2|A
    2017|3|A
    2017|1|B
    2017|2|B
    2017|3|B

    """
    if cols_to_keep is None:
        cols_for_index = [reference_col]
    else:
        cols_for_index = [reference_col] + cols_to_keep
    check_params_columns_duplicate(id_cols + cols_for_index)

    if method == 'between' or method == 'between_and_after':
        df['start'] = df.groupby(id_cols)[reference_col].transform(min)
        id_cols = id_cols + ['start']  # avoid mutating the caller's list
    if method == 'between' or method == 'between_and_before':
        df['end'] = df.groupby(id_cols)[reference_col].transform(max)
        id_cols = id_cols + ['end']

    names = id_cols + cols_for_index
    new_df = df.set_index(names)
    index_values = df.groupby(id_cols).sum().index.values

    if complete_index is None:
        complete_index = df.groupby(cols_for_index).sum().index.values
    elif isinstance(complete_index, dict):
        if complete_index['type'] == 'date':
            freq = complete_index['freq']
            date_format = complete_index['format']
            start = complete_index['start']
            end = complete_index['end']
            if isinstance(freq, dict):
                freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})
            complete_index = pd.date_range(start=start, end=end, freq=freq)
            complete_index = complete_index.strftime(date_format)
        else:
            raise ParamsValueError(f'Unknown complete index type: '
                                   f'{complete_index["type"]}')

    if not isinstance(index_values[0], tuple):
        index_values = [(x,) for x in index_values]
    if not isinstance(complete_index[0], tuple):
        complete_index = [(x,) for x in complete_index]
    new_tuples_index = [x + y for x in index_values for y in complete_index]

    new_index = pd.MultiIndex.from_tuples(new_tuples_index, names=names)
    new_df = new_df.reindex(new_index).reset_index()

    if method == 'between' or method == 'between_and_after':
        new_df = new_df[new_df[reference_col] >= new_df['start']]
        del new_df['start']
    if method == 'between' or method == 'between_and_before':
        new_df = new_df[new_df[reference_col] <= new_df['end']]
        del new_df['end']

    return new_df
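
A runnable sketch of the docstring example; `check_params_columns_duplicate` is stubbed out because the listing does not define it:

def check_params_columns_duplicate(cols):  # no-op stand-in for this demo
    pass

df = pd.DataFrame({
    'YEAR': [2017] * 5,
    'MONTH': [1, 2, 3, 1, 3],
    'NAME': ['A', 'A', 'A', 'B', 'B'],
})
print(add_missing_row(df, id_cols=['NAME'], reference_col='MONTH'))
# The missing (B, 2) row is added; its YEAR is NaN because only the
# id/reference columns are rebuilt by the reindex.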
Example #5
    def get_intraday_history(self, symbol, from_date=None, to_date=None):
        """
        Returns the historical quotes of the specified ticker, narrowed by date.

        Parameters
        ----------
        symbol : str
            The name of the symbol used to retrieve the information.
        from_date : datetime
            The start date (Argentina Time Zone) used to filter the information.
        to_date : datetime
            The end date (Argentina Time Zone) used to filter the information.

        Raises
        ------
        pyhomebroker.exceptions.SessionException
            If the user is not logged in.
        requests.exceptions.HTTPError
            There is a problem related to the HTTP request.
        """

        if not self._auth.is_user_logged_in:
            raise SessionException('User is not logged in')

        headers = {
            'User-Agent': user_agent,
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded'
        }

        if from_date is None:
            from_date = datetime.date.today()

        if to_date is None:
            to_date = from_date + datetime.timedelta(days=1)

        from_date = from_date + datetime.timedelta(seconds=self.__hours * 3600)
        to_date = to_date + datetime.timedelta(seconds=self.__hours * 3600)

        url = '{}/Intradiario/history?symbol={}&resolution=1&from={}&to={}'.format(
            self._auth.broker['page'], symbol.upper(),
            self.__convert_datetime_to_epoch(from_date),
            self.__convert_datetime_to_epoch(to_date))

        resp = rq.get(url,
                      headers=headers,
                      cookies=self._auth.cookies,
                      proxies=self._proxies)
        resp.raise_for_status()
        resp = resp.json()

        df = pd.DataFrame({
            'date': resp['t'],
            'open': resp['o'],
            'high': resp['h'],
            'low': resp['l'],
            'close': resp['c'],
            'volume': resp['v']
        })
        df.date = pd.to_datetime(
            df.date, unit='s') - pd.DateOffset(seconds=self.__hours * 3600)
        df.volume = df.volume.astype(int)

        return df
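
A usage sketch, assuming the usual pyhomebroker entry points (HomeBroker, auth.login and the history namespace); the broker id, credentials, symbol and dates are placeholders:

from datetime import date
from pyhomebroker import HomeBroker

hb = HomeBroker(81)  # placeholder broker id
hb.auth.login(dni='12345678', user='user', password='password', raise_exception=True)
bars = hb.history.get_intraday_history('GGAL',
                                       from_date=date(2020, 6, 1),
                                       to_date=date(2020, 6, 2))
print(bars.head())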
Example #6
                       period_df_nona_clean,
                       on='User_id')
cycle_table_filter = cycle_table[(cycle_table.start_date_clean_y >
                                  cycle_table.end_date_clean_x)]

# keep only the most relevant possible cycle
cycle_table_filter_2 = cycle_table_filter\
    .sort_values(by=['start_date_clean_x', 'start_date_clean_y'])\
    .drop_duplicates(subset=['User_id', 'start_date_clean_x'], keep='first')
cycle_table_filter_2 = cycle_table_filter_2.assign(
    cycle_length=cycle_table_filter_2.apply(
        lambda x: (x['start_date_clean_y'] - x['start_date_clean_x']).days,
        axis=1))
cycle_table_filter_2 = cycle_table_filter_2.assign(
    end_cycle=cycle_table_filter_2.apply(
        lambda x: x['start_date_clean_y'] - pd.DateOffset(1)
        if x['cycle_length'] < 40 else pd.NaT,
        axis=1))

cols_to_keep = [
    'User_id', 'start_date_x', 'end_date_x', 'start_date_clean_x',
    'end_date_clean_x', 'cycle_length', 'end_cycle'
]
rich_period_df = pd.merge(period_df,
                          cycle_table_filter_2[cols_to_keep],
                          left_on=['User_id', 'start_date'],
                          right_on=['User_id', 'start_date_x'],
                          how='left')

rich_period_df = rich_period_df.drop(columns=['start_date_x', 'end_date_x'])
new_col_names = [c.replace('_x', '') for c in rich_period_df.columns]
Example #7
                                          frequency='1d',
                                          field=field,
                                          data_frequency='daily')


# ### View Data
# Let's get returns data for our risk model using the `get_pricing` function. For this model, we'll look back over 5 years of data.

# In[14]:


five_year_returns = \
    get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_end_date - pd.DateOffset(years=5),
        universe_end_date)\
    .pct_change()[1:].fillna(0)

five_year_returns

# # Statistical Risk Model
# It's time to build the risk model. You'll be creating a statistical risk model using PCA, so the first step is building the PCA model.
# ## Fit PCA
# Implement `fit_pca` to fit a PCA model to the returns data.

# In[18]:

from sklearn.decomposition import PCA
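
# The listing cuts off here. A minimal sketch of the requested fit_pca, under
# the signature this exercise usually expects (parameter names assumed):
def fit_pca(returns, num_factor_exposures, svd_solver):
    """Fit a PCA model to a (dates x tickers) returns DataFrame."""
    pca = PCA(n_components=num_factor_exposures, svd_solver=svd_solver)
    pca.fit(returns)
    return pca

pca = fit_pca(five_year_returns, 20, 'full')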

Example #8
    def create_dataset(self, nwps, data_path, start_index=9001, test=False):
        self.data['dayweek'] = self.data.index.dayofweek
        self.data['month'] = self.data.index.month
        self.data['hour'] = self.data.index.hour
        self.data['sp_index'] = [self.sp_index(d) for d in self.data.index]

        dataset = pd.DataFrame()
        target = pd.Series(name='target', dtype=float)
        dataset_3d = np.array([])

        nwps_lstm = nwps.copy(deep=True)
        for var in self.variables:
            if var == 'WS':
                var = 'wind'
            elif var == 'WD':
                var = 'direction'
            elif var == 'Temperature':
                var = 'Temp'
            cols = [
                col for col in nwps.columns if str.lower(var) in str.lower(col)
            ]
            nwps_lstm[str.lower(var)] = nwps_lstm[cols].mean(axis=1).values
        lags1 = np.hstack([
            np.arange(24, 52),
            np.arange(71, 75),
            96,
            120,
            144,
            np.arange(166, 175),
            192,
        ])
        lags2 = np.hstack([np.arange(8735, 8741), 8760, 8736 + 168])
        lags_days = np.arange(1, 8)

        for date in self.data.index[start_index:]:
            date_inp1 = [date - pd.DateOffset(hours=int(l)) for l in lags1]
            date_inp2 = [date - pd.DateOffset(hours=int(l)) for l in lags2]
            date_days = [date - pd.DateOffset(days=int(l)) for l in lags_days]

            try:
                temp_max = nwps[['Temp_max']].loc[date].values
                var_imp = np.hstack(
                    (temp_max,
                     self.data[['hour', 'month', 'sp_index',
                                'dayweek']].loc[date].values,
                     nwps.drop(columns=['Temp_max']).loc[date].values,
                     np.power(self.data['month'].loc[date] * temp_max / 12, 3),
                     np.power(self.data['sp_index'].loc[date] * temp_max / 100,
                              3)))

                col = ['Temp', 'hour', 'month', 'sp_index', 'dayweek'
                       ] + nwps.drop(columns=['Temp_max']).columns.tolist() + [
                           'Temp_month', 'Temp_sp_days'
                       ]

                var_unimp = np.hstack((
                    self.data.loc[date_inp1, 'SCADA'].values,
                    self.data.loc[date_inp2, 'SCADA'].values,
                    self.data.loc[date_inp1, 'APE_net'].values,
                    self.data.loc[date_inp1, 'SCADA'].values +
                    self.data.loc[date_inp1, 'APE_net'].values,
                    nwps.loc[date_days, 'Temp_max'].values,
                    nwps.loc[date_days, 'Temp_min'].values,
                ))
                col += ['SCADA_' + str(i) for i in range(45)]
                col += ['SCADA_' + str(i) for i in range(45, 53)]
                col += ['APE_' + str(i) for i in range(45)]
                col += ['TOTAL_' + str(i) for i in range(45)]
                col += ['Temp_max_' + str(i) for i in range(7)]
                col += ['Temp_min_' + str(i) for i in range(7)]

                temp_max = nwps[['Temp_max']].loc[date].values
                var_3d = np.hstack(
                    (np.array([0]), self.data.loc[date, 'APE_net'],
                     self.data.loc[date, 'APE_net'] +
                     self.data.loc[date, 'SCADA'], nwps_lstm[[
                         'cloud', 'wind', 'direction', 'Temp_max', 'Temp_min',
                         'Temp_athens', 'Temp_thessaloniki', 'Temp_ioannina',
                         'Temp_larissa', 'Temp_patra'
                     ]].loc[date].values,
                     self.data[['hour', 'month', 'sp_index',
                                'dayweek']].loc[date].values,
                     np.power(self.data['month'].loc[date] * temp_max / 12, 3),
                     np.power(self.data['sp_index'].loc[date] * temp_max / 100,
                              3)))
                for d in date_inp1:
                    temp_max = nwps[['Temp_max']].loc[d].values
                    v = np.hstack(
                        (self.data.loc[d, 'SCADA'], self.data.loc[d,
                                                                  'APE_net'],
                         self.data.loc[d, 'APE_net'] +
                         self.data.loc[d, 'SCADA'], nwps_lstm[[
                             'cloud', 'wind', 'direction', 'Temp_max',
                             'Temp_min', 'Temp_athens', 'Temp_thessaloniki',
                             'Temp_ioannina', 'Temp_larissa', 'Temp_patra'
                         ]].loc[d].values,
                         self.data[['hour', 'month', 'sp_index',
                                    'dayweek']].loc[d].values,
                         np.power(self.data['month'].loc[d] * temp_max / 12,
                                  3),
                         np.power(
                             self.data['sp_index'].loc[d] * temp_max / 100,
                             3)))
                    var_3d = np.vstack((var_3d, v))

            except Exception:
                # dates with missing lagged inputs raise KeyError; skip them
                continue
            inp = np.hstack((var_imp, var_unimp))

            inp1 = pd.Series(inp, index=col, name=date)
            targ1 = pd.Series(self.data['SCADA'].loc[date],
                              index=[date],
                              name='target1')
            if not inp1.isnull().any() and not targ1.isnull().any():
                # DataFrame.append was removed in pandas 2.0; use concat instead
                dataset = pd.concat([dataset, inp1.to_frame().T])
                target = pd.concat([target, targ1])
                if dataset_3d.shape[0] == 0:
                    dataset_3d = var_3d
                elif len(dataset_3d.shape) == 2:
                    dataset_3d = np.stack((dataset_3d, var_3d))
                else:
                    dataset_3d = np.vstack(
                        (dataset_3d, var_3d[np.newaxis, :, :]))
        if not test:
            corr = []
            for f in range(dataset.shape[1]):
                corr.append(
                    np.abs(
                        np.corrcoef(dataset.values[:, f],
                                    target.values.ravel())[1, 0]))
            ind = np.argsort(np.array(corr))[::-1]
            columns = dataset.columns[ind]
            dataset = dataset[columns]
            joblib.dump(
                ind, os.path.join(data_path, 'dataset_columns_order.pickle'))
        else:
            ind = joblib.load(
                os.path.join(data_path, 'dataset_columns_order.pickle'))
            columns = dataset.columns[ind]
            dataset = dataset[columns]
        return dataset, target, dataset_3d
Example #9
def append_feature(data, price):
    next_index = data.index[-1] + pd.DateOffset(1)
    data.loc[next_index] = [price, data['Adj. Volume'].iloc[-1]]
    return data
Example #10
first_new_transaction = new_merchant_transactions.groupby('card_id').agg({'month_lag' : 'min', 'purchase_date' : 'min'}).reset_index()
first_new_transaction.columns = ['card_id', 'new_month_lag', 'new_purchase_date']

# In[ ]:


# converting to datetime
last_hist_transaction['hist_purchase_date'] = pd.to_datetime(last_hist_transaction['hist_purchase_date']) 
first_new_transaction['new_purchase_date'] = pd.to_datetime(first_new_transaction['new_purchase_date']) 

# In[ ]:


# subtracting month_lag for each row
last_hist_transaction['observation_date'] = \
    last_hist_transaction.apply(lambda x: x['hist_purchase_date']  - pd.DateOffset(months=x['hist_month_lag']), axis=1)

first_new_transaction['observation_date'] = \
    first_new_transaction.apply(lambda x: x['new_purchase_date']  - pd.DateOffset(months=x['new_month_lag']-1), axis=1)

# At this point we have just reversed the month_lag computation to get a rough estimate of the `observation_date` for each `card_id`. As you can see below, the `observation_date` already differs for many cards!
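
# A quick numeric check of the reversal (hypothetical date): a purchase made at
# month_lag = -2 maps to an observation_date two months later.
print(pd.Timestamp('2018-03-15') - pd.DateOffset(months=-2))  # 2018-05-15 00:00:00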

# In[ ]:


last_hist_transaction.head(20)

# In[ ]:


first_new_transaction.head(20)
Example #11
import numpy as np
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objects as go
from dash.dependencies import Input, Output

global_ndays_range = 20

# --- Start --- Reading base data for the Sunburst
industry_sentiment = pd.read_json('covidsm_agg_sentiment2_industry.json.zip',
                                  orient='records')
industry_sentiment['published_at_date'] = pd.to_datetime(
    industry_sentiment['published_at_date'], unit='ms')
global_start_day = industry_sentiment['published_at_date'].max() - pd.DateOffset(days=global_ndays_range)
industries_hrchy = pd.read_csv('industries-hrchy.csv')
industries_hrchy = industries_hrchy.replace(np.nan, '', regex=True)
# --- End --- Reading base data for the Sunburst

# --- Start --- Load base Sunburst (no data)
fig_layout = dict(margin=dict(t=0, l=0, r=0, b=0), width=800, height=850)
fig_ind = go.Figure(data=[
    go.Sunburst(ids=['total'],
                labels=['All Industries'],
                parents=[''],
                marker=dict(colors=[0], colorscale='RdBu', cmid=0),
                hovertemplate=
                '<b>(%{id})</b> %{label} <br>- Sentiment score: %{color:.2f}')
],
                    layout=fig_layout)
Example #12
        wget.download(
            'https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz',
            out=str(dataset_path / 'loc-gowalla_edges.txt.gz'), bar=print_progressbar)
        wget.download(
            'https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz',
            out=str(dataset_path / 'loc-gowalla_totalCheckins.txt.gz'), bar=print_progressbar)

    gowalla_dataset = pd.read_csv(
        dataset_path / 'loc-gowalla_totalCheckins.txt.gz',
        sep='\t', names=['userId', 'timestamp', 'long', 'lat', 'loc_id'])
    gowalla_dataset['timestamp'] = pd.to_datetime(gowalla_dataset['timestamp']).dt.tz_localize(None)

    split_date = pd.to_datetime(config['SPLIT_DATE'])
    start_date = gowalla_dataset['timestamp'].min() \
        if 'TRAIN_DAYS' not in config \
        else pd.to_datetime(split_date - pd.DateOffset(days=config['TRAIN_DAYS']))
    end_test_date = split_date + pd.DateOffset(days=config['TEST_DAYS'])
    # extend past the test window by VAL_DAYS only when validation days are configured
    end_date = end_test_date + pd.DateOffset(days=config['VAL_DAYS'] if 'VAL_DAYS' in config else 0)

    timestamp_filter = (gowalla_dataset['timestamp'] >= start_date) & (
                gowalla_dataset['timestamp'] <= end_date)
    gowalla_dataset = gowalla_dataset[timestamp_filter]
    gowalla_dataset.sort_values('timestamp', inplace=True)

    new_user_ids = {k: v for v, k in enumerate(gowalla_dataset['userId'].unique())}
    new_item_ids = {k: v for v, k in enumerate(gowalla_dataset['loc_id'].unique())}

    gowalla_dataset['userId'] = gowalla_dataset['userId'].map(new_user_ids)
    gowalla_dataset['loc_id'] = gowalla_dataset['loc_id'].map(new_item_ids)
Example #13
    def _week_to_date(self, row):
        origin_date = pd.to_datetime("2019-12-29") if row.Vecka >= 52 else pd.to_datetime("2021-01-03")
        return origin_date + pd.DateOffset(days=7 * int(row.Vecka))
Example #14
import pandas as pd

from fastsr.containers.learning_data import LearningData

dat = pd.read_csv('data/hour.csv')
datetime_index = list()
for i, r in dat.iterrows():
    datetime_index.append(pd.to_datetime(r.iloc[1]) + pd.DateOffset(hours=int(r.iloc[5])))
dt_index = pd.DatetimeIndex(datetime_index)
dat = dat.set_index(dt_index)  # set_index returns a new frame; assign it back
columns = ['atemp', 'windspeed', 'hum', 'cnt']
slim_dat = dat[columns]
learning_data = LearningData()
learning_data.from_data(slim_dat, columns, 'ucisimplebike')
learning_data.lag_predictors(24, column_names=['atemp', 'windspeed', 'hum'])
learning_data.to_hdf('data/hour_simple_lagged.hdf5')
Example #15
import cudf
import pandas as pd


def test_dateoffset_instance_subclass_check():
    assert not issubclass(pd.DateOffset, cudf.DateOffset)
    assert not isinstance(pd.DateOffset(), cudf.DateOffset)
Example #16
def main():
    # path of study folder
    study_path = str(sys.argv[1])
    # participants# (eg. "P301 P302 P401")
    p_nums = str(sys.argv[2])

    t0 = time()

    participants = p_nums.split(' ')

    for p in participants:
        print('Comparing in wild for '+p)
        current_dir = os.getcwd()
        save_folder = os.path.join(os.getcwd(), 'output_files', 'leave_' + p + '_out')
        if os.path.exists(save_folder):
            os.chdir(save_folder)
        else:
            os.chdir(os.path.join(os.getcwd(), 'output_files', 'using_all'))

        # TODO can move to a settings file (test, then delete if not needed)
        model = XGBClassifier(learning_rate=0.01,
                              n_estimators=400,
                              max_depth=10,
                              min_child_weight=1,
                              gamma=0,
                              subsample=1,
                              colsample_bytree=1,
                              scale_pos_weight=1,
                              random_state=7,
                              silent=0,
                              nthread=4
                              )
        model = joblib.load('WRIST.dat')

        path_table = os.path.join(study_path, p, 'In Wild/Summary/Actigraph/', p + ' In Wild IntensityMETMinLevel.csv')
        df_table = pd.read_csv(path_table, index_col=None, header=0)
        path_gyro = os.path.join(study_path, p, 'In Wild/Wrist/Aggregated/Gyroscope/Gyroscope_resampled.csv')
        df_gyro = pd.read_csv(path_gyro, index_col=None, header=0)
        df_gyro['Datetime'] = pd.to_datetime(df_gyro['Time'], unit='ms', utc=True).dt.tz_convert(
            'America/Chicago').dt.tz_localize(None)

        # minute level data at 20hz, so 60*20 = 1200
        data_length = 1200
        nan_limit = 4

        prediction = []
        for n in df_table['Datetime']:
            start_time = pd.to_datetime(n)
            end_time = start_time + pd.DateOffset(minutes=1)
            temp_gyro = df_gyro.loc[(df_gyro['Datetime'] >= start_time)
                                    & (df_gyro['Datetime'] < end_time)].reset_index(drop=True)
            if len(temp_gyro['rotX']) == data_length:
                this_min_gyro = [temp_gyro['rotX'], temp_gyro['rotY'], temp_gyro['rotZ']]
                if np.count_nonzero(np.isnan(this_min_gyro[0])) > nan_limit:
                    prediction.append(-1)
                else:
                    model_output = model.predict(extract_features([this_min_gyro]))
                    prediction.append(model_output[0])
            else:
                prediction.append(-1)

        df_table['model_classification'] = prediction

        print("Hours of data: %g" % (float(len(df_table)) / float(60)))

        set_realistic_met_estimate(df_table)
        df_table.to_csv(p+'_in_wild_comparison.csv', index=False, encoding='utf8')

        l_datetime_all = df_table['Datetime'].tolist()
        l_freedson_all = df_table['MET (Freedson)'].tolist()
        l_vm3_all = df_table['MET (VM3)'].tolist()
        l_estimation_all = df_table['estimation'].tolist()
        # Drop minutes without a model estimate; build the keep-mask first so
        # every list is filtered against the original estimation values.
        keep = [i for i in range(len(l_estimation_all)) if not np.isnan(l_estimation_all[i])]
        l_freedson_all = [l_freedson_all[i] for i in keep]
        l_vm3_all = [l_vm3_all[i] for i in keep]
        l_datetime_all = [l_datetime_all[i] for i in keep]
        l_estimation_all = [l_estimation_all[i] for i in keep]
        vm3_all_reshaped = np.array(l_vm3_all).reshape(-1, 1)
        estimation_all_reshaped = np.array(l_estimation_all).reshape(-1, 1)
        freedson_all_reshaped = np.array(l_freedson_all).reshape(-1, 1)


        fig = go.Figure()
        fig.add_trace(go.Scatter(x=l_estimation_all, y=l_vm3_all, mode='markers'))
        regr = linear_model.LinearRegression()
        regr.fit(estimation_all_reshaped, vm3_all_reshaped)
        y_pred = regr.predict(estimation_all_reshaped)
        y_plot = np.reshape(y_pred, y_pred.shape[0])
        fig.add_trace(go.Scatter(x=l_estimation_all, y=y_plot, mode='lines', name='linear regression',
                                 line=dict(color='red', width=4)))
        fig.update_layout(title='Linear Regression',
                          xaxis_title='Estimation',
                          yaxis_title='VM3 METs')
        outf = open('wild_est_vs_vm3_r2.txt', 'a')
        outf.write('%g\n' % r2_score(vm3_all_reshaped, y_pred))
        outf.close()
        print("The r2 score for in wild estimation vs VM3 is: %g" % (r2_score(vm3_all_reshaped, y_pred)))

        # calculate Pearson's correlation
        corr, _ = pearsonr(np.array(l_estimation_all), np.array(l_vm3_all))
        print('Pearsons correlation: %g' % corr)
        outf = open('wild_est_vs_vm3_pearson.txt', 'a')
        outf.write('%g\n' % corr)
        outf.close()
        # calculate Spearman's correlation
        corr, _ = spearmanr(np.array(l_estimation_all), np.array(l_vm3_all))
        print('Spearmans correlation: %g' % corr)
        outf = open('wild_est_vs_vm3_spearman.txt', 'a')
        outf.write('%g\n' % corr)
        outf.close()

        py.offline.plot(fig, filename='in_wild_model_to_vm3.html', auto_open=False)


        fig = go.Figure()
        fig.add_trace(go.Scatter(x=l_estimation_all, y=l_freedson_all, mode='markers'))
        regr = linear_model.LinearRegression()
        regr.fit(estimation_all_reshaped, freedson_all_reshaped)
        y_pred = regr.predict(estimation_all_reshaped)
        y_plot = np.reshape(y_pred, y_pred.shape[0])
        fig.add_trace(go.Scatter(x=l_estimation_all, y=y_plot, mode='lines', name='linear regression',
                                 line=dict(color='red', width=4)))
        fig.update_layout(title='Linear Regression',
                          xaxis_title='Estimation',
                          yaxis_title='Freedson METs')
        outf = open('wild_est_vs_freedson_r2.txt', 'a')
        outf.write('%g\n' % r2_score(freedson_all_reshaped, y_pred))
        outf.close()
        print("The r2 score for in wild estimation vs Freedson is: %g" % (r2_score(freedson_all_reshaped, y_pred)))
        py.offline.plot(fig, filename='in_wild_model_to_freedson.html', auto_open=False)


        fig = go.Figure()
        fig.add_trace(go.Scatter(x=l_datetime_all, y=l_estimation_all, mode='markers', name='model estimation'))
        fig.add_trace(go.Scatter(x=l_datetime_all, y=l_vm3_all, mode='markers', name='actigraph vm3'))
        fig.add_trace(go.Scatter(x=l_datetime_all, y=l_freedson_all, mode='markers', name='actigraph freedson'))
        fig.update_layout(title='Model and ActiGraph Estimation',
                          xaxis_title='Datetime',
                          yaxis_title='MET')
        py.offline.plot(fig, filename='in_wild_comparison.html', auto_open=False)

        os.chdir(current_dir)

    t1 = time()
    print("Total in wild comparison time: %g minutes" % (float(t1 - t0) / float(60)))
Example #17
def test_datetime64_with_DateOffset(klass, assert_func):
    s = klass(date_range('2000-01-01', '2000-01-31'), name='a')
    result = s + pd.DateOffset(years=1)
    result2 = pd.DateOffset(years=1) + s
    exp = klass(date_range('2001-01-01', '2001-01-31'), name='a')
    assert_func(result, exp)
    assert_func(result2, exp)

    result = s - pd.DateOffset(years=1)
    exp = klass(date_range('1999-01-01', '1999-01-31'), name='a')
    assert_func(result, exp)

    s = klass([
        Timestamp('2000-01-15 00:15:00', tz='US/Central'),
        pd.Timestamp('2000-02-15', tz='US/Central')
    ],
              name='a')
    result = s + pd.offsets.Day()
    result2 = pd.offsets.Day() + s
    exp = klass([
        Timestamp('2000-01-16 00:15:00', tz='US/Central'),
        Timestamp('2000-02-16', tz='US/Central')
    ],
                name='a')
    assert_func(result, exp)
    assert_func(result2, exp)

    s = klass([
        Timestamp('2000-01-15 00:15:00', tz='US/Central'),
        pd.Timestamp('2000-02-15', tz='US/Central')
    ],
              name='a')
    result = s + pd.offsets.MonthEnd()
    result2 = pd.offsets.MonthEnd() + s
    exp = klass([
        Timestamp('2000-01-31 00:15:00', tz='US/Central'),
        Timestamp('2000-02-29', tz='US/Central')
    ],
                name='a')
    assert_func(result, exp)
    assert_func(result2, exp)

    # array of offsets - valid for Series only
    if klass is Series:
        with tm.assert_produces_warning(PerformanceWarning):
            s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')])
            result = s + Series(
                [pd.offsets.DateOffset(years=1),
                 pd.offsets.MonthEnd()])
            exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29')])
            assert_func(result, exp)

            # same offset
            result = s + Series([
                pd.offsets.DateOffset(years=1),
                pd.offsets.DateOffset(years=1)
            ])
            exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')])
            assert_func(result, exp)

    s = klass([
        Timestamp('2000-01-05 00:15:00'),
        Timestamp('2000-01-31 00:23:00'),
        Timestamp('2000-01-01'),
        Timestamp('2000-03-31'),
        Timestamp('2000-02-29'),
        Timestamp('2000-12-31'),
        Timestamp('2000-05-15'),
        Timestamp('2001-06-15')
    ])

    # DateOffset relativedelta fastpath
    relative_kwargs = [('years', 2), ('months', 5), ('days', 3), ('hours', 5),
                       ('minutes', 10), ('seconds', 2), ('microseconds', 5)]
    for i, kwd in enumerate(relative_kwargs):
        op = pd.DateOffset(**dict([kwd]))
        assert_func(klass([x + op for x in s]), s + op)
        assert_func(klass([x - op for x in s]), s - op)
        op = pd.DateOffset(**dict(relative_kwargs[:i + 1]))
        assert_func(klass([x + op for x in s]), s + op)
        assert_func(klass([x - op for x in s]), s - op)

    # assert these are equal on a piecewise basis
    offsets = [
        'YearBegin', ('YearBegin', {
            'month': 5
        }), 'YearEnd', ('YearEnd', {
            'month': 5
        }), 'MonthBegin', 'MonthEnd', 'SemiMonthEnd', 'SemiMonthBegin', 'Week',
        ('Week', {
            'weekday': 3
        }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin',
        'CustomBusinessDay', 'CDay', 'CBMonthEnd', 'CBMonthBegin',
        'BMonthBegin', 'BMonthEnd', 'BusinessHour', 'BYearBegin', 'BYearEnd',
        'BQuarterBegin', ('LastWeekOfMonth', {
            'weekday': 2
        }),
        ('FY5253Quarter', {
            'qtr_with_extra_week': 1,
            'startingMonth': 1,
            'weekday': 2,
            'variation': 'nearest'
        }),
        ('FY5253', {
            'weekday': 0,
            'startingMonth': 2,
            'variation': 'nearest'
        }), ('WeekOfMonth', {
            'weekday': 2,
            'week': 2
        }), 'Easter', ('DateOffset', {
            'day': 4
        }), ('DateOffset', {
            'month': 5
        })
    ]

    with warnings.catch_warnings(record=True):
        for normalize in (True, False):
            for do in offsets:
                if isinstance(do, tuple):
                    do, kwargs = do
                else:
                    do = do
                    kwargs = {}

                for n in [0, 5]:
                    if (do in [
                            'WeekOfMonth', 'LastWeekOfMonth',
                            'FY5253Quarter', 'FY5253'
                    ] and n == 0):
                        continue
                    op = getattr(pd.offsets, do)(n,
                                                 normalize=normalize,
                                                 **kwargs)
                    assert_func(klass([x + op for x in s]), s + op)
                    assert_func(klass([x - op for x in s]), s - op)
                    assert_func(klass([op + x for x in s]), op + s)
Example #18
volumeData['lag6'] = volumeData['volume'].shift(6)
volumeData['lag7'] = volumeData['volume'].shift(7)

# COMMAND ----------

volumeData

# COMMAND ----------

import seaborn as sns
#sns.heatmap(volumeData.corr(), annot=True, fmt=".2f")
#display()

# COMMAND ----------

volumeDataTest['time'] = volumeDataTest['time'] + pd.DateOffset(hours=2)
volumeDataTest2 = volumeDataTest
del volumeDataTest

# COMMAND ----------

volumeDataTest2.head()

# COMMAND ----------

print(volumeData.shape, volumeDataTest2.shape)

# COMMAND ----------

# append columns to find whether it is a holiday, weekend or weekday and what is the hour in 24 hours
from datetime import datetime
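
# COMMAND ----------

# A sketch of the described columns, assuming volumeData keeps a datetime
# 'time' column like volumeDataTest2 does; the holiday set is a placeholder.
holidays = set()  # fill with the study period's holiday dates
volumeData['hour'] = volumeData['time'].dt.hour
volumeData['is_weekend'] = (volumeData['time'].dt.dayofweek >= 5).astype(int)
volumeData['is_holiday'] = volumeData['time'].dt.date.isin(holidays).astype(int)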
Example #19
# Load the data
filename = "c:\\LOG\\879448_2020-11-18-13-00-00_2020-11-25-09-00-00"  # !! edit !!
data = pd.read_csv(filename+"_RAW.csv", parse_dates=[2])
data['flag'] = ''

print('records:', len(data))

# Build the time axis
# Generate the target 5-minute time grid from the data
tmin = _min if _min < data['TIME'].min() else data['TIME'].min()  # !! edit !!
tmax = _max if _max > data['TIME'].max() else data['TIME'].max()  # !! edit !!
# tmin = data['TIME'].min()
# tmax = data['TIME'].max()
# Gridlist = pd.date_range(tmin.replace(microsecond=0, second=0, minute=tmin.minute//5*5), tmax+pd.DateOffset(minutes=5), freq='5T')
# Gridlist = pd.date_range(tmin, tmax, freq='5T')
Gridlist = pd.date_range(tmin.replace(microsecond=0, second=0, minute=tmin.minute//5*5), tmax+pd.DateOffset(minutes=5), freq='5T')


Gridlist = pd.DataFrame(Gridlist, columns=['TIME'])
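
# Quick check of the 5-minute flooring used for the grid start (hypothetical timestamp):
t = pd.Timestamp('2020-11-18 13:07:42')
print(t.replace(microsecond=0, second=0, minute=t.minute//5*5))  # 2020-11-18 13:05:00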

# Data processing

# Build the set of IDs
IDSet = set(data['ID'].values)

# Process each ID in turn
for id in IDSet:
    print('\nProcessing ', id)
    data_per_ID = data[data.ID == id]
    data1 = data_per_ID.values.tolist()
    print('raw records:', len(data1))
Example #20
    distances = pd.read_csv(args.distances, sep="\t")
    distances_by_sample_names = get_distances_by_sample_names(distances)

    # Load model details
    with open(args.model, "r") as fh:
        model_json = json.load(fh)

    predictors = model_json["predictors"]
    cost_function = model_json["cost_function"]
    l1_lambda = model_json["l1_lambda"]
    coefficients = np.array(model_json["coefficients_mean"])
    mean_stds = np.array(model_json["mean_stds_mean"])

    delta_month = args.delta_months[-1]
    delta_time = delta_month / 12.0
    delta_offset = pd.DateOffset(months=delta_month)

    model = DistanceExponentialGrowthModel(predictors=predictors,
                                           delta_time=delta_time,
                                           cost_function=cost_function,
                                           l1_lambda=l1_lambda,
                                           distances=distances_by_sample_names)
    model.coef_ = coefficients
    model.mean_stds_ = mean_stds

    # collect fitness and projection
    forecasts_df = model.predict(tips)
    forecasts_df["weighted_distance_to_future_by_%s" %
                 "-".join(predictors)] = forecasts_df["y"]

    # collect dicts from dataframe
Example #21
    adjustment_reader=bundle_data.adjustment_reader)

# Function to retrieve prices from the data portal
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')

    end_loc = trading_calendar.closes.index.get_loc(end_dt)
    start_loc = trading_calendar.closes.index.get_loc(start_dt)

    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')

# We retrieve the data for our risk model using the get_pricing function.
# For this model, we will use 5 years of data.
five_year_returns = \
    get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_end_date - pd.DateOffset(years=5),
        universe_end_date)\
    .pct_change()[1:].fillna(0)

five_year_returns.sample(5)
Example #22
    def custom1(inst):
        # "pds" is presumably pandas, imported as "import pandas as pds" here
        new_index = inst.index + pds.DateOffset(milliseconds=500)
        d = pds.Series(2.0 * inst['mlt'], index=new_index)
        d.name = 'doubleMLT'
        print(new_index)
        return d
Example #23
    def evaluate_prediction(self, start_date=None, end_date=None, nshares = None):

        # Default start date is one year before end of data
        # Default end date is end date of data
        if start_date is None:
            start_date = self.max_date - pd.DateOffset(years=1)
        if end_date is None:
            end_date = self.max_date

        start_date, end_date = self.handle_dates(start_date, end_date)

        # Training data starts self.training_years years before start date and goes up to start date
        train = self.stock[(self.stock['Date'] < start_date) &
                           (self.stock['Date'] > (start_date - pd.DateOffset(years=self.training_years)))]

        # Testing data is specified in the range
        test = self.stock[(self.stock['Date'] >= start_date) & (self.stock['Date'] <= end_date)]

        # Create and train the model
        model = self.create_model()
        model.fit(train)

        # Make a future dataframe and predictions
        future = model.make_future_dataframe(periods = 365, freq='D')
        future = model.predict(future)

        # Merge predictions with the known values
        test = pd.merge(test, future, on = 'ds', how = 'inner')

        train = pd.merge(train, future, on = 'ds', how = 'inner')

        # Calculate the differences between consecutive measurements
        test['pred_diff'] = test['yhat'].diff()
        test['real_diff'] = test['y'].diff()

        # Correct is when we predicted the correct direction
        test['correct'] = (np.sign(test['pred_diff'][1:]) == np.sign(test['real_diff'][1:])) * 1

        # Accuracy when we predict increase and decrease
        increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct'])
        decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct'])

        # Calculate mean absolute error
        test_errors = abs(test['y'] - test['yhat'])
        test_mean_error = np.mean(test_errors)

        train_errors = abs(train['y'] - train['yhat'])
        train_mean_error = np.mean(train_errors)

        # Calculate percentage of time actual value within prediction range
        test['in_range'] = False

        for i in test.index:
            if (test.loc[i, 'y'] < test.loc[i, 'yhat_upper']) & (test.loc[i, 'y'] > test.loc[i, 'yhat_lower']):
                test.loc[i, 'in_range'] = True

        in_range_accuracy = 100 * np.mean(test['in_range'])

        if not nshares:

            # Date range of predictions
            print('\nPrediction Range: {} to {}.'.format(start_date,
                end_date))

            # Final prediction vs actual value
            print('\nPredicted price on {} = ${:.2f}.'.format(max(future['ds']), future.loc[future.index[-1], 'yhat']))
            print('Actual price on    {} = ${:.2f}.\n'.format(max(test['ds']), test.loc[test.index[-1], 'y']))

            print('Average Absolute Error on Training Data = ${:.2f}.'.format(train_mean_error))
            print('Average Absolute Error on Testing  Data = ${:.2f}.\n'.format(test_mean_error))

            # Direction accuracy
            print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy))
            print('When the model predicted a  decrease, the price decreased  {:.2f}% of the time.\n'.format(decrease_accuracy))

            print('The actual value was within the {:d}% confidence interval {:.2f}% of the time.'.format(int(100 * model.interval_width), in_range_accuracy))

        # If a number of shares is specified, play the game
        elif nshares:

            # Only playing the stocks when we predict the stock will increase
            test_pred_increase = test[test['pred_diff'] > 0]

            test_pred_increase.reset_index(inplace=True)
            prediction_profit = []

            # Iterate through all the predictions and calculate profit from playing
            for i, correct in enumerate(test_pred_increase['correct']):

                # If we predicted up and the price goes up, we gain the difference
                if correct == 1:
                    prediction_profit.append(nshares * test_pred_increase.loc[i, 'real_diff'])
                # If we predicted up and the price goes down, we lose the difference
                else:
                    prediction_profit.append(nshares * test_pred_increase.loc[i, 'real_diff'])

            test_pred_increase['pred_profit'] = prediction_profit

            # Put the profit into the test dataframe
            test = pd.merge(test, test_pred_increase[['ds', 'pred_profit']], on = 'ds', how = 'left')
            test.loc[0, 'pred_profit'] = 0

            # Profit for either method at all dates
            test['pred_profit'] = test['pred_profit'].cumsum().ffill()
            test['hold_profit'] = nshares * (test['y'] - float(test.loc[0, 'y']))

            # Display information
            print('You played the stock market in {} from {} to {} with {} shares.\n'.format(
                self.symbol, start_date, end_date, nshares))

            print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy))
            print('When the model predicted a  decrease, the price decreased  {:.2f}% of the time.\n'.format(decrease_accuracy))

            # Display some friendly information about the perils of playing the stock market
            print('The total profit using the Prophet model = ${:.2f}.'.format(np.sum(prediction_profit)))
            print('The Buy and Hold strategy profit =         ${:.2f}.'.format(float(test.loc[test.index[-1], 'hold_profit'])))
            print('\nThanks for playing the stock market!\n')

            # Plot the predicted and actual profits over time

            # Final profit and final smart used for locating text
            final_profit = test.loc[test.index[-1], 'pred_profit']
            final_smart = test.loc[test.index[-1], 'hold_profit']

            # text location
            last_date = test.loc[test.index[-1], 'ds']
            text_location = (last_date - pd.DateOffset(months = 1))

        return test
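For context, a minimal sketch of how the direction-accuracy and coverage numbers printed above could be computed. The column names ('pred_diff', 'real_diff', 'y', 'yhat_lower', 'yhat_upper') follow the `test` dataframe used in this method, but the helper itself is an illustration, not part of the original class.

import numpy as np
import pandas as pd

def evaluation_metrics(test: pd.DataFrame) -> dict:
    """Direction accuracy and confidence-interval coverage (illustrative)."""
    pred_up = test['pred_diff'] > 0
    # A call is correct when the predicted and actual changes share a sign
    correct = np.sign(test['pred_diff']) == np.sign(test['real_diff'])
    in_range = (test['y'] >= test['yhat_lower']) & (test['y'] <= test['yhat_upper'])
    return {'increase_accuracy': 100 * correct[pred_up].mean(),
            'decrease_accuracy': 100 * correct[~pred_up].mean(),
            'in_range_accuracy': 100 * in_range.mean()}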
Example #24
    def plot1_s4(self, const='G', freq='S4_sig1', sbas=False):
        if self._check_noNull_values(const, freq):
            # Get file UTC date
            figure_name = self._figure_name()
            fecha = figure_name[5:]  # e.g. 200926
            fecha2 = datetime.datetime.strptime(fecha, "%y%m%d")
            fecha3 = datetime.datetime.strftime(fecha2, "%Y/%m/%d")

            fecha2_tomorrow = fecha2 + pd.DateOffset(days=1)
            fecha2_tomorrow = fecha2_tomorrow.to_pydatetime()

            # Get UTC day range, to add a vertical strip
            fecha_morning_first = fecha2 + pd.DateOffset(hours=11)
            fecha_morning_first = fecha_morning_first.to_pydatetime()

            fecha_morning_last = fecha2 + pd.DateOffset(hours=23)
            fecha_morning_last = fecha_morning_last.to_pydatetime()

            # Get the PRNs
            PRNs = self.extract_prns(const, freq)

            # Include SBAS data if applicable
            if sbas: PRNs = self._append_sbas_prns(const, freq, PRNs)

            # Create the figure with the subplots
            n_rows = (len(PRNs) + 1) // 2
            n_cols = 2

            fig, axs = plt.subplots(n_rows,
                                    n_cols,
                                    figsize=(7 * n_cols, 1 * n_rows),
                                    sharex="col",
                                    sharey="row",
                                    gridspec_kw={
                                        'hspace': 0,
                                        'wspace': 0
                                    })
            j = 0

            # Iterate column-major (top to bottom) rather than left to right
            for ax in axs.T.reshape(-1):
                # ax -> s4
                # ax2 -> elevation
                ax2 = ax.twinx()

                if j < len(PRNs):
                    # Plot s4 info
                    prn_value = PRNs[j]

                    # -> Get the correct freq if sbas==True
                    if sbas and prn_value[0] == 'S':
                        freq_n = self._change_frequency(const, freq)
                    else:
                        freq_n = freq

                    df3_s4 = self.get_s4(prn_value, freq_n)

                    color1 = "blue"  # This color is used in y axis labels, ticks and border
                    colors1 = ["lightsteelblue", "cornflowerblue",
                               "navy"]  # These colors are used for the plots

                    for k in range(3):
                        df4_s4 = df3_s4[k + 1]

                        ax.plot(df4_s4.index,
                                df4_s4.values,
                                '.',
                                color=colors1[k],
                                markersize=2)
                        ax.set_facecolor(color="lightgrey")
                        ax.axvspan(fecha_morning_first,
                                   fecha_morning_last,
                                   color="white")  # strip morning/night

                    # Plot elevation info
                    df3_elev = self.get_elevation(PRNs[j], freq)

                    color2 = "orange"
                    ax2.plot(df3_elev.index,
                             df3_elev.values,
                             '.',
                             color=color2,
                             markersize=1)

                    # Annotate the prn in the subplot
                    x_location = fecha2 + pd.Timedelta(minutes=30)
                    ax2.text(x_location,
                             35,
                             self._convert2SVID(PRNs[j]),
                             fontsize=15,
                             weight='roman')  # 0.375

                # Set axis limits
                ax.set_xlim([fecha2, fecha2_tomorrow])
                ax.set_ylim([0, 1])
                ax2.set_ylim([0, 90])

                # Set ticks and tick labels
                # Set y axis format; label alternate subplots only
                len_half_ax = len(axs.T.reshape(-1)) // 2  # subplots per column

                if j >= len_half_ax:  # change only for the 2nd column
                    k = j - len_half_ax

                    # Set y labels only to even subplots
                    ax.yaxis.set_minor_locator(AutoMinorLocator(4))
                    ax.set_yticks([0, 1])
                    ax2.yaxis.set_minor_locator(AutoMinorLocator(4))
                    ax2.set_yticks([0, 90])

                    if k % 2 == 0:
                        ax.set_yticklabels([0, 1])
                        ax2.set_yticklabels([0, 90])
                    else:
                        ax.set_yticklabels(['', ''])
                        ax2.set_yticklabels(['', ''])

                    # Set orange color on the right y axis
                    for axis in ['top', 'bottom', 'left']:
                        ax.spines[axis].set_linewidth(2)
                        ax2.spines[axis].set_linewidth(2)

                    ax.spines['right'].set_color(color2)
                    ax.spines['right'].set_linewidth(2)
                    ax2.spines['right'].set_color(color2)
                    ax2.spines['right'].set_linewidth(2)
                    ax2.tick_params(axis='y', which='both', colors=color2)

                else:  # apply some changes to the 1st column
                    # remove y tick labels for elevation
                    ax2.yaxis.set_minor_locator(AutoMinorLocator(4))
                    ax2.set_yticks([0, 90])
                    ax2.set_yticklabels(['', ''])

                    # set linewidth to top, bottom and right borders of the subplot
                    for axis in ['top', 'bottom', 'right']:
                        ax.spines[axis].set_linewidth(2)
                        ax2.spines[axis].set_linewidth(2)

                    # Set blue color to the left y axis
                    ax.spines['left'].set_color(color1)
                    ax.spines['left'].set_linewidth(2)
                    ax2.spines['left'].set_color(color1)
                    ax2.spines['left'].set_linewidth(2)
                    ax.tick_params(axis='y', which='both', colors=color1)

                # set x axis format
                hours = mdates.HourLocator(interval=2)
                ax.xaxis.set_major_locator(hours)  # ticks interval: 2h
                ax.xaxis.set_minor_locator(
                    AutoMinorLocator(2))  # minor tick division: 2
                myFmt = DateFormatter("%H")
                ax.xaxis.set_major_formatter(myFmt)  # x format: hours

                # set the ticks style
                ax.xaxis.set_tick_params(width=2,
                                         length=8,
                                         which='major',
                                         direction='out')
                ax.xaxis.set_tick_params(width=1,
                                         length=4,
                                         which='minor',
                                         direction='out')
                ax.yaxis.set_tick_params(width=2,
                                         length=15,
                                         which='major',
                                         direction='inout')
                ax.yaxis.set_tick_params(width=1,
                                         length=4,
                                         which='minor',
                                         direction='out')
                ax2.yaxis.set_tick_params(width=2,
                                          length=15,
                                          which='major',
                                          direction='inout')
                ax2.yaxis.set_tick_params(width=1,
                                          length=4,
                                          which='minor',
                                          direction='out')

                # set the label ticks
                ax.tick_params(axis='x', which='major', labelsize=12)
                ax.tick_params(axis='y', labelsize=12)
                ax2.tick_params(axis='y', labelsize=12)

                # set grid
                ax.grid(which='major', axis='both', ls=':', linewidth=1.2)
                ax.grid(which='minor', axis='both', ls=':', alpha=0.5)

                # Set title and axis labels
                aux = self.get_freq_name(const, int(freq[-1]))
                frequency_name = aux["name"]
                frequency_value = aux["value"] + " MHz"

                # -> Title
                if j == 0:  # Subplot on Upper left
                    fig.text(0,
                             1,
                             fecha3,
                             ha='left',
                             va='bottom',
                             fontsize=17,
                             weight='semibold',
                             transform=ax.transAxes)
                    fig.text(0.5,
                             1,
                             'Jicamarca',
                             ha='left',
                             va='bottom',
                             fontsize=17,
                             weight='semibold',
                             transform=ax.transAxes)

                if j == n_rows - 1:  # Subplot on Lower left
                    pass

                if j == n_rows:  # Subplot on Upper right
                    fig.text(0,
                             1,
                             'S4',
                             ha='center',
                             va='bottom',
                             fontsize=17,
                             weight='semibold',
                             transform=ax.transAxes)
                    fig.text(0.4,
                             1,
                             frequency_value,
                             ha='center',
                             va='bottom',
                             fontsize=17,
                             weight='semibold',
                             transform=ax.transAxes)
                    fig.text(
                        1,
                        1,
                        f"{frequency_name} | {self.get_const_name(const)}",
                        ha='right',
                        va='bottom',
                        fontsize=17,
                        weight='semibold',
                        transform=ax.transAxes)

                # -> Labels
                if j == n_rows * n_cols - 1:  # x axis label, Subplot on Lower right
                    fig.text(0,
                             -0.6,
                             'Time UTC',
                             ha='center',
                             va='center',
                             fontsize=14,
                             transform=ax.transAxes)

                if j == int(n_rows / 2):  # y axis label on the left
                    k = (n_rows % 2) * 0.5
                    fig.text(-0.1,
                             1 - k,
                             'S4',
                             ha='center',
                             va='center',
                             rotation='vertical',
                             fontsize=14,
                             color='b',
                             transform=ax.transAxes)

                if j == int(n_rows * n_cols -
                            n_rows / 2):  # y axis label on the right
                    k = (n_rows % 2) * 0.5
                    fig.text(1.1,
                             1 - k,
                             'Elevation Angle',
                             ha='center',
                             va='center',
                             rotation=-90,
                             fontsize=14,
                             color=color2,
                             transform=ax.transAxes)

                j += 1

            # Create directory for output files
            new_directory = output_files_path + figure_name + "/plot_2/"
            if not os.path.exists(new_directory):
                os.makedirs(new_directory)

            # Save figure as pdf
            #figure_name2 = figure_name + f"_s4_{self.get_const_name(const)}_{frequency_name}.pdf"
            #plt.savefig(new_directory + figure_name2, bbox_inches='tight')
            #pdf.savefig(fig)

            print(f"Plotted successfully for const {const} and freq {freq}")
            return fig
        else:
            print(f"Only null data for const {const} and freq {freq}")
            return 0
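A hypothetical call, assuming `sc` is an instance of the scintillation class this method belongs to (the instance name and output filename are assumptions):

fig = sc.plot1_s4(const='G', freq='S4_sig1', sbas=True)
if fig:  # the method returns 0 when the selection holds only nulls
    fig.savefig("s4_plot.png", bbox_inches='tight')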
Example #25
        'max_demand_ind': max_demand_ind,
        'max_part_peak_demand_ind': max_part_peak_demand_ind,
        'max_peak_demand_ind': max_peak_demand_ind
    }
    return retval


def charge_max(index, arrival_time, departure_time, limit_kw):
    """Charging profile drawing limit_kw at every interval after arrival.

    The departure-time constraint is currently relaxed (departure_time is
    unused); the stricter mask is kept below for reference.
    """
    retval = np.zeros(len(index))
    # retval[(index > arrival_time) & (index < departure_time)] = limit_kw
    retval[index > arrival_time] = limit_kw
    return retval


start = pd.Timestamp("2020-03-09 00:00", tz="US/Pacific")  # Monday
index = pd.date_range(start, start + pd.DateOffset(hours=24),
                      freq="15min")[:-1]
names = ["EV1", "EV2", "EV3", "EV4", "EV5", "EV6", "EV7", "EV8", "EV9", "EV10"]
departures = [
    pd.Timestamp("2020-03-09 11:17", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 17:44", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 16:22", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 17:19", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 17:20", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 12:23", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 13:38", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 14:42", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 16:28", tz="US/Pacific"),
    pd.Timestamp("2020-03-09 19:01", tz="US/Pacific")
]
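The snippet ends before `charge_max` is exercised; here is a minimal sketch combining it with the `index` and `departures` defined above (the arrival time and the 6.6 kW limit are assumptions):

arrival = pd.Timestamp("2020-03-09 08:05", tz="US/Pacific")  # assumed arrival
profile = charge_max(index, arrival, departures[0], limit_kw=6.6)
print(profile.sum() * 0.25, "kWh deliverable at full power")  # 15-min slots -> hours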
Example #26
    def create_dataset(self, len_closeness=3, len_period=3, PeriodInterval=1, len_trend=3, TrendInterval=7, len_y=3):
        """current version
        """
        # offset_week = pd.DateOffset(days=7)
        offset_frame = pd.DateOffset(minutes=24 * 60 // self.T)
        XC = []
        XCS = [list() for _ in range(7)]  # seven closeness branches, one per day back
        XP = []
        XT = []
        XCY = []
        Y = []
        timestamps_Y = []
        cnm = 0
        depends = [[-6 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   [-5 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   [-4 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   [-3 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   [-2 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   [-1 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   [-0 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]],
                   # [j for j in range(len_y)],
                   [0]]

        i = max(self.T * TrendInterval * len_trend, self.T * PeriodInterval * len_period, len_closeness)

        while i < (len(self.pd_timestamps) - (len_y - 1)):
            # Skip windows where any required frame is missing
            Flag = True
            for depend in depends:
                if Flag is False:
                    break
                Flag = self.check_it([self.pd_timestamps[i] + j * offset_frame for j in depend])

            if Flag is False:
                i += 1
                continue
            x_c6 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[0]]
            x_c5 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[1]]
            x_c4 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[2]]
            x_c3 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[3]]
            x_c2 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[4]]
            x_c1 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[5]]
            x_c0 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[6]]

            # x_c_y = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in a]


            y = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[-1]]
            if len_closeness > 0:
                XCS[0].append(x_c6)
                XCS[1].append(x_c5)
                XCS[2].append(x_c4)
                XCS[3].append(x_c3)
                XCS[4].append(x_c2)
                XCS[5].append(x_c1)
                XCS[6].append(x_c0)
            if len_period > 0:
                XP.append((x_c2))
            if len_trend > 0:
                XT.append((x_c3))
            # if len_y > 0:
            #     XCY.append((x_c_y))
            Y.append(y)
            timestamps_Y.append(self.timestamps[i])
            i += 1
        XC = np.asarray(XC)
        XCS = [np.asarray(x) for x in XCS]
        XP = np.asarray(XP)
        XT = np.asarray(XT)
        XCY = np.asarray(XCY)
        Y = np.asarray(Y)
        print("STMatrix  XC shape: ", XC.shape, "XP shape: ", XP.shape, "XT shape: ", XT.shape,
              "XCY shape: ", XCY.shape, "Y shape:", Y.shape)
        return XCS, XP, XT, XCY, Y, timestamps_Y
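To make the `depends` offsets above concrete, a standalone sketch with assumed parameters (T=48 frames per day, PeriodInterval=1, len_closeness=3, cnm=0):

T, PeriodInterval, len_closeness, cnm = 48, 1, 3, 0
depends = [[-d * PeriodInterval * T - j for j in range(cnm, len_closeness + 1)[::-1]]
           for d in range(6, -1, -1)] + [[0]]
print(depends[0])   # same-time window six days back: [-291, -290, -289, -288]
print(depends[-2])  # most recent closeness window:   [-3, -2, -1, 0]
print(depends[-1])  # the target frame itself:        [0]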
Example #27
    def __init__(self, return_calculator=None, options=None):
        """A portfolio selector for bond indexes

        """

        if options is None:
            options = {}
        self.options = options
        self.return_calculator = return_calculator
        self.unnalocated_symbol = 'UNNALOCATED_SYMBOL'
        self.concentration_rules = dict()
        # Read the options, falling back to defaults where a key is absent
        self.max_maturity = options.get('MAX_MATURITY', pd.DateOffset(years=200))
        self.min_maturity = options.get('MIN_MATURITY', pd.DateOffset(days=0))
        # Bonds with maturity closer than 'MIN_ELIGIBLE_MATURITY' will not enter
        # the portfolio if they are not already in.
        self.min_eligible_maturity = options.get('MIN_ELIGIBLE_MATURITY')
        self.sell_defaulted = options.get('SELL_DEFAULTED', False)
        self.corp_min_outstanding = options.get('CORPORATE_MIN_OUTSTANDING', 0)
        self.govt_min_outstanding = options.get('GOVT_MIN_OUTSTANDING', 0)
        self.corp_weight = options.get('CORP_WEIGHT', 0.8)
        self.govt_weight = options.get('GOVT_WEIGHT', 1 - self.corp_weight)
        self.max_tolerance_among_portfolios = options.get(
            'MAX_TOLERANCE_AMONG_PORTFOLIOS', 0.0)
        self.max_tolerance_among_securities = options.get(
            'MAX_TOLERANCE_AMONG_SECURITIES', 0.0)
        # Now checking the concentration rules: a key such as
        # 'CONCENTRATION_RULES_A_B' becomes the nested entry {'A': {'B': value}}
        if any('CONCENTRATION_RULES' in x for x in options.keys()):
            for key, value in options.items():
                if 'CONCENTRATION_RULES' not in key:
                    continue
                # Remove the first two words, which are CONCENTRATION and RULES
                arguments = key.split('_')[2:]
                current_dict = self.concentration_rules
                for arg in arguments[:-1]:
                    try:
                        converted_arg = float(arg)
                    except ValueError:
                        converted_arg = arg
                    current_dict = current_dict.setdefault(converted_arg, dict())
                last_arg = arguments[-1]
                try:
                    last_arg = float(last_arg)
                except ValueError:
                    pass
                current_dict[last_arg] = value

        print('Successfully instantiated BondIndex class with options:')
        print('max maturity: {}'.format(self.max_maturity))
        print('min maturity: {}'.format(self.min_maturity))
        print('min eligible maturity: {}'.format(self.min_eligible_maturity))
        print('sell defaulted: {}'.format(self.sell_defaulted))
        print('min govt outstanding: {}'.format(self.govt_min_outstanding))
        print('corp weight: {}'.format(self.corp_weight))
        print('govt weight: {}'.format(self.govt_weight))
        print('rebalancing tolerance between portfolios: {}'.format(
            self.max_tolerance_among_portfolios))
        print('rebalancing tolerance between securities: {}'.format(
            self.max_tolerance_among_securities))
        print('Concentration rules:')
        print(self.concentration_rules)
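An illustration of the key-to-nested-dict mapping performed above, assuming the class is named BondIndex as its startup message suggests; the rule names and values are hypothetical:

options = {'CONCENTRATION_RULES_SECTOR_FINANCIALS': 0.15,  # hypothetical rule
           'CONCENTRATION_RULES_ISSUER': 0.05}             # hypothetical rule
selector = BondIndex(return_calculator=None, options=options)
print(selector.concentration_rules)
# -> {'SECTOR': {'FINANCIALS': 0.15}, 'ISSUER': 0.05}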
Example #28
    str_to_convert = "%.6f" % as_float

    # have to do this because of leap seconds
    time_string, dot, microseconds = str_to_convert.partition(".")
    utc_time_tuple = time.strptime(str_to_convert, LONG_DATE_FORMAT)
    as_datetime = datetime.datetime(1970, 1, 1) + datetime.timedelta(
        seconds=calendar.timegm(utc_time_tuple))
    as_datetime = as_datetime.replace(
        microsecond=datetime.datetime.strptime(microseconds, "%f").microsecond)

    return as_datetime


NOTIONAL_CLOSING_TIME = dict(hours=23, minutes=0, seconds=0)
NOTIONAL_CLOSING_TIME_AS_PD_OFFSET = pd.DateOffset(
    hours=NOTIONAL_CLOSING_TIME['hours'],
    minutes=NOTIONAL_CLOSING_TIME['minutes'],
    seconds=NOTIONAL_CLOSING_TIME['seconds'])
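As a quick illustration of the pattern the function below relies on, adding an hours-only pd.DateOffset to a bare date stamps it with the notional closing time:

import datetime
import pandas as pd

close = pd.DateOffset(hours=23, minutes=0, seconds=0)
print(datetime.date(2020, 3, 9) + close)  # 2020-03-09 23:00:00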


def adjust_timestamp_to_include_notional_close_and_time_offset(
    timestamp: datetime.datetime,
    actual_close: pd.DateOffset = NOTIONAL_CLOSING_TIME_AS_PD_OFFSET,
    original_close: pd.DateOffset = pd.DateOffset(hours=23,
                                                  minutes=0,
                                                  seconds=0),
    time_offset: pd.DateOffset = pd.DateOffset(hours=0),
) -> datetime.datetime:

    if timestamp.hour == 0 and timestamp.minute == 0 and timestamp.second == 0:
        new_datetime = timestamp.date() + actual_close
    elif time_matches(timestamp, original_close):
Example #29
    def predict_future(self, days=30):
        
        # Use past self.training_years years for training
        train = self.stock[self.stock['Date'] > (max(self.stock['Date']) - pd.DateOffset(years=self.training_years)).date()]
        model = self.create_model()
        model.fit(train)
        # Future dataframe with specified number of days to predict
        future = model.make_future_dataframe(periods=days, freq='D')
        future = model.predict(future)
        # Only concerned with future dates
        future = future[future['ds'] >= max(self.stock['Date']).date()]
        # Remove the weekends
        future = self.remove_weekends(future)
        # Calculate whether increase or not
        future['diff'] = future['yhat'].diff()
        future = future.dropna()
        # Find the prediction direction and create separate dataframes
        future['direction'] = (future['diff'] > 0) * 1
        # Rename the columns for presentation
        future = future.rename(columns={'ds': 'Date', 'yhat': 'estimate', 'diff': 'change', 
                                        'yhat_upper': 'upper', 'yhat_lower': 'lower'})
        future_increase = future[future['direction'] == 1]
        future_decrease = future[future['direction'] == 0]
        print('\nPredicted Increase: \n')
        print(future_increase[['Date', 'estimate', 'change', 'upper', 'lower']])
        print('\nPredicted Decrease: \n')
        print(future_decrease[['Date', 'estimate', 'change', 'upper', 'lower']])
        
        self.reset_plot()
        
        # Set up plot
        plt.style.use('fivethirtyeight')
        matplotlib.rcParams['axes.labelsize'] = 10
        matplotlib.rcParams['xtick.labelsize'] = 8
        matplotlib.rcParams['ytick.labelsize'] = 8
        matplotlib.rcParams['axes.titlesize'] = 12
        
        # Plot the predictions and indicate if increase or decrease
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))

        # Plot the estimates
        ax.plot(future_increase['Date'], future_increase['estimate'], 'g^', ms=12, label='Pred. Increase')
        ax.plot(future_decrease['Date'], future_decrease['estimate'], 'rv', ms=12, label='Pred. Decrease')

        # Plot errorbars
        ax.errorbar(future['Date'].dt.to_pydatetime(), future['estimate'],
                    yerr=future['upper'] - future['lower'],
                    capthick=1.4, color='k', linewidth=2,
                    ecolor='darkblue', capsize=4, elinewidth=1, label='Pred with Range')

        # Plot formatting
        plt.legend(loc=2, prop={'size': 10})
        plt.xticks(rotation=45)
        plt.ylabel('Predicted Stock Price (US $)')
        plt.xlabel('Date')
        plt.title('Predictions for %s' % self.symbol)
        plt.show()
        return future
        

        
        # The code below is unreachable from predict_future; it appears to come
        # from a separate changepoint-prior-scale validation routine that expects
        # a `results` dataframe with 'cps', error, and range columns.

        # Plot of training and testing average errors
        self.reset_plot()
        
        plt.plot(results['cps'], results['train_err'], 'bo-', ms=8, label='Train Error')
        plt.plot(results['cps'], results['test_err'], 'r*-', ms=8, label='Test Error')
        plt.xlabel('Changepoint Prior Scale')
        plt.ylabel('Avg. Absolute Error ($)')
        plt.title('Training and Testing Curves as Function of CPS')
        plt.grid(color='k', alpha=0.3)
        plt.xticks(results['cps'], results['cps'])
        plt.legend(prop={'size': 10})
        plt.show()

        # Plot of training and testing average uncertainty
        self.reset_plot()

        plt.plot(results['cps'], results['train_range'], 'bo-', ms=8, label='Train Range')
        plt.plot(results['cps'], results['test_range'], 'r*-', ms=8, label='Test Range')
        plt.xlabel('Changepoint Prior Scale')
        plt.ylabel('Avg. Uncertainty ($)')
        plt.title('Uncertainty in Estimate as Function of CPS')
        plt.grid(color='k', alpha=0.3)
        plt.xticks(results['cps'], results['cps'])
        plt.legend(prop={'size': 10})
        plt.show()
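A hypothetical call for the method above, assuming the containing class is a Stocker-style wrapper constructed from a ticker symbol (both names are assumptions):

stocker = Stocker('MSFT')                 # hypothetical constructor
future = stocker.predict_future(days=30)  # prints the increase/decrease tables
print(future[['Date', 'estimate', 'upper', 'lower']].tail())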
Example #30
sam3['EArray_norm'] = sam3['Subarray 1 DC power gross | (kW)'] / norm_factor_SAM3


# In[31]:


# Add prefixes, update YEAR to the same value, and merge

sam1foo = sam1.add_prefix('sam_')
sam2foo = sam2.add_prefix('sam_')
sam3foo = sam3.add_prefix('sam_')
pvsyst1foo = pvsyst1.add_prefix('pvsyst_')
pvsyst2foo = pvsyst2.add_prefix('pvsyst_')
pvsyst3foo = pvsyst3.add_prefix('pvsyst_')

# Note: pd.DateOffset(year=2020) (singular 'year') replaces the year component
# rather than adding an offset, aligning the PVsyst index with the SAM year
pvsyst1foo.index = pvsyst1foo.index + pd.DateOffset(year=2020)
pvsyst2foo.index = pvsyst2foo.index + pd.DateOffset(year=2020)
pvsyst3foo.index = pvsyst3foo.index + pd.DateOffset(year=2020)

case1 = pd.concat([sam1foo, pvsyst1foo], axis=1)
case2 = pd.concat([sam2foo, pvsyst2foo], axis=1)
case3 = pd.concat([sam3foo, pvsyst3foo], axis=1)


# ### Input irradiance comparison
# 
# It is recommended to check days just before and after the daylight-saving time shift (just in case), and also dates that can and CANNOT be confused when day and month are swapped (MM/DD vs. DD/MM); a quick check is sketched below.
# 
# PVsyst files are saved as DD/MM, SAM files as MM/DD.
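
# A minimal sketch of such a check, assuming the merged `case1` frame from above: compare daily totals on an ambiguous date (day <= 12) against an unambiguous one (day > 12). The probe dates and the prefix-based column selection are assumptions.

# In[ ]:


amb = case1.loc['2020-03-04']  # ambiguous: could have been read as April 3
una = case1.loc['2020-03-13']  # unambiguous: 13 cannot be a month
print(amb.filter(like='sam_').sum().sum(), amb.filter(like='pvsyst_').sum().sum())
print(una.filter(like='sam_').sum().sum(), una.filter(like='pvsyst_').sum().sum())
# Agreement on the unambiguous day but divergence on the ambiguous one
# suggests one source was parsed with day and month swapped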

# In[32]: