import numpy as np
import pandas as pd
import pandas.tseries.holiday
from dateutil.relativedelta import MO, TH


def make_time_features(ts, index=None, epoch=None, epoch_span=None):
    """Project datetimes into vector space for use in machine learning models.

    Outputs:
        - projection onto the unit circle of
            - second of day
            - day of week
            - day of year
        - seconds since `epoch`, normalized by `epoch_span`
        - binary workday indicator (i.e., Monday-Friday except major US holidays)

    :param ts: timestamp(s) to process
    :type ts: datetime.datetime or iterable thereof
    :param index: index of ts (e.g., if from a larger dataframe)
    :param epoch: start of time reckoning
    :param epoch_span: length of time reckoning
    :rtype: pd.DataFrame
    :returns: various projections of datetimes into vector space
    """
    # input validation
    try:
        if len(ts) == 1:
            _singleton = True
        elif len(ts) > 1:
            _singleton = False
        elif len(ts) < 1:
            raise ValueError("must pass non-empty iterable of timestamps")
    except TypeError:
        # a bare datetime was passed; wrap it and recurse
        return make_time_features([ts], index=index, epoch=epoch, epoch_span=epoch_span)

    if not isinstance(ts, pd.DatetimeIndex):
        ts = pd.Series(0, index=ts).index
    if not isinstance(ts, pd.DatetimeIndex):
        raise ValueError("must pass non-empty iterable of timestamps")
    if index is None:
        index = pd.RangeIndex(len(ts))
    if epoch is None:
        epoch = min(ts)
    if epoch_span is None:
        epoch_span = float((max(ts) - epoch).total_seconds())

    time_features = {}
    start = min(ts)
    end = max(ts)

    # Major US holidays (Memorial Day is the last Monday of May, so it is anchored at
    # May 31 as in pandas' own USMemorialDay rule)
    NewYearsDay = pd.tseries.holiday.Holiday("New Years Day", month=1, day=1)
    MemorialDay = pd.tseries.holiday.Holiday("Memorial Day", month=5, day=31,
                                             offset=pd.DateOffset(weekday=MO(-1)))
    IndependenceDay = pd.tseries.holiday.Holiday("Independence Day", month=7, day=4)
    LaborDay = pd.tseries.holiday.Holiday("Labor Day", month=9, day=1,
                                          offset=pd.DateOffset(weekday=MO(1)))
    ThanksgivingDay = pd.tseries.holiday.Holiday("Thanksgiving Day", month=11, day=1,
                                                 offset=pd.DateOffset(weekday=TH(4)))
    ChristmasDay = pd.tseries.holiday.Holiday("Christmas Day", month=12, day=25)
    holidays = (
        NewYearsDay.dates(start.date(), end.date()).tolist()
        + MemorialDay.dates(start.date(), end.date()).tolist()
        + IndependenceDay.dates(start.date(), end.date()).tolist()
        + LaborDay.dates(start.date(), end.date()).tolist()
        + ThanksgivingDay.dates(start.date(), end.date()).tolist()
        + ChristmasDay.dates(start.date(), end.date()).tolist()
    )
    holidays = set([h.date() for h in holidays])

    # projections onto unit circle
    seconds_of_day = ts.hour * 3600 + ts.minute * 60 + ts.second
    time_features["day_cos"] = np.cos(seconds_of_day * 2 * np.pi / 86400.0)
    time_features["day_sin"] = np.sin(seconds_of_day * 2 * np.pi / 86400.0)
    time_features["week_cos"] = np.cos(ts.dayofweek * 2 * np.pi / 7.0)
    time_features["week_sin"] = np.sin(ts.dayofweek * 2 * np.pi / 7.0)
    time_features["year_cos"] = np.cos(ts.dayofyear * 2 * np.pi / 365.0)
    time_features["year_sin"] = np.sin(ts.dayofyear * 2 * np.pi / 365.0)

    # linear march through time
    time_features["epoch"] = (ts - epoch).total_seconds() / epoch_span

    # workday indicator
    time_features["workday"] = [int(weekday < 5 and date not in holidays)
                                for weekday, date in zip(ts.weekday, ts.date)]

    if _singleton:
        return {k: v[0] for k, v in time_features.items()}
    else:
        return pd.DataFrame(time_features, index=index)
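# Hedged usage sketch (not from the original source): one way the feature builder above
# could be exercised on an hourly DatetimeIndex. The column names match those built in
# time_features; the date range itself is illustrative.
hourly = pd.date_range("2021-01-01", periods=48, freq="H")
feats = make_time_features(hourly)
print(feats[["day_sin", "day_cos", "week_sin", "week_cos", "epoch", "workday"]].head())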
'/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/otras_variables/mejores_var_' + pp + '_hum_2m.csv') else: recoleccion_minim_1 = pd.read_csv( '/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/otras_variables/mejores_var_' + pp + varia_1) base_h = [] estacion_sintmp = [] estacion_sintmp_col = [] dist_menor_3 = [] dist_menor_10 = [] fecha_min_rec = [] estaciones_aut = pd.DataFrame({'cod': [], 'inicio': [], 'fin': []}) step = pd.DateOffset(hours=1) for j in recoleccion_minim_1.cod_1: #[21206990.0]:# print(j) min_2 = [] max_2 = [] os.chdir( '/media/edwin/6F71AD994355D30E/Edwin/Maestría Meteorologia/Tesis/datos_ideam/validados_col_col/' ) valores = os.listdir() if 'v_' + str(j)[0:-2] + varia_1 not in valores: estacion_sintmp.append('v_' + str(j)[0:-2] + varia_1) continue base_validada = pd.read_csv('v_' + str(j)[0:-2] + varia_1)
def evaluate_prediction(self, start_date=None, end_date=None, nshares = None, months = 6): # Default start date is one year before end of data # Default end date is end date of data if start_date is None: start_date = self.max_date - pd.DateOffset(months = months) if start_date < self.min_date: start_date = self.min_date + 0.5*(self.max_date-self.min_date) if end_date is None: end_date = self.max_date start_date, end_date = self.handle_dates(start_date, end_date) # Training data starts self.training_years years before start date and goes up to start date train = self.stock[(self.stock['Date'] < start_date.date()) & (self.stock['Date'] > (start_date - pd.DateOffset(years=self.training_years)).date())] # Testing data is specified in the range test = self.stock[(self.stock['Date'] >= start_date.date()) & (self.stock['Date'] <= end_date.date())] # Create and train the model model = self.create_model() model.fit(train) # Make a future dataframe and predictions future = model.make_future_dataframe(periods = 365, freq='D') future = model.predict(future) # Merge predictions with the known values test = pd.merge(test, future, on = 'ds', how = 'inner') train = pd.merge(train, future, on = 'ds', how = 'inner') # Calculate the differences between consecutive measurements test['pred_diff'] = test['yhat'].diff() test['real_diff'] = test['y'].diff() # Correct is when we predicted the correct direction test['correct'] = (np.sign(test['pred_diff']) == np.sign(test['real_diff'])) * 1 # Accuracy when we predict increase and decrease increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct']) decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct']) # Calculate mean absolute error test_errors = abs(test['y'] - test['yhat']) test_mean_error = np.mean(test_errors) train_errors = abs(train['y'] - train['yhat']) train_mean_error = np.mean(train_errors) # Calculate percentage of time actual value within prediction range test['in_range'] = False for i in test.index: if (test.ix[i, 'y'] < test.ix[i, 'yhat_upper']) & (test.ix[i, 'y'] > test.ix[i, 'yhat_lower']): test.ix[i, 'in_range'] = True in_range_accuracy = 100 * np.mean(test['in_range']) if not nshares: # Date range of predictions print('\nPrediction Range: {} to {}.'.format(start_date.date(), end_date.date())) # Final prediction vs actual value print('\nPredicted price on {} = ${:.2f}.'.format(max(future['ds']).date(), future.ix[len(future) - 1, 'yhat'])) print('Actual price on {} = ${:.2f}.\n'.format(max(test['ds']).date(), test.ix[len(test) - 1, 'y'])) print('Average Absolute Error on Training Data = ${:.2f}.'.format(train_mean_error)) print('Average Absolute Error on Testing Data = ${:.2f}.\n'.format(test_mean_error)) # Direction accuracy print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy)) print('When the model predicted a decrease, the price decreased {:.2f}% of the time.\n'.format(decrease_accuracy)) print('The actual value was within the {:d}% confidence interval {:.2f}% of the time.'.format(int(100 * model.interval_width), in_range_accuracy)) # Reset the plot self.reset_plot() # Set up the plot fig, ax = plt.subplots(1, 1) # Plot the actual values ax.plot(train['ds'], train['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations') ax.plot(test['ds'], test['y'], 'ko-', linewidth = 1.4, alpha = 0.8, ms = 1.8, label = 'Observations') # Plot the predicted values ax.plot(future['ds'], future['yhat'], 'navy', linewidth = 2.4, label = 'Predicted'); # 
Plot the uncertainty interval as ribbon ax.fill_between(future['ds'].dt.to_pydatetime(), future['yhat_upper'], future['yhat_lower'], alpha = 0.6, facecolor = 'gold', edgecolor = 'k', linewidth = 1.4, label = 'Confidence Interval') # Put a vertical line at the start of predictions plt.vlines(x=min(test['ds']).date(), ymin=min(future['yhat_lower']), ymax=max(future['yhat_upper']), colors = 'r', linestyles='dashed', label = 'Prediction Start') # Plot formatting plt.legend(loc = 2, prop={'size': 8}); plt.xlabel('Date'); plt.ylabel('Price $'); plt.grid(linewidth=0.6, alpha = 0.6) plt.title('{} Model Evaluation from {} to {}.'.format(self.symbol, start_date.date(), end_date.date())); plt.show(); # If a number of shares is specified, play the game elif nshares: # Only playing the stocks when we predict the stock will increase test_pred_increase = test[test['pred_diff'] > 0] test_pred_increase.reset_index(inplace=True) prediction_profit = [] # Iterate through all the predictions and calculate profit from playing for i, correct in enumerate(test_pred_increase['correct']): # If we predicted up and the price goes up, we gain the difference if correct == 1: prediction_profit.append(nshares * test_pred_increase.ix[i, 'real_diff']) # If we predicted up and the price goes down, we lose the difference else: prediction_profit.append(nshares * test_pred_increase.ix[i, 'real_diff']) test_pred_increase['pred_profit'] = prediction_profit # Put the profit into the test dataframe test = pd.merge(test, test_pred_increase[['ds', 'pred_profit']], on = 'ds', how = 'left') test.ix[0, 'pred_profit'] = 0 # Profit for either method at all dates test['pred_profit'] = test['pred_profit'].cumsum().ffill() test['hold_profit'] = nshares * (test['y'] - float(test.ix[0, 'y'])) # Display information print('You played the stock market in {} from {} to {} with {} shares.\n'.format( self.symbol, start_date.date(), end_date.date(), nshares)) print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy)) print('When the model predicted a decrease, the price decreased {:.2f}% of the time.\n'.format(decrease_accuracy)) # Display some friendly information about the perils of playing the stock market print('The total profit using the Prophet model = ${:.2f}.'.format(np.sum(prediction_profit))) print('The Buy and Hold strategy profit = ${:.2f}.'.format(float(test.ix[len(test) - 1, 'hold_profit']))) print('\nThanks for playing the stock market!\n') # Plot the predicted and actual profits over time self.reset_plot() # Final profit and final smart used for locating text final_profit = test.ix[len(test) - 1, 'pred_profit'] final_smart = test.ix[len(test) - 1, 'hold_profit'] # text location last_date = test.ix[len(test) - 1, 'ds'] text_location = (last_date - pd.DateOffset(months = 1)).date() plt.style.use('dark_background') # Plot smart profits plt.plot(test['ds'], test['hold_profit'], 'b', linewidth = 1.8, label = 'Buy and Hold Strategy') # Plot prediction profits plt.plot(test['ds'], test['pred_profit'], color = 'g' if final_profit > 0 else 'r', linewidth = 1.8, label = 'Prediction Strategy') # Display final values on graph plt.text(x = text_location, y = final_profit + (final_profit / 40), s = '$%d' % final_profit, color = 'g' if final_profit > 0 else 'r', size = 18) plt.text(x = text_location, y = final_smart + (final_smart / 40), s = '$%d' % final_smart, color = 'g' if final_smart > 0 else 'r', size = 18); # Plot formatting plt.ylabel('Profit (US $)'); plt.xlabel('Date'); 
plt.title('Predicted versus Buy and Hold Profits'); plt.legend(loc = 2, prop={'size': 10}); plt.grid(alpha=0.2); plt.show()
def add_missing_row( df: pd.DataFrame, id_cols: List[str], reference_col: str, complete_index: Union[Dict[str, str], List[str]] = None, method: str = None, cols_to_keep: List[str] = None ) -> pd.DataFrame: """ Add missing row to a df base on a reference column --- ### Parameters *mandatory :* - `id_cols` (*list of str*): names of the columns used to create each group - `reference_col` (*str*): name of the column used to identify missing rows *optional :* - `complete_index` (*list* or *dict*): [A, B, C] a list of values used to add missing rows. It can also be a dict to declare a date range. By default, use all values of reference_col. - `method` (*str*): by default all missing rows are added. The possible values are : - `"between"` : add missing rows having their value between min and max values for each group, - `"between_and_after"` : add missing rows having their value bigger than min value for each group. - `"between_and_before"` : add missing rows having their value smaller than max values for each group. - `cols_to_keep` (*list of str*): name of other columns to keep, linked to the reference_col. --- ### Example **Input** YEAR | MONTH | NAME :---:|:---:|:--: 2017|1|A 2017|2|A 2017|3|A 2017|1|B 2017|3|B ```cson add_missing_row: id_cols: ['NAME'] reference_col: 'MONTH' ``` **Output** YEAR | MONTH | NAME :---:|:---:|:--: 2017|1|A 2017|2|A 2017|3|A 2017|1|B 2017|2|B 2017|3|B """ if cols_to_keep is None: cols_for_index = [reference_col] else: cols_for_index = [reference_col] + cols_to_keep check_params_columns_duplicate(id_cols + cols_for_index) if method == 'between' or method == 'between_and_after': df['start'] = df.groupby(id_cols)[reference_col].transform(min) id_cols += ['start'] if method == 'between' or method == 'between_and_before': df['end'] = df.groupby(id_cols)[reference_col].transform(max) id_cols += ['end'] names = id_cols + cols_for_index new_df = df.set_index(names) index_values = df.groupby(id_cols).sum().index.values if complete_index is None: complete_index = df.groupby(cols_for_index).sum().index.values elif isinstance(complete_index, dict): if complete_index['type'] == 'date': freq = complete_index['freq'] date_format = complete_index['format'] start = complete_index['start'] end = complete_index['end'] if isinstance(freq, dict): freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()}) complete_index = pd.date_range(start=start, end=end, freq=freq) complete_index = complete_index.strftime(date_format) else: raise ParamsValueError(f'Unknown complete index type: ' f'{complete_index["type"]}') if not isinstance(index_values[0], tuple): index_values = [(x,) for x in index_values] if not isinstance(complete_index[0], tuple): complete_index = [(x,) for x in complete_index] new_tuples_index = [x + y for x in index_values for y in complete_index] new_index = pd.MultiIndex.from_tuples(new_tuples_index, names=names) new_df = new_df.reindex(new_index).reset_index() if method == 'between' or method == 'between_and_after': new_df = new_df[new_df[reference_col] >= new_df['start']] del new_df['start'] if method == 'between' or method == 'between_and_before': new_df = new_df[new_df[reference_col] <= new_df['end']] del new_df['end'] return new_df
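# Hedged, runnable restatement of the docstring example above (assumes add_missing_row and
# its check_params_columns_duplicate helper are importable from this module):
df = pd.DataFrame({'YEAR': [2017] * 5,
                   'MONTH': [1, 2, 3, 1, 3],
                   'NAME': ['A', 'A', 'A', 'B', 'B']})
completed = add_missing_row(df, id_cols=['NAME'], reference_col='MONTH')
# `completed` now also contains the previously missing (NAME='B', MONTH=2) row.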
def get_intraday_history(self, symbol, from_date=None, to_date=None):
    """
    Returns the historical quotes of the specified ticker, narrowed by date.

    Parameters
    ----------
    symbol : str
        The name of the symbol used to retrieve the information.
    from_date : datetime
        The start date (Argentina Time Zone) used to filter the information.
    to_date : datetime
        The end date (Argentina Time Zone) used to filter the information.

    Raises
    ------
    pyhomebroker.exceptions.SessionException
        If the user is not logged in.
    requests.exceptions.HTTPError
        There is a problem related to the HTTP request.
    """
    if not self._auth.is_user_logged_in:
        raise SessionException('User is not logged in')

    headers = {
        'User-Agent': user_agent,
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    if from_date is None:
        from_date = datetime.date.today()
    if to_date is None:
        to_date = from_date + datetime.timedelta(days=1)

    # apply the instance's hour offset before building the request window
    from_date = from_date + datetime.timedelta(seconds=self.__hours * 3600)
    to_date = to_date + datetime.timedelta(seconds=self.__hours * 3600)

    url = '{}/Intradiario/history?symbol={}&resolution=1&from={}&to={}'.format(
        self._auth.broker['page'],
        symbol.upper(),
        self.__convert_datetime_to_epoch(from_date),
        self.__convert_datetime_to_epoch(to_date))

    resp = rq.get(url, headers=headers, cookies=self._auth.cookies, proxies=self._proxies)
    resp.raise_for_status()
    resp = resp.json()

    df = pd.DataFrame({
        'date': resp['t'],
        'open': resp['o'],
        'high': resp['h'],
        'low': resp['l'],
        'close': resp['c'],
        'volume': resp['v']
    })

    # convert epoch seconds back to datetimes and undo the hour offset
    df.date = pd.to_datetime(
        df.date, unit='s') - pd.DateOffset(seconds=self.__hours * 3600)
    df.volume = df.volume.astype(int)

    return df
period_df_nona_clean, on='User_id') cycle_table_filter = cycle_table[(cycle_table.start_date_clean_y > cycle_table.end_date_clean_x)] # keep only the most relevant possible cycle cycle_table_filter_2 = cycle_table_filter\ .sort_values(by=['start_date_clean_x', 'start_date_clean_y'])\ .drop_duplicates(subset=['User_id', 'start_date_clean_x'], keep='first') cycle_table_filter_2 = cycle_table_filter_2.assign( cycle_length=cycle_table_filter_2.apply( lambda x: (x['start_date_clean_y'] - x['start_date_clean_x']).days, axis=1)) cycle_table_filter_2 = cycle_table_filter_2.assign( end_cycle=cycle_table_filter_2.apply( lambda x: x['start_date_clean_y'] - pd.DateOffset(1) if x['cycle_length'] < 40 else pd.NaT, axis=1)) cols_to_keep = [ 'User_id', 'start_date_x', 'end_date_x', 'start_date_clean_x', 'end_date_clean_x', 'cycle_length', 'end_cycle' ] rich_period_df = pd.merge(period_df, cycle_table_filter_2[cols_to_keep], left_on=['User_id', 'start_date'], right_on=['User_id', 'start_date_x'], how='left') rich_period_df = rich_period_df.drop(columns=['start_date_x', 'end_date_x']) new_col_names = [c.replace('_x', '') for c in rich_period_df.columns]
frequency='1d', field=field, data_frequency='daily') # ### View Data # Let's get returns data for our risk model using the `get_pricing` function. For this model, we'll be looking back to 5 years of data. # In[14]: five_year_returns = get_pricing( data_portal, trading_calendar, universe_tickers, universe_end_date - pd.DateOffset(years=5), universe_end_date)\ .pct_change()[1:].fillna(0) five_year_returns # # Statistical Risk Model # It's time to build the risk model. You'll be creating a statistical risk model using PCA. So, the first thing is building the PCA model. # ## Fit PCA # Implement `fit_pca` to fit a PCA model to the returns data # In[18]: from sklearn.decomposition import PCA
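# Hedged sketch of a fit_pca helper matching the instruction above; the parameter names
# (num_factor_exposures, svd_solver) are assumptions, not taken from the notebook.
def fit_pca(returns, num_factor_exposures, svd_solver):
    """Fit a scikit-learn PCA model to a DataFrame of daily asset returns."""
    pca = PCA(n_components=num_factor_exposures, svd_solver=svd_solver)
    pca.fit(returns)
    return pca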
def create_dataset(self, nwps, data_path, start_index=9001, test=False): self.data['dayweek'] = self.data.index.dayofweek self.data['month'] = self.data.index.month self.data['hour'] = self.data.index.hour self.data['sp_index'] = [self.sp_index(d) for d in self.data.index] dataset = pd.DataFrame() target = pd.Series(name='target') dataset_3d = np.array([]) nwps_lstm = nwps.copy(deep=True) for var in self.variables: if var == 'WS': var = 'wind' elif var == 'WD': var = 'direction' elif var == 'Temperature': var = 'Temp' cols = [ col for col in nwps.columns if str.lower(var) in str.lower(col) ] nwps_lstm[str.lower(var)] = nwps_lstm[cols].mean(axis=1).values lags1 = np.hstack([ np.arange(24, 52), np.arange(71, 75), 96, 120, 144, np.arange(166, 175), 192, ]) lags2 = np.hstack([np.arange(8735, 8741), 8760, 8736 + 168]) lags_days = np.arange(1, 8) for date in self.data.index[start_index:]: date_inp1 = [date - pd.DateOffset(hours=int(l)) for l in lags1] date_inp2 = [date - pd.DateOffset(hours=int(l)) for l in lags2] date_days = [date - pd.DateOffset(days=int(l)) for l in lags_days] try: temp_max = nwps[['Temp_max']].loc[date].values var_imp = np.hstack( (temp_max, self.data[['hour', 'month', 'sp_index', 'dayweek']].loc[date].values, nwps.drop(columns=['Temp_max']).loc[date].values, np.power(self.data['month'].loc[date] * temp_max / 12, 3), np.power(self.data['sp_index'].loc[date] * temp_max / 100, 3))) col = ['Temp', 'hour', 'month', 'sp_index', 'dayweek' ] + nwps.drop(columns=['Temp_max']).columns.tolist() + [ 'Temp_month', 'Temp_sp_days' ] var_unimp = np.hstack(( self.data.loc[date_inp1, 'SCADA'].values, self.data.loc[date_inp2, 'SCADA'].values, self.data.loc[date_inp1, 'APE_net'].values, self.data.loc[date_inp1, 'SCADA'].values + self.data.loc[date_inp1, 'APE_net'].values, nwps.loc[date_days, 'Temp_max'].values, nwps.loc[date_days, 'Temp_min'].values, )) col += ['SCADA_' + str(i) for i in range(45)] col += ['SCADA_' + str(i) for i in range(45, 53)] col += ['APE_' + str(i) for i in range(45)] col += ['TOTAL_' + str(i) for i in range(45)] col += ['Temp_max_' + str(i) for i in range(7)] col += ['Temp_min_' + str(i) for i in range(7)] temp_max = nwps[['Temp_max']].loc[date].values var_3d = np.hstack( (np.array([0]), self.data.loc[date, 'APE_net'], self.data.loc[date, 'APE_net'] + self.data.loc[date, 'SCADA'], nwps_lstm[[ 'cloud', 'wind', 'direction', 'Temp_max', 'Temp_min', 'Temp_athens', 'Temp_thessaloniki', 'Temp_ioannina', 'Temp_larissa', 'Temp_patra' ]].loc[date].values, self.data[['hour', 'month', 'sp_index', 'dayweek']].loc[date].values, np.power(self.data['month'].loc[date] * temp_max / 12, 3), np.power(self.data['sp_index'].loc[date] * temp_max / 100, 3))) for d in date_inp1: temp_max = nwps[['Temp_max']].loc[d].values v = np.hstack( (self.data.loc[d, 'SCADA'], self.data.loc[d, 'APE_net'], self.data.loc[d, 'APE_net'] + self.data.loc[d, 'SCADA'], nwps_lstm[[ 'cloud', 'wind', 'direction', 'Temp_max', 'Temp_min', 'Temp_athens', 'Temp_thessaloniki', 'Temp_ioannina', 'Temp_larissa', 'Temp_patra' ]].loc[d].values, self.data[['hour', 'month', 'sp_index', 'dayweek']].loc[d].values, np.power(self.data['month'].loc[d] * temp_max / 12, 3), np.power( self.data['sp_index'].loc[d] * temp_max / 100, 3))) var_3d = np.vstack((var_3d, v)) except: continue inp = np.hstack((var_imp, var_unimp)) inp1 = pd.Series(inp, index=col, name=date) targ1 = pd.Series(self.data['SCADA'].loc[date], index=[date], name='target1') if not inp1.isnull().any() and not targ1.isnull().any(): dataset = dataset.append(inp1) target = 
target.append(targ1) if dataset_3d.shape[0] == 0: dataset_3d = var_3d elif len(dataset_3d.shape) == 2: dataset_3d = np.stack((dataset_3d, var_3d)) else: dataset_3d = np.vstack( (dataset_3d, var_3d[np.newaxis, :, :])) if not test: corr = [] for f in range(dataset.shape[1]): corr.append( np.abs( np.corrcoef(dataset.values[:, f], target.values.ravel())[1, 0])) ind = np.argsort(np.array(corr))[::-1] columns = dataset.columns[ind] dataset = dataset[columns] joblib.dump( ind, os.path.join(data_path, 'dataset_columns_order.pickle')) else: ind = joblib.load( os.path.join(data_path, 'dataset_columns_order.pickle')) columns = dataset.columns[ind] dataset = dataset[columns] return dataset, target, dataset_3d
def append_feature(data, price):
    """Append a one-day-ahead row with the given price and the last known volume."""
    # assumes `data` has exactly two columns, the price column first and 'Adj. Volume' second
    next_index = data.index[-1] + pd.DateOffset(1)
    data.loc[next_index] = [price, data['Adj. Volume'][-1]]
    return data
first_new_transaction = new_merchant_transactions.groupby('card_id').agg({'month_lag' : 'min', 'purchase_date' : 'min'}).reset_index() first_new_transaction.columns = ['card_id', 'new_month_lag', 'new_purchase_date'] # In[ ]: # converting to datetime last_hist_transaction['hist_purchase_date'] = pd.to_datetime(last_hist_transaction['hist_purchase_date']) first_new_transaction['new_purchase_date'] = pd.to_datetime(first_new_transaction['new_purchase_date']) # In[ ]: # substracting month_lag for each row last_hist_transaction['observation_date'] = \ last_hist_transaction.apply(lambda x: x['hist_purchase_date'] - pd.DateOffset(months=x['hist_month_lag']), axis=1) first_new_transaction['observation_date'] = \ first_new_transaction.apply(lambda x: x['new_purchase_date'] - pd.DateOffset(months=x['new_month_lag']-1), axis=1) # At this point we just reversed month lag function to get a rought estimate of the `observation_date` to be used for specific `card_id`. As you can see below, the `observation_date` is already different for many cards! # In[ ]: last_hist_transaction.head(20) # In[ ]: first_new_transaction.head(20)
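# Hedged toy illustration of the month_lag reversal used above (values are made up):
# a purchase recorded at month_lag = -3 on 2018-02-15 maps back to an observation date
# three months later.
pd.Timestamp('2018-02-15') - pd.DateOffset(months=-3)   # Timestamp('2018-05-15 00:00:00')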
import numpy as np
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objects as go
from dash.dependencies import Input, Output

global_ndays_range = 20

# --- Start --- Reading base data for the Sunburst
industry_sentiment = pd.read_json('covidsm_agg_sentiment2_industry.json.zip', orient='records')
industry_sentiment['published_at_date'] = pd.to_datetime(
    industry_sentiment['published_at_date'], unit='ms')
global_start_day = industry_sentiment['published_at_date'].max(
) - pd.DateOffset(days=global_ndays_range)

industries_hrchy = pd.read_csv('industries-hrchy.csv')
industries_hrchy = industries_hrchy.replace(np.nan, '', regex=True)
# --- End --- Reading base data for the Sunburst

# --- Start --- Load base Sunburst (no data)
fig_layout = dict(margin=dict(t=0, l=0, r=0, b=0), width=800, height=850)
fig_ind = go.Figure(data=[
    go.Sunburst(ids=['total'],
                labels=['All Industries'],
                parents=[''],
                marker=dict(colors=[0], colorscale='RdBu', cmid=0),
                hovertemplate='<b>(%{id})</b> %{label} <br>- Sentiment score: %{color:.2f}')
], layout=fig_layout)
wget.download( 'https://snap.stanford.edu/data/loc-gowalla_edges.txt.gz', out=str(dataset_path / 'loc-gowalla_edges.txt.gz'), bar=print_progressbar) wget.download( 'https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz', out=str(dataset_path / 'loc-gowalla_totalCheckins.txt.gz'), bar=print_progressbar) gowalla_dataset = pd.read_csv( dataset_path / 'loc-gowalla_totalCheckins.txt.gz', sep='\t', names=['userId', 'timestamp', 'long', 'lat', 'loc_id']) gowalla_dataset['timestamp'] = pd.to_datetime(gowalla_dataset['timestamp']).dt.tz_localize(None) split_date = pd.to_datetime(config['SPLIT_DATE']) start_date = gowalla_dataset['timestamp'].min() \ if 'TRAIN_DAYS' not in config \ else pd.to_datetime(split_date - pd.DateOffset(days=config['TRAIN_DAYS'])) end_test_date = split_date + pd.DateOffset(days=config['TEST_DAYS']) end_date = pd.to_datetime( end_test_date + pd.DateOffset(days=config['VAL_DAYS']) if 'VAL_DAYS' in config else 0) timestamp_filter = (gowalla_dataset['timestamp'] >= start_date) & ( gowalla_dataset['timestamp'] <= end_date) gowalla_dataset = gowalla_dataset[timestamp_filter] gowalla_dataset.sort_values('timestamp', inplace=True) new_user_ids = {k: v for v, k in enumerate(gowalla_dataset['userId'].unique())} new_item_ids = {k: v for v, k in enumerate(gowalla_dataset['loc_id'].unique())} gowalla_dataset['userId'] = gowalla_dataset['userId'].map(new_user_ids) gowalla_dataset['loc_id'] = gowalla_dataset['loc_id'].map(new_item_ids)
def _week_to_date(self, row):
    # "Vecka" is Swedish for "week"; weeks >= 52 are counted from the 2019-12-29 origin,
    # later weeks from the 2021-01-03 origin.
    origin_date = pd.to_datetime("2019-12-29") if row.Vecka >= 52 else pd.to_datetime("2021-01-03")
    return origin_date + pd.DateOffset(days=7 * int(row.Vecka))
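# Hedged check of the arithmetic above: week 10 counted from the 2021-01-03 origin
# lands on 2021-03-14.
pd.to_datetime("2021-01-03") + pd.DateOffset(days=7 * 10)   # Timestamp('2021-03-14 00:00:00')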
import pandas as pd

from fastsr.containers.learning_data import LearningData

dat = pd.read_csv('data/hour.csv')
datetime_index = list()
for i, r in dat.iterrows():
    datetime_index.append(pd.to_datetime(r[1]) + pd.DateOffset(hours=r[5]))
dt_index = pd.DatetimeIndex(datetime_index)
dat = dat.set_index(dt_index)  # set_index returns a new frame, so assign it back
columns = ['atemp', 'windspeed', 'hum', 'cnt']
slim_dat = dat[columns]
learning_data = LearningData()
learning_data.from_data(slim_dat, columns, 'ucisimplebike')
learning_data.lag_predictors(24, column_names=['atemp', 'windspeed', 'hum'])
learning_data.to_hdf('data/hour_simple_lagged.hdf5')
def test_dateoffset_instance_subclass_check():
    assert not issubclass(pd.DateOffset, cudf.DateOffset)
    assert not isinstance(pd.DateOffset(), cudf.DateOffset)
def main(): # path of study folder study_path = str(sys.argv[1]) # participants# (eg. "P301 P302 P401") p_nums = str(sys.argv[2]) t0 = time() participants = p_nums.split(' ') for p in participants: print('Comparing in wild for '+p) current_dir = os.getcwd() save_folder = os.path.join(os.getcwd(), 'output_files', 'leave_' + p + '_out') if os.path.exists(save_folder): os.chdir(save_folder) else: os.chdir(os.path.join(os.getcwd(), 'output_files', 'using_all')) # TODO can move to a settings file (test, then delete if not needed) model = XGBClassifier(learning_rate=0.01, n_estimators=400, max_depth=10, min_child_weight=1, gamma=0, subsample=1, colsample_btree=1, scale_pos_weight=1, random_state=7, slient=0, nthread=4 ) model = joblib.load('WRIST.dat') path_table = os.path.join(study_path, p, 'In Wild/Summary/Actigraph/', p + ' In Wild IntensityMETMinLevel.csv') df_table = pd.read_csv(path_table, index_col=None, header=0) path_gyro = os.path.join(study_path, p, 'In Wild/Wrist/Aggregated/Gyroscope/Gyroscope_resampled.csv') df_gyro = pd.read_csv(path_gyro, index_col=None, header=0) df_gyro['Datetime'] = pd.to_datetime(df_gyro['Time'], unit='ms', utc=True).dt.tz_convert( 'America/Chicago').dt.tz_localize(None) # minute level data at 20hz, so 60*20 = 1200 data_length = 1200 nan_limit = 4 prediction = [] for n in df_table['Datetime']: start_time = pd.to_datetime(n) end_time = start_time + pd.DateOffset(minutes=1) temp_gyro = df_gyro.loc[(df_gyro['Datetime'] >= start_time) & (df_gyro['Datetime'] < end_time)].reset_index(drop=True) if len(temp_gyro['rotX']) == data_length: this_min_gyro = [temp_gyro['rotX'], temp_gyro['rotY'], temp_gyro['rotZ']] if np.count_nonzero(np.isnan(this_min_gyro[0])) > nan_limit: prediction.append(-1) else: model_output = model.predict(extract_features([this_min_gyro])) prediction.append(model_output[0]) else: prediction.append(-1) df_table['model_classification'] = prediction print("Hours of data: %g" % (float(len(df_table)) / float(60))) set_realistic_met_estimate(df_table) df_table.to_csv(p+'_in_wild_comparison.csv', index=False, encoding='utf8') l_datetime_all = df_table['Datetime'].tolist() l_freedson_all = df_table['MET (Freedson)'].tolist() l_vm3_all = df_table['MET (VM3)'].tolist() l_estimation_all = df_table['estimation'].tolist() l_freedson_all = [l_freedson_all[i] for i in range(len(l_estimation_all)) if not np.isnan(l_estimation_all[i])] l_vm3_all = [l_vm3_all[i] for i in range(len(l_estimation_all)) if not np.isnan(l_estimation_all[i])] l_estimation_all = [l_estimation_all[i] for i in range(len(l_estimation_all)) if not np.isnan(l_estimation_all[i])] l_datetime_all = [l_datetime_all[i] for i in range(len(l_estimation_all)) if not np.isnan(l_estimation_all[i])] vm3_all_reshaped = np.array(l_vm3_all).reshape(-1, 1) estimation_all_reshaped = np.array(l_estimation_all).reshape(-1, 1) freedson_all_reshaped = np.array(l_freedson_all).reshape(-1, 1) fig = go.Figure() fig.add_trace(go.Scatter(x=l_estimation_all, y=l_vm3_all, mode='markers')) regr = linear_model.LinearRegression() regr.fit(estimation_all_reshaped, vm3_all_reshaped) y_pred = regr.predict(estimation_all_reshaped) y_plot = np.reshape(y_pred, y_pred.shape[0]) fig.add_trace(go.Scatter(x=l_estimation_all, y=y_plot, mode='lines', name='linear regression', line=dict(color='red', width=4))) fig.update_layout(title='Linear Regression', xaxis_title='Estimation', yaxis_title='VM3 METs') outf = open('wild_est_vs_vm3_r2.txt', 'a') outf.write('%g\n' % r2_score(vm3_all_reshaped, y_pred)) outf.close() print("The r2 score 
for in wild estimation vs VM3 is: %g" % (r2_score(vm3_all_reshaped, y_pred))) # calculate Pearson's correlation corr, _ = pearsonr(np.array(l_estimation_all), np.array(l_vm3_all)) print('Pearsons correlation: %g' % corr) outf = open('wild_est_vs_vm3_pearson.txt', 'a') outf.write('%g\n' % corr) outf.close() # calculate Spearman's correlation corr, _ = spearmanr(np.array(l_estimation_all), np.array(l_vm3_all)) print('Spearmans correlation: %g' % corr) outf = open('wild_est_vs_vm3_spearman.txt', 'a') outf.write('%g\n' % corr) outf.close() py.offline.plot(fig, filename='in_wild_model_to_vm3.html', auto_open=False) fig = go.Figure() fig.add_trace(go.Scatter(x=l_estimation_all, y=l_freedson_all, mode='markers')) regr = linear_model.LinearRegression() regr.fit(estimation_all_reshaped, freedson_all_reshaped) y_pred = regr.predict(estimation_all_reshaped) y_plot = np.reshape(y_pred, y_pred.shape[0]) fig.add_trace(go.Scatter(x=l_estimation_all, y=y_plot, mode='lines', name='linear regression', line=dict(color='red', width=4))) fig.update_layout(title='Linear Regression', xaxis_title='Estimation', yaxis_title='Freedson METs') outf = open('wild_est_vs_freedson_r2.txt', 'a') outf.write('%g\n' % r2_score(freedson_all_reshaped, y_pred)) outf.close() print("The r2 score for in wild estimation vs Freedson is: %g" % (r2_score(freedson_all_reshaped, y_pred))) py.offline.plot(fig, filename='in_wild_model_to_freedson.html', auto_open=False) fig = go.Figure() fig.add_trace(go.Scatter(x=l_datetime_all, y=l_estimation_all, mode='markers', name='model estimation')) fig.add_trace(go.Scatter(x=l_datetime_all, y=l_vm3_all, mode='markers', name='actigraph vm3')) fig.add_trace(go.Scatter(x=l_datetime_all, y=l_freedson_all, mode='markers', name='actigraph freedson')) fig.update_layout(title='Model and ActiGraph Estimation', xaxis_title='Datetime', yaxis_title='MET') py.offline.plot(fig, filename='in_wild_comparison.html', auto_open=False) os.chdir(current_dir) t1 = time() print("Total in wild comparison time: %g minutes" % (float(t1 - t0) / float(60)))
def test_datetime64_with_DateOffset(klass, assert_func): s = klass(date_range('2000-01-01', '2000-01-31'), name='a') result = s + pd.DateOffset(years=1) result2 = pd.DateOffset(years=1) + s exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') assert_func(result, exp) assert_func(result2, exp) result = s - pd.DateOffset(years=1) exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') assert_func(result, exp) s = klass([ Timestamp('2000-01-15 00:15:00', tz='US/Central'), pd.Timestamp('2000-02-15', tz='US/Central') ], name='a') result = s + pd.offsets.Day() result2 = pd.offsets.Day() + s exp = klass([ Timestamp('2000-01-16 00:15:00', tz='US/Central'), Timestamp('2000-02-16', tz='US/Central') ], name='a') assert_func(result, exp) assert_func(result2, exp) s = klass([ Timestamp('2000-01-15 00:15:00', tz='US/Central'), pd.Timestamp('2000-02-15', tz='US/Central') ], name='a') result = s + pd.offsets.MonthEnd() result2 = pd.offsets.MonthEnd() + s exp = klass([ Timestamp('2000-01-31 00:15:00', tz='US/Central'), Timestamp('2000-02-29', tz='US/Central') ], name='a') assert_func(result, exp) assert_func(result2, exp) # array of offsets - valid for Series only if klass is Series: with tm.assert_produces_warning(PerformanceWarning): s = klass([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) result = s + Series( [pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) exp = klass([Timestamp('2001-1-1'), Timestamp('2000-2-29')]) assert_func(result, exp) # same offset result = s + Series([ pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1) ]) exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) assert_func(result, exp) s = klass([ Timestamp('2000-01-05 00:15:00'), Timestamp('2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp('2000-03-31'), Timestamp('2000-02-29'), Timestamp('2000-12-31'), Timestamp('2000-05-15'), Timestamp('2001-06-15') ]) # DateOffset relativedelta fastpath relative_kwargs = [('years', 2), ('months', 5), ('days', 3), ('hours', 5), ('minutes', 10), ('seconds', 2), ('microseconds', 5)] for i, kwd in enumerate(relative_kwargs): op = pd.DateOffset(**dict([kwd])) assert_func(klass([x + op for x in s]), s + op) assert_func(klass([x - op for x in s]), s - op) op = pd.DateOffset(**dict(relative_kwargs[:i + 1])) assert_func(klass([x + op for x in s]), s + op) assert_func(klass([x - op for x in s]), s - op) # assert these are equal on a piecewise basis offsets = [ 'YearBegin', ('YearBegin', { 'month': 5 }), 'YearEnd', ('YearEnd', { 'month': 5 }), 'MonthBegin', 'MonthEnd', 'SemiMonthEnd', 'SemiMonthBegin', 'Week', ('Week', { 'weekday': 3 }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', 'CustomBusinessDay', 'CDay', 'CBMonthEnd', 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', 'BusinessHour', 'BYearBegin', 'BYearEnd', 'BQuarterBegin', ('LastWeekOfMonth', { 'weekday': 2 }), ('FY5253Quarter', { 'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 2, 'variation': 'nearest' }), ('FY5253', { 'weekday': 0, 'startingMonth': 2, 'variation': 'nearest' }), ('WeekOfMonth', { 'weekday': 2, 'week': 2 }), 'Easter', ('DateOffset', { 'day': 4 }), ('DateOffset', { 'month': 5 }) ] with warnings.catch_warnings(record=True): for normalize in (True, False): for do in offsets: if isinstance(do, tuple): do, kwargs = do else: do = do kwargs = {} for n in [0, 5]: if (do in [ 'WeekOfMonth', 'LastWeekOfMonth', 'FY5253Quarter', 'FY5253' ] and n == 0): continue op = getattr(pd.offsets, do)(n, normalize=normalize, **kwargs) assert_func(klass([x + op for x in s]), s + op) 
assert_func(klass([x - op for x in s]), s - op) assert_func(klass([op + x for x in s]), op + s)
volumeData['lag6'] = volumeData['volume'].shift(6)
volumeData['lag7'] = volumeData['volume'].shift(7)

# COMMAND ----------

volumeData

# COMMAND ----------

import seaborn as sns
#sns.heatmap(volumeData.corr(), annot=True, fmt=".2f")
#display()

# COMMAND ----------

volumeDataTest['time'] = volumeDataTest['time'] + pd.DateOffset(hours=2)
volumeDataTest2 = volumeDataTest
del volumeDataTest

# COMMAND ----------

volumeDataTest2.head()

# COMMAND ----------

print(volumeData.shape, volumeDataTest2.shape)

# COMMAND ----------

# append columns to find whether it is a holiday, weekend or weekday, and what the hour is (0-23)
from datetime import datetime
# Load the data
filename = "c:\\LOG\\879448_2020-11-18-13-00-00_2020-11-25-09-00-00"  # !! edit this !!
data = pd.read_csv(filename + "_RAW.csv", parse_dates=[2])
data['flag'] = ''
print('records:', len(data))

# Build the time axis
# Generate the target time grid from the data
# (renamed from min/max to avoid shadowing the built-ins)
t_min = _min if _min < data['TIME'].min() else data['TIME'].min()  # !! edit this !!
t_max = _max if _max > data['TIME'].max() else data['TIME'].max()  # !! edit this !!
# t_min = data['TIME'].min()
# t_max = data['TIME'].min()
# Gridlist = pd.date_range(t_min.replace(microsecond=0, second=0, minute=t_min.minute//5*5), t_max + pd.DateOffset(minutes=5), freq='5T')
# Gridlist = pd.date_range(t_min, t_max, freq='5T')
Gridlist = pd.date_range(t_min.replace(microsecond=0, second=0, minute=t_min.minute // 5 * 5),
                         t_max + pd.DateOffset(minutes=5), freq='5T')
Gridlist = pd.DataFrame(Gridlist, columns=['TIME'])

# Data processing
# Build the list of IDs
IDSet = set(data['ID'].values)

# Process each ID in a loop
for id in IDSet:
    print('\nProcessing ', id)
    data_per_ID = data[data.ID == id]
    data1 = data_per_ID.values.tolist()
    print('raw records:', len(data1))
distances = pd.read_csv(args.distances, sep="\t") distances_by_sample_names = get_distances_by_sample_names(distances) # Load model details with open(args.model, "r") as fh: model_json = json.load(fh) predictors = model_json["predictors"] cost_function = model_json["cost_function"] l1_lambda = model_json["l1_lambda"] coefficients = np.array(model_json["coefficients_mean"]) mean_stds = np.array(model_json["mean_stds_mean"]) delta_month = args.delta_months[-1] delta_time = delta_month / 12.0 delta_offset = pd.DateOffset(months=delta_month) model = DistanceExponentialGrowthModel(predictors=predictors, delta_time=delta_time, cost_function=cost_function, l1_lambda=l1_lambda, distances=distances_by_sample_names) model.coef_ = coefficients model.mean_stds_ = mean_stds # collect fitness and projection forecasts_df = model.predict(tips) forecasts_df["weighted_distance_to_future_by_%s" % "-".join(predictors)] = forecasts_df["y"] # collect dicts from dataframe
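# Hedged illustration of the horizon offset above (a final delta of 12 months is assumed here):
example_collection_date = pd.Timestamp("2020-04-01")
example_collection_date + pd.DateOffset(months=12)   # Timestamp('2021-04-01 00:00:00')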
    adjustment_reader=bundle_data.adjustment_reader)


# Function to fetch prices from the data portal
def get_pricing(data_portal, trading_calendar, assets, start_date, end_date, field='close'):
    end_dt = pd.Timestamp(end_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')
    start_dt = pd.Timestamp(start_date.strftime('%Y-%m-%d'), tz='UTC', offset='C')

    end_loc = trading_calendar.closes.index.get_loc(end_dt)
    start_loc = trading_calendar.closes.index.get_loc(start_dt)

    return data_portal.get_history_window(
        assets=assets,
        end_dt=end_dt,
        bar_count=end_loc - start_loc,
        frequency='1d',
        field=field,
        data_frequency='daily')


# We fetch the data for our risk model using the get_pricing function.
# For this model, we will use 5 years of data.
five_year_returns = \
    get_pricing(
        data_portal,
        trading_calendar,
        universe_tickers,
        universe_end_date - pd.DateOffset(years=5),
        universe_end_date)\
    .pct_change()[1:].fillna(0)

five_year_returns.sample(5)
def custom1(inst):
    # shift the index forward by half a second and return doubled MLT as a new series
    new_index = inst.index + pds.DateOffset(milliseconds=500)
    d = pds.Series(2.0 * inst['mlt'], index=new_index)
    d.name = 'doubleMLT'
    print(new_index)
    return d
def evaluate_prediction(self, start_date=None, end_date=None, nshares = None): # Default start date is one year before end of data # Default end date is end date of data if start_date is None: start_date = self.max_date - pd.DateOffset(years=1) if end_date is None: end_date = self.max_date start_date, end_date = self.handle_dates(start_date, end_date) # Training data starts self.training_years years before start date and goes up to start date train = self.stock[(self.stock['Date'] < start_date) & (self.stock['Date'] > (start_date - pd.DateOffset(years=self.training_years)))] # Testing data is specified in the range test = self.stock[(self.stock['Date'] >= start_date) & (self.stock['Date'] <= end_date)] # Create and train the model model = self.create_model() model.fit(train) # Make a future dataframe and predictions future = model.make_future_dataframe(periods = 365, freq='D') future = model.predict(future) # Merge predictions with the known values test = pd.merge(test, future, on = 'ds', how = 'inner') train = pd.merge(train, future, on = 'ds', how = 'inner') # Calculate the differences between consecutive measurements test['pred_diff'] = test['yhat'].diff() test['real_diff'] = test['y'].diff() # Correct is when we predicted the correct direction test['correct'] = (np.sign(test['pred_diff'][1:]) == np.sign(test['real_diff'][1:])) * 1 # Accuracy when we predict increase and decrease increase_accuracy = 100 * np.mean(test[test['pred_diff'] > 0]['correct']) decrease_accuracy = 100 * np.mean(test[test['pred_diff'] < 0]['correct']) # Calculate mean absolute error test_errors = abs(test['y'] - test['yhat']) test_mean_error = np.mean(test_errors) train_errors = abs(train['y'] - train['yhat']) train_mean_error = np.mean(train_errors) # Calculate percentage of time actual value within prediction range test['in_range'] = False for i in test.index: if (test.loc[i, 'y'] < test.loc[i, 'yhat_upper']) & (test.loc[i, 'y'] > test.loc[i, 'yhat_lower']): test.loc[i, 'in_range'] = True in_range_accuracy = 100 * np.mean(test['in_range']) if not nshares: # Date range of predictions print('\nPrediction Range: {} to {}.'.format(start_date, end_date)) # Final prediction vs actual value print('\nPredicted price on {} = ${:.2f}.'.format(max(future['ds']), future.loc[future.index[-1], 'yhat'])) print('Actual price on {} = ${:.2f}.\n'.format(max(test['ds']), test.loc[test.index[-1], 'y'])) print('Average Absolute Error on Training Data = ${:.2f}.'.format(train_mean_error)) print('Average Absolute Error on Testing Data = ${:.2f}.\n'.format(test_mean_error)) # Direction accuracy print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy)) print('When the model predicted a decrease, the price decreased {:.2f}% of the time.\n'.format(decrease_accuracy)) print('The actual value was within the {:d}% confidence interval {:.2f}% of the time.'.format(int(100 * model.interval_width), in_range_accuracy)) # If a number of shares is specified, play the game elif nshares: # Only playing the stocks when we predict the stock will increase test_pred_increase = test[test['pred_diff'] > 0] test_pred_increase.reset_index(inplace=True) prediction_profit = [] # Iterate through all the predictions and calculate profit from playing for i, correct in enumerate(test_pred_increase['correct']): # If we predicted up and the price goes up, we gain the difference if correct == 1: prediction_profit.append(nshares * test_pred_increase.loc[i, 'real_diff']) # If we predicted up and the price goes 
down, we lose the difference else: prediction_profit.append(nshares * test_pred_increase.loc[i, 'real_diff']) test_pred_increase['pred_profit'] = prediction_profit # Put the profit into the test dataframe test = pd.merge(test, test_pred_increase[['ds', 'pred_profit']], on = 'ds', how = 'left') test.loc[0, 'pred_profit'] = 0 # Profit for either method at all dates test['pred_profit'] = test['pred_profit'].cumsum().ffill() test['hold_profit'] = nshares * (test['y'] - float(test.loc[0, 'y'])) # Display information print('You played the stock market in {} from {} to {} with {} shares.\n'.format( self.symbol, start_date, end_date, nshares)) print('When the model predicted an increase, the price increased {:.2f}% of the time.'.format(increase_accuracy)) print('When the model predicted a decrease, the price decreased {:.2f}% of the time.\n'.format(decrease_accuracy)) # Display some friendly information about the perils of playing the stock market print('The total profit using the Prophet model = ${:.2f}.'.format(np.sum(prediction_profit))) print('The Buy and Hold strategy profit = ${:.2f}.'.format(float(test.loc[test.index[-1], 'hold_profit']))) print('\nThanks for playing the stock market!\n') # Plot the predicted and actual profits over time # Final profit and final smart used for locating text final_profit = test.loc[test.index[-1], 'pred_profit'] final_smart = test.loc[test.index[-1], 'hold_profit'] # text location last_date = test.loc[test.index[-1], 'ds'] text_location = (last_date - pd.DateOffset(months = 1)) return test
def plot1_s4(self, const='G', freq='S4_sig1', sbas=False): if self._check_noNull_values(const, freq): # Get file UTC date figure_name = self._figure_name() fecha = figure_name[5:] # e.g. 200926 fecha2 = datetime.datetime.strptime(fecha, "%y%m%d") fecha3 = datetime.datetime.strftime(fecha2, "%Y/%m/%d") fecha2_tomorrow = fecha2 + pd.DateOffset(days=1) fecha2_tomorrow = fecha2_tomorrow.to_pydatetime() # Get UTC day range, to add a vertical strip fecha_morning_first = fecha2 + pd.DateOffset(hours=11) fecha_morning_first = fecha_morning_first.to_pydatetime() fecha_morning_last = fecha2 + pd.DateOffset(hours=23) fecha_morning_last = fecha_morning_last.to_pydatetime() # Get the PRNs PRNs = self.extract_prns(const, freq) # Include SBAS data if corresponds if sbas: PRNs = self._append_sbas_prns(const, freq, PRNs) # Create the figure with the subplots n_rows = (len(PRNs) + 1) // 2 n_cols = 2 fig, axs = plt.subplots(n_rows, n_cols, figsize=(7 * n_cols, 1 * n_rows), sharex="col", sharey="row", gridspec_kw={ 'hspace': 0, 'wspace': 0 }) j = 0 for ax in axs.T.reshape( -1): # Plot up to down, rather than left to right # ax -> s4 # ax2 -> elevation ax2 = ax.twinx() if j < len(PRNs): # Plot s4 info prn_value = PRNs[j] # -> Get the correct freq if sbas==True if sbas and prn_value[0] == 'S': freq_n = self._change_frequency(const, freq) else: freq_n = freq df3_s4 = self.get_s4(prn_value, freq_n) color1 = "blue" # This color is used in y axis labels, ticks and border colors1 = ["lightsteelblue", "cornflowerblue", "navy"] # These colors are used for the plots for k in range(3): df4_s4 = df3_s4[k + 1] ax.plot(df4_s4.index, df4_s4.values, '.', color=colors1[k], markersize=2) ax.set_facecolor(color="lightgrey") ax.axvspan(fecha_morning_first, fecha_morning_last, color="white") # strip morning/night # Plot elevation info df3_elev = self.get_elevation(PRNs[j], freq) color2 = "orange" ax2.plot(df3_elev.index, df3_elev.values, '.', color=color2, markersize=1) # Annotate the prn in the subplot x_location = fecha2 + pd.Timedelta(minutes=30) ax2.text(x_location, 35, self._convert2SVID(PRNs[j]), fontsize=15, weight='roman') # 0.375 # Set axis limits ax.set_xlim([fecha2, fecha2_tomorrow]) ax.set_ylim([0, 1]) ax2.set_ylim([0, 90]) # Set ticks and tick labels # Set y axis format, labels odds subplots only len_half_ax = len(axs.T.reshape(-1)) / 2 if j >= len_half_ax: # change only for the 2nd column k = j - len_half_ax # Set y labels only to even subplots ax.yaxis.set_minor_locator(AutoMinorLocator(4)) ax.set_yticks([0, 1]) ax2.yaxis.set_minor_locator(AutoMinorLocator(4)) ax2.set_yticks([0, 90]) if k % 2 == 0: ax.set_yticklabels([0, 1]) ax2.set_yticklabels([0, 90]) else: ax.set_yticklabels(['', '']) ax2.set_yticklabels(['', '']) # Set yellow color to the right y axis for axis in ['top', 'bottom', 'left']: ax.spines[axis].set_linewidth(2) ax2.spines[axis].set_linewidth(2) ax.spines['right'].set_color(color2) ax.spines['right'].set_linewidth(2) ax2.spines['right'].set_color(color2) ax2.spines['right'].set_linewidth(2) ax2.tick_params(axis='y', which='both', colors=color2) else: # apply some changes to the 1st column # remove y tick labels for elevation ax2.yaxis.set_minor_locator(AutoMinorLocator(4)) ax2.set_yticks([0, 90]) ax2.set_yticklabels(['', '']) # set linewidth to top, bottom and right borders of the subplot for axis in ['top', 'bottom', 'right']: ax.spines[axis].set_linewidth(2) ax2.spines[axis].set_linewidth(2) # Set blue color to the left y axis ax.spines['left'].set_color(color1) ax.spines['left'].set_linewidth(2) 
ax2.spines['left'].set_color(color1) ax2.spines['left'].set_linewidth(2) ax.tick_params(axis='y', which='both', colors=color1) # set x axis format hours = mdates.HourLocator(interval=2) ax.xaxis.set_major_locator(hours) # ticks interval: 2h ax.xaxis.set_minor_locator( AutoMinorLocator(2)) # minor tick division: 2 myFmt = DateFormatter("%H") ax.xaxis.set_major_formatter(myFmt) # x format: hours # set the ticks style ax.xaxis.set_tick_params(width=2, length=8, which='major', direction='out') ax.xaxis.set_tick_params(width=1, length=4, which='minor', direction='out') ax.yaxis.set_tick_params(width=2, length=15, which='major', direction='inout') ax.yaxis.set_tick_params(width=1, length=4, which='minor', direction='out') ax2.yaxis.set_tick_params(width=2, length=15, which='major', direction='inout') ax2.yaxis.set_tick_params(width=1, length=4, which='minor', direction='out') # set the label ticks ax.tick_params(axis='x', which='major', labelsize=12) ax.tick_params(axis='y', labelsize=12) ax2.tick_params(axis='y', labelsize=12) # set grid ax.grid(which='major', axis='both', ls=':', linewidth=1.2) ax.grid(which='minor', axis='both', ls=':', alpha=0.5) # Set title and axis labels aux = self.get_freq_name(const, int(freq[-1])) frequency_name = aux["name"] frequency_value = aux["value"] + "MHz" # -> Title if j == 0: # Subplot on Upper left fig.text(0, 1, fecha3, ha='left', va='bottom', fontsize=17, weight='semibold', transform=ax.transAxes) fig.text(0.5, 1, 'Jicamarca', ha='left', va='bottom', fontsize=17, weight='semibold', transform=ax.transAxes) if j == n_rows - 1: # Subplot on Lower left pass if j == n_rows: # Subplot on Upper right fig.text(0, 1, 'S4', ha='center', va='bottom', fontsize=17, weight='semibold', transform=ax.transAxes) fig.text(0.4, 1, frequency_value, ha='center', va='bottom', fontsize=17, weight='semibold', transform=ax.transAxes) fig.text( 1, 1, f"{frequency_name} | {self.get_const_name(const)}", ha='right', va='bottom', fontsize=17, weight='semibold', transform=ax.transAxes) # -> Labels if j == n_rows * n_cols - 1: # x axis label, Subplot on Lower right fig.text(0, -0.6, 'Time UTC', ha='center', va='center', fontsize=14, transform=ax.transAxes) if j == int(n_rows / 2): # y axis label on the left k = (n_rows % 2) * 0.5 fig.text(-0.1, 1 - k, 'S4', ha='center', va='center', rotation='vertical', fontsize=14, color='b', transform=ax.transAxes) if j == int(n_rows * n_cols - n_rows / 2): # y axis label on the right k = (n_rows % 2) * 0.5 fig.text(1.1, 1 - k, 'Elevation Angle', ha='center', va='center', rotation=-90, fontsize=14, color=color2, transform=ax.transAxes) j += 1 # Create directory for output files new_directory = output_files_path + figure_name + "/plot_2/" if not os.path.exists(new_directory): os.makedirs(new_directory) # Save figure as pdf #figure_name2 = figure_name + f"_s4_{self.get_const_name(const)}_{frequency_name}.pdf" #plt.savefig(new_directory + figure_name2, bbox_inches='tight') #pdf.savefig(fig) print( f"Plotted successfully; for const: {const}, and freq: {freq}!") return fig else: print( f"There is only Null data; for const: {const}, and freq: {freq}!" ) return 0
'max_demand_ind': max_demand_ind, 'max_part_peak_demand_ind': max_part_peak_demand_ind, 'max_peak_demand_ind': max_peak_demand_ind } return retval def charge_max(index, arrival_time, departure_time, limit_kw): retval = np.zeros(len(index)) # retval[(index > arrival_time) & (index < departure_time)] = limit_kw retval[(index > arrival_time)] = limit_kw return retval start = pd.Timestamp("2020-03-09 00:00", tz="US/Pacific") # Monday index = pd.date_range(start, start + pd.DateOffset(hours=24), freq="15min")[:-1] names = ["EV1", "EV2", "EV3", "EV4", "EV5", "EV6", "EV7", "EV8", "EV9", "EV10"] departures = [ pd.Timestamp("2020-03-09 11:17", tz="US/Pacific"), pd.Timestamp("2020-03-09 17:44", tz="US/Pacific"), pd.Timestamp("2020-03-09 16:22", tz="US/Pacific"), pd.Timestamp("2020-03-09 17:19", tz="US/Pacific"), pd.Timestamp("2020-03-09 17:20", tz="US/Pacific"), pd.Timestamp("2020-03-09 12:23", tz="US/Pacific"), pd.Timestamp("2020-03-09 13:38", tz="US/Pacific"), pd.Timestamp("2020-03-09 14:42", tz="US/Pacific"), pd.Timestamp("2020-03-09 16:28", tz="US/Pacific"), pd.Timestamp("2020-03-09 19:01", tz="US/Pacific") ]
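# Hedged sanity check of the index built above: 24 hours at 15-minute resolution with the
# right endpoint dropped gives 96 slots; charge_max then flags every slot after an assumed
# arrival time with an assumed 6.6 kW charger limit.
assert len(index) == 96
example_profile = charge_max(index, start + pd.DateOffset(hours=8), departures[0], limit_kw=6.6)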
def create_dataset(self, len_closeness=3, len_period=3, PeriodInterval=1, len_trend=3, TrendInterval=7, len_y=3): """current version """ # offset_week = pd.DateOffset(days=7) offset_frame = pd.DateOffset(minutes=24 * 60 // self.T) XC = [] XCS=[list() for i in range(7)] XP = [] XT = [] XCY = [] Y = [] timestamps_Y = [] cnm=0 depends = [[-6 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], [-5 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], [-4 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], [-3 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], [-2 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], [-1 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], [-0 * PeriodInterval * self.T - j for j in range(cnm, len_closeness + 1)[::-1]], # [j for j in range(len_y)], [0]] i = max(self.T * TrendInterval * len_trend, self.T * PeriodInterval * len_period, len_closeness) while i < (len(self.pd_timestamps) - (len_y - 1)): Flag = True for depend in depends: if Flag is False: break # Flag = self.check_it([self.pd_timestamps[i] + j * offset_frame for j in depend]) if Flag is False: i += 1 continue x_c6 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[0]] x_c5 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[1]] x_c4 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[2]] x_c3 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[3]] x_c2 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[4]] x_c1 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[5]] x_c0 = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[6]] # x_c_y = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in a] y = [self.get_matrix(self.pd_timestamps[i] + j * offset_frame) for j in depends[-1]] if len_closeness > 0: XCS[0].append(x_c6) XCS[1].append(x_c5) XCS[2].append(x_c4) XCS[3].append(x_c3) XCS[4].append(x_c2) XCS[5].append(x_c1) XCS[6].append(x_c0) if len_period > 0: XP.append((x_c2)) if len_trend > 0: XT.append((x_c3)) # if len_y > 0: # XCY.append((x_c_y)) Y.append(y) timestamps_Y.append(self.timestamps[i]) i += 1 XC = np.asarray(XC) XCS=[np.asarray(XC) for XC in XCS] XP = np.asarray(XP) XT = np.asarray(XT) XCY = np.asarray(XCY) Y = np.asarray(Y) print("STMatrix XC shape: ", XC.shape, "XP shape: ", XP.shape, "XT shape: ", XT.shape, "XCY shape: ", XCY.shape, "Y shape:", Y.shape) return XCS, XP, XT, XCY, Y, timestamps_Y
def __init__(self, return_calculator=None, options=None): """A portfolio selector for bond indexes """ self.options = options self.return_calculator = return_calculator self.unnalocated_symbol = 'UNNALOCATED_SYMBOL' self.concentration_rules = dict() # now checking the options if 'MAX_MATURITY' in options: self.max_maturity = options['MAX_MATURITY'] else: self.max_maturity = pd.DateOffset(years=200) if 'MIN_MATURITY' in options: self.min_maturity = options['MIN_MATURITY'] else: self.min_maturity = pd.DateOffset(days=0) if 'MIN_ELIGIBLE_MATURITY' in options: # Bonds with maturity closer than 'MIN_ELIGIBLE_MATURITY' will not enter the portfolio if they are not # already in. self.min_eligible_maturity = options['MIN_ELIGIBLE_MATURITY'] else: self.min_eligible_maturity = None if 'SELL_DEFAULTED' in options: self.sell_defaulted = options['SELL_DEFAULTED'] else: self.sell_defaulted = False if 'CORPORATE_MIN_OUTSTANDING' in options: self.corp_min_outstanding = options['CORPORATE_MIN_OUTSTANDING'] else: self.corp_min_outstanding = 0 if 'GOVT_MIN_OUTSTANDING' in options: self.govt_min_outstanding = options['GOVT_MIN_OUTSTANDING'] else: self.govt_min_outstanding = 0 if 'CORP_WEIGHT' in options: self.corp_weight = options['CORP_WEIGHT'] else: self.corp_weight = 0.8 if 'GOVT_WEIGHT' in options: self.govt_weight = options['GOVT_WEIGHT'] else: self.govt_weight = 1 - self.corp_weight if 'MAX_TOLERANCE_AMONG_PORTFOLIOS' in options: self.max_tolerance_among_portfolios = options[ 'MAX_TOLERANCE_AMONG_PORTFOLIOS'] else: self.max_tolerance_among_portfolios = 0.0 if 'MAX_TOLERANCE_AMONG_SECURITIES' in options: self.max_tolerance_among_securities = options[ 'MAX_TOLERANCE_AMONG_SECURITIES'] else: self.max_tolerance_among_securities = 0.0 # Now checking the concentration rules if any('CONCENTRATION_RULES' in x for x in options.keys()): # Building concentration_rules dict for key, value in options.items(): if 'CONCENTRATION_RULES' in key: # Remove the first two words, which are CONCENTRATION and RULES arguments = list(key.split('_'))[2:] current_dict = self.concentration_rules if len(arguments) >= 2: for arg in arguments[:-1]: try: converted_arg = float(arg) except: converted_arg = arg if converted_arg in current_dict.keys(): pass else: current_dict[converted_arg] = dict() current_dict = current_dict[converted_arg] last_arg = arguments[-1] try: last_arg = float(last_arg) except: pass current_dict[last_arg] = value print('Successfully instantiated BondIndex Class with options:') print('max maturity: ' + str(self.max_maturity)) print('min maturity: ' + str(self.min_maturity)) print('min eligible maturity: {}'.format(self.min_eligible_maturity)) print('sell defaulted : ' + str(self.sell_defaulted)) print('min govt outstanding: {}'.format(self.govt_min_outstanding)) print('corp weight: {}'.format(self.corp_weight)) print('govt weight: {}'.format(self.govt_weight)) print('rebalancing tolerance between portfolios: {}'.format( self.max_tolerance_among_portfolios)) print('rebalancing tolerance between securities: {}'.format( self.max_tolerance_among_securities)) print('Concentration rules:') print(self.concentration_rules)
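# Hedged example of the CONCENTRATION_RULES key parsing above (key and value are made up):
# options = {'CORP_WEIGHT': 0.7, 'CONCENTRATION_RULES_SECTOR_FINANCIALS': 0.1}
# -> self.concentration_rules == {'SECTOR': {'FINANCIALS': 0.1}}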
# (Tail of a float-to-datetime conversion helper; its def line is not included in this snippet.)
    str_to_convert = "%.6f" % as_float

    # have to do this because of leap seconds
    time_string, dot, microseconds = str_to_convert.partition(".")
    utc_time_tuple = time.strptime(str_to_convert, LONG_DATE_FORMAT)
    as_datetime = datetime.datetime(1970, 1, 1) + datetime.timedelta(
        seconds=calendar.timegm(utc_time_tuple))
    as_datetime = as_datetime.replace(
        microsecond=datetime.datetime.strptime(microseconds, "%f").microsecond)

    return as_datetime


NOTIONAL_CLOSING_TIME = dict(hours=23, minutes=0, seconds=0)
NOTIONAL_CLOSING_TIME_AS_PD_OFFSET = pd.DateOffset(
    hours=NOTIONAL_CLOSING_TIME['hours'],
    minutes=NOTIONAL_CLOSING_TIME['minutes'],
    seconds=NOTIONAL_CLOSING_TIME['seconds'])


def adjust_timestamp_to_include_notional_close_and_time_offset(
    timestamp: datetime.datetime,
    actual_close: pd.DateOffset = NOTIONAL_CLOSING_TIME_AS_PD_OFFSET,
    original_close: pd.DateOffset = pd.DateOffset(hours=23, minutes=0, seconds=0),
    time_offset: pd.DateOffset = pd.DateOffset(hours=0),
) -> datetime.datetime:

    if timestamp.hour == 0 and timestamp.minute == 0 and timestamp.second == 0:
        # A bare date (midnight) is moved to the notional closing time.
        new_datetime = timestamp.date() + actual_close
    elif time_matches(timestamp, original_close):
        # The snippet was truncated at this branch; the completion below is inferred
        # from the branch above, not taken from the original source: a stamp at the
        # original close is likewise moved to the actual notional close.
        new_datetime = timestamp.date() + actual_close
    else:
        new_datetime = timestamp

    return new_datetime + time_offset
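# A minimal standalone check of the date-plus-offset trick used above: adding a pure-time
# DateOffset to a datetime.date yields a pandas Timestamp on that day at the notional
# closing time.  The timestamp value below is illustrative only.
import datetime
import pandas as pd

midnight_stamp = datetime.datetime(2021, 3, 15, 0, 0, 0)
notional_close = pd.DateOffset(hours=23, minutes=0, seconds=0)

print(midnight_stamp.date() + notional_close)  # 2021-03-15 23:00:00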
def predict_future(self, days=30):
    # Use the past self.training_years years of data for training
    train = self.stock[self.stock['Date'] > (max(self.stock['Date']) - pd.DateOffset(years=self.training_years)).date()]

    model = self.create_model()
    model.fit(train)

    # Future dataframe with the specified number of days to predict
    future = model.make_future_dataframe(periods=days, freq='D')
    future = model.predict(future)

    # Only concerned with future dates
    future = future[future['ds'] >= max(self.stock['Date']).date()]

    # Remove the weekends
    future = self.remove_weekends(future)

    # Calculate whether the estimate increases or decreases day over day
    future['diff'] = future['yhat'].diff()
    future = future.dropna()

    # Find the prediction direction (1 = increase, 0 = decrease)
    future['direction'] = (future['diff'] > 0) * 1

    # Rename the columns for presentation
    future = future.rename(columns={'ds': 'Date', 'yhat': 'estimate', 'diff': 'change',
                                    'yhat_upper': 'upper', 'yhat_lower': 'lower'})

    future_increase = future[future['direction'] == 1]
    future_decrease = future[future['direction'] == 0]

    print('\nPredicted Increase: \n')
    print(future_increase[['Date', 'estimate', 'change', 'upper', 'lower']])

    print('\nPredicted Decrease: \n')
    print(future_decrease[['Date', 'estimate', 'change', 'upper', 'lower']])

    self.reset_plot()

    # Set up plot
    plt.style.use('fivethirtyeight')
    matplotlib.rcParams['axes.labelsize'] = 10
    matplotlib.rcParams['xtick.labelsize'] = 8
    matplotlib.rcParams['ytick.labelsize'] = 8
    matplotlib.rcParams['axes.titlesize'] = 12

    # Plot the predictions and indicate whether they are increases or decreases
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))

    # Plot the estimates
    ax.plot(future_increase['Date'], future_increase['estimate'], 'g^', ms=12, label='Pred. Increase')
    ax.plot(future_decrease['Date'], future_decrease['estimate'], 'rv', ms=12, label='Pred. Decrease')

    # Plot errorbars
    ax.errorbar(future['Date'].dt.to_pydatetime(), future['estimate'],
                yerr=future['upper'] - future['lower'],
                capthick=1.4, color='k', linewidth=2,
                ecolor='darkblue', capsize=4, elinewidth=1, label='Pred with Range')

    # Plot formatting
    plt.legend(loc=2, prop={'size': 10})
    plt.xticks(rotation='45')
    plt.ylabel('Predicted Stock Price (US $)')
    plt.xlabel('Date')
    plt.title('Predictions for %s' % self.symbol)
    plt.show()

    return future

# NOTE: the block below follows predict_future's return statement and refers to a
# `results` frame that is never built there; it appears to be the tail of a separate
# changepoint-prior-scale validation routine whose def line is not included in this snippet.

# Plot of training and testing average errors
self.reset_plot()

plt.plot(results['cps'], results['train_err'], 'bo-', ms=8, label='Train Error')
plt.plot(results['cps'], results['test_err'], 'r*-', ms=8, label='Test Error')
plt.xlabel('Changepoint Prior Scale')
plt.ylabel('Avg. Absolute Error ($)')
plt.title('Training and Testing Curves as Function of CPS')
plt.grid(color='k', alpha=0.3)

plt.xticks(results['cps'], results['cps'])
plt.legend(prop={'size': 10})
plt.show()

# Plot of training and testing average uncertainty
self.reset_plot()

plt.plot(results['cps'], results['train_range'], 'bo-', ms=8, label='Train Range')
plt.plot(results['cps'], results['test_range'], 'r*-', ms=8, label='Test Range')
plt.xlabel('Changepoint Prior Scale')
plt.ylabel('Avg. Uncertainty ($)')
plt.title('Uncertainty in Estimate as Function of CPS')
plt.grid(color='k', alpha=0.3)

plt.xticks(results['cps'], results['cps'])
plt.legend(prop={'size': 10})
plt.show()
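# `remove_weekends` is not shown in this snippet; below is a minimal sketch of what such a
# helper might look like (an assumption, not the original implementation): drop forecast
# rows whose date falls on a Saturday or Sunday.
import pandas as pd

def remove_weekends_sketch(forecast: pd.DataFrame, date_col: str = 'ds') -> pd.DataFrame:
    # dayofweek: Monday=0 ... Sunday=6, so keep only rows with dayofweek < 5
    weekdays = pd.to_datetime(forecast[date_col]).dt.dayofweek < 5
    return forecast[weekdays].reset_index(drop=True)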
sam3['EArray_norm'] = sam3['Subarray 1 DC power gross | (kW)'] / norm_factor_SAM3


# In[31]:


# Add prefixes, update YEAR to the same value, and MERGE
sam1foo = sam1.add_prefix('sam_')
sam2foo = sam2.add_prefix('sam_')
sam3foo = sam3.add_prefix('sam_')
pvsyst1foo = pvsyst1.add_prefix('pvsyst_')
pvsyst2foo = pvsyst2.add_prefix('pvsyst_')
pvsyst3foo = pvsyst3.add_prefix('pvsyst_')

# Note: DateOffset(year=2020) (singular "year") replaces the year component so both
# sources line up on 2020; DateOffset(years=...) would instead add that many years.
pvsyst1foo.index = pvsyst1foo.index + pd.DateOffset(year=2020)
pvsyst2foo.index = pvsyst2foo.index + pd.DateOffset(year=2020)
pvsyst3foo.index = pvsyst3foo.index + pd.DateOffset(year=2020)

case1 = pd.concat([sam1foo, pvsyst1foo], axis=1)
case2 = pd.concat([sam2foo, pvsyst2foo], axis=1)
case3 = pd.concat([sam3foo, pvsyst3foo], axis=1)


# ### Input Irradiances comparison
#
# It is recommended to check the days just before and after the daylight-saving time shift
# (just in case), as well as dates that can and cannot be confused when day and month are
# swapped (MM/DD vs DD/MM); a short check along these lines is sketched below.
#
# PVsyst output is saved as DD/MM, SAM output is saved as MM/DD.

# In[32]:
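# A minimal sketch of the recommended check, assuming case1 carries the hourly 2020
# DatetimeIndex built above; the specific dates are illustrative (the daylight-saving
# shift is assumed to be the US 2020 spring-forward on March 8).
check_days = ['2020-03-04',               # reads as April 3rd if DD/MM is parsed as MM/DD
              '2020-03-07', '2020-03-08'] # just before / after the spring-forward shift

for day in check_days:
    print(day)
    print(case1.loc[day].describe())      # per-column stats for that day's hourly values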