def build_features(company='amazon'):
    """Build the interim feature set for one company and persist it.

    Loads the raw stock data pickle, derives technical indicators and
    Fourier-transform features for *company*, and saves the result to
    ``../../data/interim/features_<company>.pickle``.

    Args:
        company: key into the raw stock-data dict (default ``'amazon'``,
            preserving the previous hard-coded behavior).
    """
    data = load('../../data/raw/stock_data.pickle')
    features = data[company]
    features = get_technical_indicators(features)
    features = get_fourier_transforms(features)
    save(features, '../../data/interim/features_' + company + '.pickle')
def get_raw_stock_data():
    """Download daily OHLCV series from Alpha Vantage for a fixed set of
    companies, normalize each into a DataFrame, pickle the dict and return it.

    Returns:
        dict mapping company name -> DataFrame with columns
        ['Date', 'Close', 'Open', 'High', 'Low', 'Volume'], oldest row first,
        numeric columns coerced and 'Date' parsed as datetime.
    """
    # Mapping from our column names to Alpha Vantage's JSON field names.
    columns = {
        'Open': "1. open",
        'High': "2. high",
        'Low': "3. low",
        'Close': "4. close",
        'Volume': "5. volume"
    }
    # SECURITY NOTE(review): the API key is hard-coded below; it should be
    # moved to an environment variable or config file and rotated.
    # NOTE: 'porshe' is a typo for 'porsche' but is kept as-is because the
    # key is used downstream when the pickle is loaded.
    api_requests = {
        'tesla': 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=TSLA&interval=60min&outputsize=full&apikey=CMSZIWYWAHTR01BR',
        'google': 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=GOOGL&interval=60min&outputsize=full&apikey=CMSZIWYWAHTR01BR',
        'bmw': 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=FRA:BMW&interval=60min&outputsize=full&apikey=CMSZIWYWAHTR01BR',
        'daimler': 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=ETR:DAI&interval=60min&outputsize=full&apikey=CMSZIWYWAHTR01BR',
        'porshe': 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=ETR:PAH3&interval=60min&outputsize=full&apikey=CMSZIWYWAHTR01BR',
        'amazon': 'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol=AMZN&interval=60min&outputsize=full&apikey=CMSZIWYWAHTR01BR'
    }
    raw_stock_data = {}
    for company, request in api_requests.items():
        req = requests.get(request)
        data = req.json()
        # The API returns newest-first; collect rows then reverse so the
        # DataFrame is oldest-first (what the downstream code expects).
        # (Replaces the previous pre-allocated np.NaN array: np.NaN was
        # removed in NumPy 2.0 and the fill-by-index loop was error-prone.)
        rows = [
            [
                date,
                values[columns['Close']],
                values[columns['Open']],
                values[columns['High']],
                values[columns['Low']],
                values[columns['Volume']]
            ]
            for date, values in data['Time Series (Daily)'].items()
        ]
        df = pd.DataFrame(
            data=rows[::-1],
            columns=['Date', 'Close', 'Open', 'High', 'Low', 'Volume'])
        # The JSON values are strings; coerce price/volume columns to numbers.
        df[['Close', 'Open', 'High', 'Low', 'Volume']] = df[[
            'Close', 'Open', 'High', 'Low', 'Volume']].apply(pd.to_numeric)
        df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d')
        raw_stock_data[company] = df
    save(raw_stock_data, '../../data/raw/stock_data.pickle')
    return raw_stock_data
def build_data_trading_plot():
    """Assemble the unscaled, lookback-1 time-series dataset used for the
    trading visualisation plot, and persist it under data/timeseries."""
    features = load('../../data/processed/features_amazon_corr.pickle')
    targets = load('../../data/processed/targets_amazon.pickle')

    # Trim the early history that is not relevant to the model.
    features, targets = drop_not_relevant(features, targets, 1660)

    n_samples = features.shape[0]
    data, name = build_data(
        features,
        targets,
        lookback=1,
        scaled=False,
        encode_binary=False,
        test_size=int(0.05 * n_samples),
        val_size=int(0.15 * n_samples),
        pct_change=False)

    save(data, f'../../data/timeseries/{name}_trading_vis_amazon.pickle')
def build_default_data():
    """Build and persist every default time-series dataset variant:
    the cross product of binary/real targets, scaled/unscaled features,
    and lookback windows of 1 and 60 days."""
    features = load('../../data/processed/features_amazon_corr.pickle')
    targets = load('../../data/processed/targets_amazon.pickle')

    # Trim the early history that is not relevant to the model.
    features, targets = drop_not_relevant(features, targets, 1660)

    n_samples = features.shape[0]
    test_size = int(0.05 * n_samples)
    val_size = int(0.15 * n_samples)

    # All combinations of (encode_binary, scaled, lookback).
    variants = [
        (encode_binary, scaled, lookback)
        for encode_binary in (True, False)
        for scaled in (True, False)
        for lookback in (1, 60)
    ]
    for encode_binary, scaled, lookback in variants:
        data, name = build_data(
            features,
            targets,
            lookback=lookback,
            scaled=scaled,
            encode_binary=encode_binary,
            test_size=test_size,
            val_size=val_size,
            pct_change=True)
        save(data, '../../data/timeseries/' + name + '_amazon.pickle')
def select_features():
    """Select features by tree-based (XGBoost) importance and persist the
    selection.

    Trains a model on the interim features, plots feature importances
    (axis labels are in Polish), thresholds them, and saves the selected
    feature labels and columns under data/interim and data/processed.

    NOTE(review): a second ``select_features`` defined later in this module
    shadows this one at import time, so this definition is unreachable via
    its name — it should probably be renamed (e.g. ``select_features_trees``).
    """
    features = load('../../data/interim/features.pickle')
    lookback = 1
    model = build_model(features, lookback)
    xgb.plot_importance(model)
    if lookback == 1:
        # Removed unused `fig =` binding; the figure is managed by pyplot.
        plt.figure(figsize=(8, 8))
        plt.xticks(rotation='vertical')
        # Columns 6-13 and 15+ are the engineered features being ranked;
        # the skipped indices are presumably raw price columns — TODO confirm.
        plt.barh([i for i in range(len(model.feature_importances_))],
                 model.feature_importances_.tolist(),
                 tick_label=features.columns[
                     list(range(6, 14)) + list(range(15, features.shape[1]))])
        plt.title('Istotność cech')
        plt.xlim((0, 0.3))
        plt.ylabel('Cecha')
        plt.xlabel('Istotność')
        plt.show()
    else:
        feature_importances = calculate_feature_importances(
            features, lookback, model)
        plot_feature_importance(feature_importances)
    selected_features, selected_features_with_lookback = threshold_features(
        feature_importances, lookback)
    print('Selected features with trees: ', selected_features)
    not_selected_features = list(
        set(features.columns.to_list()).difference(set(selected_features)))
    print('Not selected features: ', not_selected_features)
    save(selected_features,
         '../../data/interim/selected_features_labels_trees.pickle')
    save(features[selected_features],
         '../../data/processed/selected_features_trees.pickle')
    save(selected_features_with_lookback,
         '../../data/interim/selected_features_labels_with_lookback_trees.pickle')
def save_targets():
    """Persist Amazon's Date/Close columns as the prediction targets."""
    stock_data = load('../../data/raw/stock_data.pickle')
    amazon = stock_data['amazon']
    targets = pd.DataFrame(amazon[['Date', 'Close']],
                           columns=['Date', 'Close'])
    save(targets, '../../data/processed/targets_amazon.pickle')
def select_features():
    """Correlation-based feature selection for the Amazon feature set.

    Keeps features whose absolute correlation (of day-over-day percent
    changes) with 'Close' exceeds a small threshold, then drops one of each
    pair of features correlated above 0.95 with each other. Saves the
    selected labels and the reduced, Date-indexed feature frame.
    """
    features = load('../../data/interim/features_amazon.pickle')
    date = features['Date']
    features.drop('Date', inplace=True, axis=1)

    # Day-over-day relative change; divisions by zero yield inf -> NaN.
    features_diff = features.pct_change().replace([np.inf, -np.inf], np.nan)

    # Impute NaNs (first row and inf replacements) with column means.
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    features_diff = pd.DataFrame(data=imp_mean.fit_transform(features_diff),
                                 columns=features.columns)

    # Absolute correlation matrix of the percent changes (the original
    # comments called this "covariance" — it is correlation).
    corr_matrix = features_diff.corr().abs()
    close_price_corr = pd.DataFrame(
        corr_matrix['Close'].sort_values(ascending=False))

    # Keep features with any measurable correlation to the close price.
    corr_threshold = 0.001
    selected_features = close_price_corr[
        close_price_corr['Close'] > corr_threshold].index.to_list()
    not_selected_features = list(
        set(features.columns.to_list()).difference(set(selected_features)))
    print('Not selected features: ', not_selected_features)
    save(selected_features,
         '../../data/interim/selected_features_labels_cov.pickle')

    # .copy() so the in-place drop below does not operate on a slice
    # (avoids pandas SettingWithCopyWarning / silent no-op).
    features = features[selected_features].copy()

    # Upper triangle of the correlation matrix (diagonal excluded).
    # Builtin `bool` replaces np.bool, which was removed in NumPy 1.24.
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Drop one of each pair of features correlated above 0.95, but never
    # a feature that was already excluded by the threshold above.
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    for f in not_selected_features:
        if f in to_drop:
            to_drop.remove(f)
    features.drop(to_drop, axis=1, inplace=True)
    print(to_drop)

    # Restore the Date column as the index and persist.
    features['Date'] = date
    features.set_index('Date', inplace=True)
    save(features, '../../data/processed/features_amazon_corr.pickle')