def make_prediction(self, model_name, model_for_each_ticker_dict, preprocessed_data_dict, original_df_dict):
    """
    Make future stock price prediction.

    For every fitted model, ``model.predict`` is run on the forecast features stored at index 1 of the
    ticker's entry in ``preprocessed_data_dict``. The result is stored in a new "<ticker> - Forecast"
    column on a NaN-free deep copy of the ticker's original dataframe.

    :param model_name: str, name of the model (only used in the log message).
    :param model_for_each_ticker_dict: dict, fitted models keyed by ticker symbol in its file-name-safe
        form, i.e. with "/" replaced by "_" (this is how build_model keys its result).
    :param preprocessed_data_dict: dict, keyed by ticker symbol; the element at index 1 of each value is
        used as X_forecast.
    :param original_df_dict: dict, original dataframes keyed by ticker symbol.
    :return forecast_df_dict: dict, keyed by ticker symbol, the dataframe copies with the forecast column.
    """
    logger.info(
        "----------------Predicting future prices using the {} model----------------"
        .format(model_name))
    # Resolve model keys through the known ticker symbols instead of blindly turning every "_" into "/":
    # a ticker whose own code contains an underscore would otherwise be mapped to a symbol that does
    # not exist in the data dictionaries.
    symbol_by_model_key = {symbol.replace("/", "_"): symbol for symbol in original_df_dict}
    forecast_df_dict = {}
    for model_key, model in model_for_each_ticker_dict.items():
        if model_key in symbol_by_model_key:
            ticker_symbol = symbol_by_model_key[model_key]
        else:
            # Unknown key: fall back to the plain substitution so the lookup below behaves as before.
            ticker_symbol = model_key.replace("_", "/")
        logger.info(
            "Predicting future prices for {}".format(ticker_symbol))
        df_copy = original_df_dict[ticker_symbol].copy(deep=True)
        df_copy.dropna(inplace=True)
        X_forecast = preprocessed_data_dict[ticker_symbol][1]
        logger.debug("len(X_forecast) = {}".format(len(X_forecast)))
        forecast_set = model.predict(X_forecast)
        # NOTE(review): this assignment needs forecast_set to have one value per remaining row of
        # df_copy - confirm that both were derived from the same NaN-free rows.
        df_copy["{} - Forecast".format(ticker_symbol)] = forecast_set
        forecast_df_dict[ticker_symbol] = df_copy
    return forecast_df_dict
def stock_ticker_list(self):
    """
    Return the ticker symbols held by this instance.

    :return: the current value of ``self._stock_ticker_list``.
    """
    logger.debug("Getting stock ticker list")
    tickers = self._stock_ticker_list
    return tickers
def optimize_hyperparameters(self, model_name, cv_iterator):
    """
    Wrap the named estimator in a grid search over its configured parameter grid.

    The search object is only constructed here; it is not fitted.

    :param model_name: str, key into ``self.parameters_dict`` and ``self.models_dict``.
    :param cv_iterator: cross validation splitter handed to GridSearchCV as ``cv``.
    :return: GridSearchCV, the (unfitted) search object.
    """
    logger.debug("Optimizing hyper-parameters")
    # The parameter grid is looked up before the estimator, so an unknown model name fails on the grid first.
    param_grid = self.parameters_dict[model_name]
    base_estimator = self.models_dict[model_name]
    return GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=cv_iterator)
def build_model(self, model_name, preprocessed_data_dict, force_build):
    """
    Build machine learning models using different supervised learning regression algorithms.

    For every ticker a model is either fitted (StandardScaler followed by the grid search returned by
    optimize_hyperparameters) and pickled, or loaded from a previously saved pickle. The model is then
    scored on the test part of the split returned by get_train_and_test_data, its learning curve is
    plotted and it is cross validated on the full data.

    :param model_name: str, name of the model to be built.
    :param preprocessed_data_dict: dict, keyed by ticker symbol; each value unpacks into exactly three
        items (X, X_forecast, y), of which X and y are used here.
    :param force_build: bool, if True, will force the function to build the model, even if there is a
        saved model which was built before that is available.
    :return model_dict: dict, keyed by ticker symbol with "/" replaced by "_", the model objects.
    :return model_scores_dict: dict, same keys, the value of ``model.score`` on the test part of the split.
    """
    logger.info("----------------Building model using {}----------------".format(model_name))
    model_dict = {}
    model_scores_dict = {}
    curr_dir = os.getcwd()
    # The learning curve plots are written below this directory; make sure it is there before plotting.
    plots_dir = "{}/learning_curve_plots".format(curr_dir)
    if not os.path.isdir(plots_dir):
        os.makedirs(plots_dir)
    # The splitter holds no per-ticker state, so a single instance serves every ticker.
    tscv = TimeSeriesSplit(n_splits=5)
    for ticker_symbol, preprocessed_data in preprocessed_data_dict.items():
        # The forecast features (second item) are not needed for training.
        X, _, y = preprocessed_data
        # "/" cannot be part of a file name, so the symbol is made file-name-safe.
        ticker_symbol = ticker_symbol.replace("/", "_")
        # The split does not depend on the model, so it is computed once for both branches below.
        X_train, X_test, y_train, y_test = self.get_train_and_test_data(X, y, tscv)
        saved_model_path = "{}/{}_{}_model.pickle".format(self.saved_models_path, model_name, ticker_symbol)
        if force_build or not os.path.exists(saved_model_path):
            # Optimize the hyperparameters based on the cross validation scores
            optimized_model = self.optimize_hyperparameters(model_name, tscv)
            model = make_pipeline(StandardScaler(), optimized_model)
            model.fit(X_train, y_train)
            self.save_to_pickle_file(model_name, ticker_symbol, model, "model")
        else:
            model = self.load_from_pickle_file(model_name, ticker_symbol, "model")
        # Logged as "Training score" below, but computed on the test part of the split.
        confidence_score = model.score(X_test, y_test)
        # Plot learning curves
        title = "{}_{}_Learning Curves".format(model_name, ticker_symbol)
        save_file_path = "{}/learning_curve_plots/{}_{}.png".format(curr_dir, model_name, ticker_symbol)
        self.plot_learning_curve(model, title, X, y, save_file_path, cv=tscv)
        # Cross validation
        cv_scores = cross_validate(model, X=X, y=y, cv=tscv)
        logger.info("Training score for {} = {}".format(ticker_symbol, confidence_score))
        logger.debug("Cross validation scores for {} = {}".format(ticker_symbol, cv_scores["test_score"]))
        logger.info("Cross validation score for {} = {} +/- {}".format(
            ticker_symbol, cv_scores["test_score"].mean(), cv_scores["test_score"].std() * 2))
        logger.debug("Cross validation scoring time = {}s".format(cv_scores["score_time"].sum()))
        model_dict[ticker_symbol] = model
        model_scores_dict[ticker_symbol] = confidence_score
    return model_dict, model_scores_dict
def get_train_and_test_data(self, X, y, tscv):
    """
    Get the train and test data sets of the last split produced by ``tscv``.

    :param X: ndarray, indexed with the index arrays yielded by ``tscv.split``.
    :param y: array, indexed with the same index arrays as X.
    :param tscv: iterator, TimeSeriesSplit iterator; ``tscv.split(X)`` must yield (train, test) index pairs.
    :return X_train, X_test, y_train, y_test: arrays, train and test data of the last split.
    :raises IndexError: if ``tscv.split(X)`` yields no split at all.
    """
    # Only the last split is returned (for a time series split it carries the most training data),
    # so just the index arrays are collected and X and y are sliced once, instead of building
    # train/test copies of the data for every split.
    train_indices, test_indices = list(tscv.split(X))[-1]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    # Share of all samples that ended up in the training part, in percent.
    logger.debug("Last train_data size = {}".format(len(X_train) * 100 / len(X)))
    return X_train, X_test, y_train, y_test
def get_stock_data(self, update_data=False):
    """
    Get stock data for the ticker symbols in the json file (stockdata/stockdatainfo.json) from quandl.

    The data is downloaded and written to ``self.stock_data_path`` when ``update_data`` is True or when
    that file does not exist yet. The returned dataframe is always read back from that CSV file.

    :param update_data: bool, tells the function whether to pull data everytime or not.
    :return df: Dataframe, the content of the CSV file. It is read without ``index_col``, so the index
        that was written to the file comes back as a regular column.
    """
    logger.info(
        "----------------Getting stock data from Quandl----------------")
    logger.info("Stock ticker list = {}".format(self._stock_ticker_list))
    data_path = self.stock_data_path
    # Test the very path that is written and read below. A relative path resolves against the current
    # working directory either way, while prefixing the working directory by hand pointed at a
    # different location whenever stock_data_path was absolute.
    if update_data or not os.path.exists(data_path):
        df = quandl.get(self._stock_ticker_list)
        logger.info("Writing stock data to {}".format(data_path))
        # Write the dataframe to a csv file
        df.to_csv("{}".format(data_path))
    logger.info("Reading stock data from {}".format(data_path))
    # Read the data from the csv file, also right after a download, so callers always get the
    # dataframe in the form it has on disk.
    df = pd.read_csv("{}".format(data_path))
    logger.debug("df.shape = {}".format(df.shape))
    return df
def preprocess_data(self, df, ticker_symbol_list):
    """
    Preprocess stock data.

    For every ticker the relevant feature columns are selected (for the "WIKI" domain the high/low and
    open/close percentage columns are added first), rows with NaN are dropped and the forecast column
    is copied into a "label" column. The label is then shifted into the future by ``forecast_out`` rows
    to build the training arrays.

    :param df: dataframe, original dataframe
    :param ticker_symbol_list: list, list of ticker symbols (stored on ``self.ticker_symbol_list``)
    :return preprocessed_data_dict: dict, dictionary with ticker symbols as keys and the list
        ``[X, X_forecast, y]`` as values; X_forecast holds the features of every NaN-free row.
    :return original_df_dict: dict, ``self.original_df_dict`` (presumably filled by
        get_df_for_each_ticker - verify there), keyed by ticker symbol.
    :raises KeyError: if a ticker's domain is not one of "WIKI", "BCB" or "NASDAQOMX".
    """
    self.ticker_symbol_list = ticker_symbol_list
    logger.info("----------------Pre-processing data----------------")
    # Extract data frames for each ticker from the original data frame and put it in a dictionary.
    self.get_df_for_each_ticker(df)
    useful_features = ["Adj. Close", "HL_PCT", "PCT_change", "Adj. Volume"]
    for ticker_symbol, original_df in self.original_df_dict.items():
        # The part in front of the first "/" is the domain, e.g. "WIKI".
        ticker_domain = ticker_symbol.split("/")[0]
        feature_list = self.get_feature_list(ticker_domain)
        logger.debug("Feature list for {} = {}".format(
            ticker_symbol, feature_list))
        # Column names in the dataframe are prefixed with the ticker symbol.
        preprocessed_feature_list = ["{} - {}".format(ticker_symbol, feature) for feature in feature_list]
        preprocessed_df = original_df[preprocessed_feature_list].copy(
            deep=True)
        if ticker_domain in ["WIKI"]:
            # Compute high to low and open to close stock price percentage values and add them to feature list
            preprocessed_df = self.get_high_to_low_pcnt_change(
                preprocessed_df, ticker_symbol)
            preprocessed_df = self.get_open_to_close_pcnt_change(
                preprocessed_df, ticker_symbol)
            preprocessed_feature_list = ["{} - {}".format(ticker_symbol, feature) for feature in useful_features]
            preprocessed_df = preprocessed_df[preprocessed_feature_list]
        # Forecast column labels depending on the domain
        forecast_col_labels = {
            "WIKI": "{} - Adj. Close".format(ticker_symbol),
            "BCB": "{} - Value".format(ticker_symbol),
            "NASDAQOMX": "{} - Index Value".format(ticker_symbol)
        }
        # Take an explicit copy so that the column assignment below works on an independent frame
        # and not on a selection of another one.
        preprocessed_df = preprocessed_df.dropna().copy()
        preprocessed_df["label"] = preprocessed_df[
            forecast_col_labels[ticker_domain]]
        # The axis is passed by keyword: pandas no longer accepts it positionally.
        X_forecast = np.array(preprocessed_df.drop(["label"], axis=1))
        # Number of future data points to be predicted.
        forecast_out = int(
            math.ceil(self.future_prediction_pcnt * 0.01 *
                      len(preprocessed_df)))
        # Keep only the leading (100 - future_prediction_pcnt) percent of the rows for training.
        preprocessed_df = preprocessed_df.iloc[
            0:int((1 - self.future_prediction_pcnt * 0.01) *
                  len(preprocessed_df)), :].copy()
        # Each row is labelled with the value forecast_out rows later; the last rows get NaN and are dropped.
        preprocessed_df["label"] = preprocessed_df["label"].shift(
            -forecast_out)
        preprocessed_df = preprocessed_df.dropna()
        X = np.array(preprocessed_df.drop(["label"], axis=1))
        y = np.array(preprocessed_df["label"])
        # NOTE(review): the rows without a label were already removed by dropna above, so this cuts
        # off a further forecast_out rows - confirm that this is intended.
        # A slice of [:-0] would be empty, so nothing is cut when there is nothing to forecast.
        if forecast_out > 0:
            X = X[:-forecast_out]
            y = y[:-forecast_out]
        self.preprocessed_data_dict[ticker_symbol] = [X, X_forecast, y]
    return self.preprocessed_data_dict, self.original_df_dict