def main(self):
    """
    Pull stock data for the ticker symbols listed in a json file from
    Quandl, preprocess the data, build different supervised learning
    models, and predict future stock prices.

    :return: None
    """
    logger.info(
        "------------------Started Stock Price Prediction-----------------"
    )
    # Create instances of all the classes used for stock prediction
    get_data = GetData(api_key=sys.argv[1])
    # Number of dates/data points into the future for which the stock price
    # is to be predicted, expressed as a percentage of the number of
    # dates/data points for which historical data is already available.
    future_prediction_pcnt = 1
    preprocess_data = PreprocessData(
        future_prediction_pcnt=future_prediction_pcnt)
    build_models = BuildModels()
    forecast_prices = Predictions()
    # Get data from Quandl.
    df = get_data.get_stock_data(update_data=False)
    # Preprocess data
    preprocessed_data_dict, original_df_dict = preprocess_data.preprocess_data(
        df, get_data.stock_ticker_list)
    models_list = [
        "Linear Regression", "Decision Tree Regressor",
        "Random Forest Regressor"
    ]
    # Build models
    models_dict, model_scores_dict = build_models.build_models(
        models_list, preprocessed_data_dict, force_build=False)
    # Predict future stock prices
    forecast_df_dict = forecast_prices.make_predictions(
        models_dict, preprocessed_data_dict, original_df_dict)
    self.plot_forecast(forecast_df_dict, original_df_dict,
                       future_prediction_pcnt)
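# A minimal sketch of the expected entry point, assuming the driver class is
# called StockPricePrediction (the class name is an assumption; only main()
# appears in the source). The Quandl API key is read from sys.argv[1].
if __name__ == "__main__":
    # Usage: python stock_price_prediction.py <quandl_api_key>
    StockPricePrediction().main()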
def make_prediction(self, model_name, model_for_each_ticker_dict,
                    preprocessed_data_dict, original_df_dict):
    """
    Make future stock price predictions.

    :param model_name: str, name of the model.
    :param model_for_each_ticker_dict: dict, dictionary with ticker symbols as keys and built model objects as values.
    :param preprocessed_data_dict: dict, dictionary with ticker symbols as keys and preprocessed data as values.
    :param original_df_dict: dict, dictionary with ticker symbols as keys and original dataframes as values.
    :return forecast_df_dict: dict, dictionary with ticker symbols as keys and dataframes containing the forecast data as values.
    """
    logger.info(
        "----------------Predicting future prices using the {} model----------------"
        .format(model_name))
    forecast_df_dict = {}
    for ticker_symbol, model in model_for_each_ticker_dict.items():
        # Model keys use "_" in place of "/" (file-name safe); convert back.
        ticker_symbol = ticker_symbol.replace("_", "/")
        logger.info(
            "Predicting future prices for {}".format(ticker_symbol))
        df_copy = original_df_dict[ticker_symbol].copy(deep=True)
        df_copy.dropna(inplace=True)
        # X_forecast holds the feature rows that have no labels yet.
        X_forecast = preprocessed_data_dict[ticker_symbol][1]
        logger.debug("len(X_forecast) = {}".format(len(X_forecast)))
        forecast_set = model.predict(X_forecast)
        df_copy["{} - Forecast".format(ticker_symbol)] = forecast_set
        forecast_df_dict[ticker_symbol] = df_copy
    return forecast_df_dict
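# A self-contained sketch of the predict step above, using a plain sklearn
# LinearRegression on synthetic data (all names and values here are
# illustrative, not from the project):
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))          # historical features
y = X @ np.array([0.5, -0.2, 0.1]) + rng.normal(scale=0.01, size=100)
model = LinearRegression().fit(X, y)   # stands in for a model loaded from pickle

X_forecast = rng.normal(size=(5, 3))   # most recent, unlabeled feature rows
forecast_df = pd.DataFrame({"WIKI/GOOGL - Forecast": model.predict(X_forecast)})
print(forecast_df)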
def get_built_models(self):
    """
    Get the models if they have already been built.

    :return built_models_dict: dict, dictionary containing model names as keys and built model objects as values.
    """
    if self.built_models_dict:
        return self.built_models_dict
    else:
        logger.info("No models found. Run build_models first and then call this method.")
        sys.exit(1)
def load_from_pickle_file(self, model_name, ticker_symbol, obj_name):
    """
    Load the built model from a pickle file.

    :param model_name: str, name of the model.
    :param ticker_symbol: str, ticker symbol.
    :param obj_name: str, name of the built model object.
    :return loaded_obj: object, model object.
    """
    logger.info("Loading {} model for {} from pickle file".format(model_name, ticker_symbol))
    # Use a context manager so the file handle is closed even on errors.
    with open("{}/{}_{}_{}.pickle".format(
            self.saved_models_dir, model_name, ticker_symbol, obj_name), "rb") as pickle_in:
        loaded_obj = pickle.load(pickle_in)
    return loaded_obj
def save_to_pickle_file(self, model_name, ticker_symbol, obj_to_be_saved, obj_name):
    """
    Save the built model to a pickle file.

    :param model_name: str, name of the model.
    :param ticker_symbol: str, ticker symbol.
    :param obj_to_be_saved: object, model object.
    :param obj_name: str, name of the built model object.
    :return: None
    """
    logger.info("Saving {} model for {} to pickle file".format(model_name, ticker_symbol))
    # Use a context manager so the file is flushed and closed deterministically.
    with open("{}/{}_{}_{}.pickle".format(
            self.saved_models_dir, model_name, ticker_symbol, obj_name), "wb") as pickle_out:
        pickle.dump(obj_to_be_saved, pickle_out)
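# A short round-trip sketch of the two pickle helpers above, using a
# throwaway model and a temporary directory (paths and names are
# illustrative only):
import pickle
import tempfile
from sklearn.linear_model import LinearRegression

with tempfile.TemporaryDirectory() as saved_models_dir:
    path = "{}/{}_{}_{}.pickle".format(
        saved_models_dir, "Linear Regression", "WIKI_GOOGL", "model")
    with open(path, "wb") as f:
        pickle.dump(LinearRegression(), f)   # save_to_pickle_file equivalent
    with open(path, "rb") as f:
        model = pickle.load(f)               # load_from_pickle_file equivalent
    print(type(model).__name__)              # -> LinearRegression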
def plot_forecast(self, forecast_df_dict, original_df_dict, future_prediction_pcnt=1):
    """
    Plot the actual data and the forecast data in the dataframe.

    :param forecast_df_dict: dict, dictionary containing model names as keys and dictionaries containing
        ticker symbols as keys and preprocessed dataframes containing forecast data as values.
    :param original_df_dict: dict, dictionary containing ticker symbols as keys and original dataframes as values.
    :param future_prediction_pcnt: float, number of dates/data points into the future for which the stock price
        is to be predicted, expressed as a percentage of the number of dates/data points for which historical
        data is already available.
    :return: None
    """
    for model_name, df_dict in forecast_df_dict.items():
        logger.info(
            "----------------Plotting stock prices for {} model----------------"
            .format(model_name))
        for ticker_symbol, df in df_dict.items():
            ticker_domain = ticker_symbol.split("/")[0]
            original_df = original_df_dict[ticker_symbol].dropna().reset_index()
            df = df.reset_index()
            # Forecast column labels depending on the Quandl database (domain).
            forecast_col_labels = {
                "WIKI": "{} - Adj. Close".format(ticker_symbol),
                "BCB": "{} - Value".format(ticker_symbol),
                "NASDAQOMX": "{} - Index Value".format(ticker_symbol)
            }
            logger.info(
                "----------------Plotting stock prices for {}".format(
                    ticker_symbol))
            # Number of future data points to be predicted.
            forecast_out = int(
                math.ceil(future_prediction_pcnt * 0.01 * len(df)))
            # Shift the dates so the actual series lines up with the forecast horizon.
            original_df["Date"] = original_df["Date"].shift(-forecast_out)
            df["{} - Forecast".format(ticker_symbol)].plot(color='b')
            original_df[forecast_col_labels[ticker_domain]].plot(color='g')
            plt.legend(loc="best")
            plt.xlabel("Date")
            plt.ylabel("Price")
            plt.title("Forecast for {} model for {}".format(
                model_name, ticker_symbol))
            plt.savefig("{}/{}_{}.png".format(
                self.stock_price_plots_dir, model_name,
                ticker_symbol.replace("/", "_")))
            plt.clf()
            plt.close()
def build_model(self, model_name, preprocessed_data_dict, force_build):
    """
    Build a machine learning model using a supervised learning regression algorithm.

    :param model_name: str, name of the model to be built.
    :param preprocessed_data_dict: dict, dictionary with ticker symbols as keys and preprocessed data as values.
    :param force_build: bool, if True, forces the function to build the model even if a previously
        built and saved model is available.
    :return model_dict: dict, dictionary containing ticker symbols as keys and the built model objects as values.
    :return model_scores_dict: dict, dictionary containing ticker symbols as keys and the model test scores as values.
    """
    logger.info("----------------Building model using {}----------------".format(model_name))
    model_dict = {}
    model_scores_dict = {}
    curr_dir = os.getcwd()
    for ticker_symbol, preprocessed_data in preprocessed_data_dict.items():
        [X, X_forecast, y] = preprocessed_data
        # Create a cv iterator for splitting train and test data using TimeSeriesSplit
        tscv = TimeSeriesSplit(n_splits=5)
        ticker_symbol = ticker_symbol.replace("/", "_")
        if force_build or not os.path.exists(
                "{}/{}_{}_model.pickle".format(self.saved_models_path, model_name, ticker_symbol)):
            # Optimize the hyperparameters based on the cross validation scores
            optimized_model = self.optimize_hyperparameters(model_name, tscv)
            # Scale the features before fitting the regressor.
            model = make_pipeline(StandardScaler(), optimized_model)
            X_train, X_test, y_train, y_test = self.get_train_and_test_data(X, y, tscv)
            model.fit(X_train, y_train)
            self.save_to_pickle_file(model_name, ticker_symbol, model, "model")
        else:
            model = self.load_from_pickle_file(model_name, ticker_symbol, "model")
            X_train, X_test, y_train, y_test = self.get_train_and_test_data(X, y, tscv)
        # Score on the held-out test split
        confidence_score = model.score(X_test, y_test)
        # Plot learning curves
        title = "{}_{}_Learning Curves".format(model_name, ticker_symbol)
        save_file_path = "{}/learning_curve_plots/{}_{}.png".format(curr_dir, model_name, ticker_symbol)
        self.plot_learning_curve(model, title, X, y, save_file_path, cv=tscv)
        # Cross validation
        cv_scores = cross_validate(model, X=X, y=y, cv=tscv)
        logger.info("Test score for {} = {}".format(ticker_symbol, confidence_score))
        logger.debug("Cross validation scores for {} = {}".format(ticker_symbol, cv_scores["test_score"]))
        logger.info("Cross validation score for {} = {} +/- {}".format(
            ticker_symbol, cv_scores["test_score"].mean(), cv_scores["test_score"].std() * 2))
        logger.debug("Cross validation scoring time = {}s".format(cv_scores["score_time"].sum()))
        model_dict[ticker_symbol] = model
        model_scores_dict[ticker_symbol] = confidence_score
    return model_dict, model_scores_dict
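# A compact, self-contained sketch of the core pattern in build_model: a
# scaled regressor evaluated with a TimeSeriesSplit cross-validation iterator
# (the data and model choice are illustrative, not the project's tuned pipeline):
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 4))
y = X @ np.array([1.0, -0.5, 0.25, 0.0]) + rng.normal(scale=0.1, size=200)

tscv = TimeSeriesSplit(n_splits=5)   # splits preserve the temporal order of samples
model = make_pipeline(StandardScaler(), LinearRegression())
cv_scores = cross_validate(model, X=X, y=y, cv=tscv)
print(cv_scores["test_score"].mean(), "+/-", cv_scores["test_score"].std() * 2)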
def get_stock_data(self, update_data=False):
    """
    Get stock data for the ticker symbols in the json file (stockdata/stockdatainfo.json) from Quandl.

    :param update_data: bool, tells the function whether to pull fresh data on every call.
    :return df: dataframe, stock data for all tickers.
    """
    logger.info(
        "----------------Getting stock data from Quandl----------------")
    logger.info("Stock ticker list = {}".format(self._stock_ticker_list))
    # Pull data if stockdata/stockdata.csv does not exist or if update_data is True.
    if update_data or not os.path.exists("{}/{}".format(
            os.getcwd(), self.stock_data_path)):
        df = quandl.get(self._stock_ticker_list)
        logger.info("Writing stock data to {}".format(
            self.stock_data_path))
        # Write the dataframe to a csv file
        df.to_csv("{}".format(self.stock_data_path))
    logger.info("Reading stock data from {}".format(self.stock_data_path))
    # Read the data from the csv file
    df = pd.read_csv("{}".format(self.stock_data_path))
    logger.debug("df.shape = {}".format(df.shape))
    return df
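# A minimal sketch of the underlying quandl call, assuming a valid API key
# (the ticker list is illustrative; quandl.get accepts a list of dataset codes
# and returns a single dataframe with one column group per code):
import quandl

quandl.ApiConfig.api_key = "YOUR_API_KEY"  # placeholder, not a real key
df = quandl.get(["WIKI/GOOGL", "NASDAQOMX/NDX"])
print(df.shape)
print(df.columns[:5])  # e.g. "WIKI/GOOGL - Adj. Close", ...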
def preprocess_data(self, df, ticker_symbol_list):
    """
    Preprocess stock data.

    :param df: dataframe, original dataframe
    :param ticker_symbol_list: list, list of ticker symbols
    :return preprocessed_data_dict: dict, dictionary with ticker symbols as keys, preprocessed stock data
        dataframes as values
    :return original_df_dict: dict, dictionary with ticker symbols as keys, original stock data dataframes
        as values
    """
    self.ticker_symbol_list = ticker_symbol_list
    logger.info("----------------Pre-processing data----------------")
    # Extract a dataframe for each ticker from the original dataframe and put it in a dictionary.
    self.get_df_for_each_ticker(df)
    useful_features = ["Adj. Close", "HL_PCT", "PCT_change", "Adj. Volume"]
    for ticker_symbol, original_df in self.original_df_dict.items():
        ticker_domain = ticker_symbol.split("/")[0]
        feature_list = self.get_feature_list(ticker_domain)
        logger.debug("Feature list for {} = {}".format(
            ticker_symbol, feature_list))
        preprocessed_feature_list = [
            "{} - {}".format(ticker_symbol, feature) for feature in feature_list
        ]
        preprocessed_df = original_df[preprocessed_feature_list].copy(deep=True)
        if ticker_domain in ["WIKI"]:
            # Compute high-to-low and open-to-close stock price percentage changes and add them as features.
            preprocessed_df = self.get_high_to_low_pcnt_change(
                preprocessed_df, ticker_symbol)
            preprocessed_df = self.get_open_to_close_pcnt_change(
                preprocessed_df, ticker_symbol)
            preprocessed_feature_list = [
                "{} - {}".format(ticker_symbol, feature) for feature in useful_features
            ]
            preprocessed_df = preprocessed_df[preprocessed_feature_list]
        # Forecast column labels depending on the domain
        forecast_col_labels = {
            "WIKI": "{} - Adj. Close".format(ticker_symbol),
            "BCB": "{} - Value".format(ticker_symbol),
            "NASDAQOMX": "{} - Index Value".format(ticker_symbol)
        }
        preprocessed_df.dropna(inplace=True)
        preprocessed_df["label"] = preprocessed_df[forecast_col_labels[ticker_domain]]
        X_forecast = np.array(preprocessed_df.drop(["label"], axis=1))
        # Number of future data points to be predicted.
        forecast_out = int(
            math.ceil(self.future_prediction_pcnt * 0.01 * len(preprocessed_df)))
        # Drop the most recent data points; they have no future label to learn from.
        preprocessed_df = preprocessed_df.iloc[
            0:int((1 - self.future_prediction_pcnt * 0.01) * len(preprocessed_df)), :]
        # Shift the label column up so each row's label is the price forecast_out points ahead.
        preprocessed_df["label"] = preprocessed_df["label"].shift(-forecast_out)
        preprocessed_df.dropna(inplace=True)
        X = np.array(preprocessed_df.drop(["label"], axis=1))
        X = X[:-forecast_out]
        y = np.array(preprocessed_df["label"])
        y = y[:-forecast_out]
        self.preprocessed_data_dict[ticker_symbol] = [X, X_forecast, y]
    return self.preprocessed_data_dict, self.original_df_dict
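# A tiny worked example of the label-shift trick above: with 10 data points
# and future_prediction_pcnt = 10, forecast_out = ceil(0.1 * 10) = 1, so each
# row's label becomes the price one step in the future (values illustrative):
import math
import pandas as pd

future_prediction_pcnt = 10
prices = pd.DataFrame({"Adj. Close": [10.0, 11.0, 12.0, 13.0, 14.0,
                                      15.0, 16.0, 17.0, 18.0, 19.0]})
forecast_out = int(math.ceil(future_prediction_pcnt * 0.01 * len(prices)))
prices["label"] = prices["Adj. Close"].shift(-forecast_out)
print(prices)  # the last row's label is NaN and is dropped before training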
def plot_learning_curve(self, estimator, title, X, y, save_file_path,
                        ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    save_file_path : string
        Path of the file the plot is saved to.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds,
          - an object to be used as a cross-validation generator,
          - an iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as
        a fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually has to be
        big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    logger.info("Plotting {}".format(title))
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shade one standard deviation around the mean train/test scores.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.savefig("{}".format(save_file_path))
    plt.close()
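# A small usage sketch for plot_learning_curve, assuming it is a method of the
# BuildModels class instantiated with no arguments, as in main() above (the
# synthetic data and output file name are illustrative):
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

rng = np.random.default_rng(7)
X = rng.normal(size=(300, 4))
y = X @ np.array([0.8, -0.3, 0.1, 0.05]) + rng.normal(scale=0.1, size=300)

build_models = BuildModels()
build_models.plot_learning_curve(
    LinearRegression(), "LinearRegression_DEMO_Learning Curves",
    X, y, "learning_curve_demo.png", cv=TimeSeriesSplit(n_splits=5))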