def make_prediction(self, model_name, model_for_each_ticker_dict,
                        preprocessed_data_dict, original_df_dict):
        """
        Forecast prices for every ticker using its already-built model.

        :param model_name: str, name of the model (only used in the log banner).
        :param model_for_each_ticker_dict: dict, ticker key (with "_" where the
            data dicts use "/") -> object exposing ``predict``.
        :param preprocessed_data_dict: dict, keyed by "/"-style ticker symbol; the
            item at index 1 of each value is what gets passed to ``predict``.
        :param original_df_dict: dict, keyed by "/"-style ticker symbol, holding
            the source dataframes.
        :return: dict, "/"-style ticker symbol -> NaN-free deep copy of the source
            dataframe with an additional "<ticker> - Forecast" column.
        """
        banner = "----------------Predicting future prices using the {} model----------------"
        logger.info(banner.format(model_name))
        predictions_by_ticker = {}
        for stored_key in model_for_each_ticker_dict:
            fitted_model = model_for_each_ticker_dict[stored_key]
            # Keys carry "_" where the data dicts use "/"; convert back before any lookup.
            symbol = stored_key.replace("_", "/")
            logger.info("Predicting future prices for {}".format(symbol))
            # Work on a private, NaN-free copy so the caller's dataframe stays untouched.
            result_df = original_df_dict[symbol].copy(deep=True)
            result_df.dropna(inplace=True)
            features = preprocessed_data_dict[symbol][1]
            logger.debug("len(X_forecast) = {}".format(len(features)))
            forecast_column = "{} - Forecast".format(symbol)
            result_df[forecast_column] = fitted_model.predict(features)
            predictions_by_ticker[symbol] = result_df
        return predictions_by_ticker
 def stock_ticker_list(self):
     """
     Return the ticker symbols held by this instance.

     :return: list, the object stored in ``self._stock_ticker_list``.
     """
     logger.debug("Getting stock ticker list")
     tickers = self._stock_ticker_list
     return tickers
Beispiel #3
0
	def optimize_hyperparameters(self, model_name, cv_iterator):
		"""
		Wrap the named model in a grid search over its configured parameter grid.

		:param model_name: str, key into ``self.parameters_dict`` and ``self.models_dict``.
		:param cv_iterator: iterator, cross-validation splitter handed to ``GridSearchCV`` as ``cv``.
		:return: GridSearchCV object; it is only constructed here, not fitted.
		"""
		logger.debug("Optimizing hyper-parameters")
		param_grid = self.parameters_dict[model_name]
		base_estimator = self.models_dict[model_name]
		# Fitting is left to the caller; this method only assembles the search.
		return GridSearchCV(estimator=base_estimator, param_grid=param_grid, cv=cv_iterator)
Beispiel #4
0
	def build_model(self, model_name, preprocessed_data_dict, force_build):
		"""
		Fit (or reload) one model per ticker, then evaluate it.

		For every ticker a pipeline of ``StandardScaler`` and the grid search returned by
		``optimize_hyperparameters`` is fitted on the training part of the split returned by
		``get_train_and_test_data`` and pickled. When a pickled model already exists and
		``force_build`` is falsy, that pickle is loaded instead of fitting. Either way the model
		is scored on the test part of the split, its learning curve is plotted and it is
		cross-validated with a 5-split ``TimeSeriesSplit``.
		:param model_name: str, name of the model to be built.
		:param preprocessed_data_dict: dict, ticker symbol -> [X, X_forecast, y].
		:param force_build: bool, if True the model is rebuilt even when a saved one is available.
		:return model_dict: dict, ticker symbol (with "/" replaced by "_") -> model object.
		:return model_scores_dict: dict, ticker symbol (with "/" replaced by "_") -> score on the test split.
		"""
		logger.info("----------------Building model using {}----------------".format(model_name))
		fitted_models, held_out_scores = {}, {}
		working_dir = os.getcwd()
		for raw_symbol, (X, _, y) in preprocessed_data_dict.items():
			splitter = TimeSeriesSplit(n_splits=5)
			# "/" cannot be part of a file name, so the saved artefacts use "_" instead
			symbol = raw_symbol.replace("/", "_")
			reuse_saved = not force_build and os.path.exists(
				"{}/{}_{}_model.pickle".format(self.saved_models_path, model_name, symbol))
			if reuse_saved:
				estimator = self.load_from_pickle_file(model_name, symbol, "model")
				_, X_test, _, y_test = self.get_train_and_test_data(X, y, splitter)
			else:
				# Grid search over the hyper-parameters, fed with standardised features
				search = self.optimize_hyperparameters(model_name, splitter)
				estimator = make_pipeline(StandardScaler(), search)
				X_train, X_test, y_train, y_test = self.get_train_and_test_data(X, y, splitter)
				estimator.fit(X_train, y_train)
				self.save_to_pickle_file(model_name, symbol, estimator, "model")
			# Score on the test part of the split (logged below as "Training score")
			held_out_score = estimator.score(X_test, y_test)
			# Learning curve plot
			plot_title = "{}_{}_Learning Curves".format(model_name, symbol)
			plot_path = "{}/learning_curve_plots/{}_{}.png".format(working_dir, model_name, symbol)
			self.plot_learning_curve(estimator, plot_title, X, y, plot_path, cv=splitter)
			# Cross validation over the full data set
			cv_scores = cross_validate(estimator, X=X, y=y, cv=splitter)
			logger.info("Training score for {} = {}".format(symbol, held_out_score))
			fold_scores = cv_scores["test_score"]
			logger.debug("Cross validation scores for {} = {}".format(symbol, fold_scores))
			logger.info("Cross validation score for {} = {} +/- {}".format(
				symbol, fold_scores.mean(), fold_scores.std() * 2))
			logger.debug("Cross validation scoring time = {}s".format(cv_scores["score_time"].sum()))
			fitted_models[symbol] = estimator
			held_out_scores[symbol] = held_out_score
		return fitted_models, held_out_scores
Beispiel #5
0
	def get_train_and_test_data(self, X, y, tscv):
		"""
		Get the train and test data sets of the last split produced by ``tscv``.

		Only the index arrays of the final split are kept while iterating, so ``X`` and ``y``
		are sliced exactly once instead of once per split.
		:param X: ndarray, indexed with the index arrays yielded by ``tscv.split(X)``.
		:param y: array, indexed with the same index arrays as ``X``.
		:param tscv: iterator, TimeSeriesSplit iterator
		:return X_train, X_test, y_train, y_test: arrays, train and test data of the last split.
		:raises IndexError: if ``tscv.split(X)`` yields no split at all.
		"""
		last_split = None
		# Use the last split as it will have the most training data which is good for time
		# series data; earlier splits are skipped without copying any rows.
		for last_split in tscv.split(X):
			pass
		if last_split is None:
			raise IndexError("tscv.split(X) produced no train/test split")
		train_indices, test_indices = last_split
		X_train, X_test = X[train_indices], X[test_indices]
		y_train, y_test = y[train_indices], y[test_indices]
		logger.debug("Last train_data size = {}".format(len(X_train) * 100 / len(X)))
		return X_train, X_test, y_train, y_test
 def get_stock_data(self, update_data=False):
     """
     Get stock data for the configured ticker symbols, pulling it from quandl first if needed.

     The data is fetched with ``quandl.get`` and written to ``self.stock_data_path`` when
     ``update_data`` is True or that file does not exist yet. In every case the returned
     dataframe is read back from the csv file.
     :param update_data: bool, if True the data is pulled even when the csv file already exists.
     :return df: Dataframe, contents of the csv file at ``self.stock_data_path``.
     """
     logger.info(
         "----------------Getting stock data from Quandl----------------")
     logger.info("Stock ticker list = {}".format(self._stock_ticker_list))
     data_path = self.stock_data_path
     # Check the very same path that is written and read below. A relative path is resolved
     # against the current working directory anyway, whereas prefixing os.getcwd() by hand
     # makes the check miss an existing file whenever the path is absolute.
     if update_data or not os.path.exists(data_path):
         downloaded_df = quandl.get(self._stock_ticker_list)
         logger.info("Writing stock data to {}".format(data_path))
         # Write the dataframe to a csv file
         downloaded_df.to_csv(data_path)
     logger.info("Reading stock data from {}".format(data_path))
     # Always return what is on disk, whether it was just downloaded or already present
     df = pd.read_csv(data_path)
     logger.debug("df.shape = {}".format(df.shape))
     return df
Beispiel #7
0
    def preprocess_data(self, df, ticker_symbol_list):
        """
        Preprocess stock data into per-ticker training and forecasting arrays.

        For each ticker in ``self.original_df_dict`` the feature columns are selected (for the
        "WIKI" domain the high/low and open/close percentage changes are added and the set is
        narrowed to the useful features), rows with missing values are dropped, and the
        domain's forecast column becomes the label, shifted back by the number of points to
        predict.
        :param df: dataframe, original dataframe
        :param ticker_symbol_list: list, list of ticker symbols (stored on the instance)
        :return preprocessed_data_dict: dict, ticker symbol -> [X, X_forecast, y]; X_forecast holds the features
        of every NaN-free row, X and y are the training features and the shifted labels
        :return original_df_dict: dict, dictionary with ticker symbols as keys, original stock data dataframes as values
        :raises KeyError: if a ticker's domain has no forecast column configured.
        """
        self.ticker_symbol_list = ticker_symbol_list
        logger.info("----------------Pre-processing data----------------")
        # Extract data frames for each ticker from the original data frame (expected to fill
        # self.original_df_dict, which is iterated below).
        self.get_df_for_each_ticker(df)
        useful_features = ["Adj. Close", "HL_PCT", "PCT_change", "Adj. Volume"]
        # Suffix of the column holding the value to forecast, depending on the domain
        forecast_col_suffixes = {
            "WIKI": "Adj. Close",
            "BCB": "Value",
            "NASDAQOMX": "Index Value",
        }
        for ticker_symbol, original_df in self.original_df_dict.items():
            ticker_domain = ticker_symbol.split("/")[0]
            feature_list = self.get_feature_list(ticker_domain)
            logger.debug("Feature list for {} = {}".format(
                ticker_symbol, feature_list))
            feature_columns = [
                "{} - {}".format(ticker_symbol, feature)
                for feature in feature_list
            ]
            preprocessed_df = original_df[feature_columns].copy(deep=True)
            if ticker_domain == "WIKI":
                # Compute high to low and open to close stock price percentage values and add them to feature list
                preprocessed_df = self.get_high_to_low_pcnt_change(
                    preprocessed_df, ticker_symbol)
                preprocessed_df = self.get_open_to_close_pcnt_change(
                    preprocessed_df, ticker_symbol)
                useful_columns = [
                    "{} - {}".format(ticker_symbol, feature)
                    for feature in useful_features
                ]
                # Copy so the in-place edits below act on an independent frame, not on a
                # selection of another one (avoids SettingWithCopy problems).
                preprocessed_df = preprocessed_df[useful_columns].copy()
            label_column = "{} - {}".format(
                ticker_symbol, forecast_col_suffixes[ticker_domain])
            preprocessed_df.dropna(inplace=True)
            preprocessed_df["label"] = preprocessed_df[label_column]
            # ``columns=`` instead of a positional axis: pandas 2.x no longer accepts drop(labels, 1).
            X_forecast = np.array(preprocessed_df.drop(columns=["label"]))
            row_count = len(preprocessed_df)
            # Number of future data points to be predicted.
            forecast_out = int(
                math.ceil(self.future_prediction_pcnt * 0.01 * row_count))
            training_rows = int(
                (1 - self.future_prediction_pcnt * 0.01) * row_count)
            preprocessed_df = preprocessed_df.iloc[0:training_rows, :].copy()
            # Each row's label becomes the value observed forecast_out rows later; the rows
            # left without a label are removed by the dropna.
            preprocessed_df["label"] = preprocessed_df["label"].shift(
                -forecast_out)
            preprocessed_df.dropna(inplace=True)
            X = np.array(preprocessed_df.drop(columns=["label"]))
            y = np.array(preprocessed_df["label"])
            # NOTE(review): the shift + dropna above already removed the last forecast_out
            # rows; this trims forecast_out more. Kept to preserve the existing training-set
            # size - confirm whether it is intended.
            # Guarded because a slice ending at -0 would be empty and wipe the training set.
            if forecast_out:
                X = X[:-forecast_out]
                y = y[:-forecast_out]
            self.preprocessed_data_dict[ticker_symbol] = [X, X_forecast, y]
        return self.preprocessed_data_dict, self.original_df_dict