def train_classifier(self, X_train, X_test, y_train, y_test):
    """Fit every candidate regressor and persist the best-scoring one.

    Each candidate is fitted on the training split and evaluated on the
    test split through its own ``score`` method. A candidate becomes the
    current best only when its score is strictly greater than the best
    score seen so far, which starts at ``self.lowest_accuracy_wanted``.

    When a winner exists it is reported on stdout and serialized with
    ``joblib.dump`` to
    ``models/files/predict_<self.prediction_column_name>.pkl`` (resolved
    through ``relative_path``). When no candidate beats the threshold,
    nothing is printed and no file is written.

    Args:
        X_train: Feature data passed to ``fit``.
        X_test: Feature data passed to ``score``.
        y_train: Target values passed to ``fit``.
        y_test: Target values passed to ``score``.

    Returns:
        None.
    """
    candidates = [
        svm.SVR(),
        linear_model.SGDRegressor(),
        linear_model.BayesianRidge(),
        linear_model.LassoLars(),
        linear_model.ARDRegression(),
        linear_model.PassiveAggressiveRegressor(),
        linear_model.TheilSenRegressor(),
        linear_model.LinearRegression(),
    ]

    # NOTE(review): "accuracy" is whatever ``score`` returns; for
    # scikit-learn regressors that is presumably R^2 -- confirm.
    best_accuracy = self.lowest_accuracy_wanted
    best_classifier = None

    for classifier in candidates:
        classifier.fit(X_train, y_train)
        prediction_score = classifier.score(X_test, y_test)
        if prediction_score > best_accuracy:
            best_accuracy = prediction_score
            best_classifier = classifier

    if best_classifier is None:
        # No candidate exceeded the minimum wanted score.
        return

    # Report and persist the winner. Using the loop variable here would
    # always refer to the last candidate in the list, not the best one.
    print(
        f"{best_classifier} performed the best with an accuracy of: {best_accuracy}"
    )
    joblib.dump(
        best_classifier,
        relative_path(
            f'models/files/predict_{self.prediction_column_name}.pkl'))
def __get_reddit_df(path):
    """Flatten the nested JSON file at *path* into a DataFrame.

    The file is opened via ``relative_path(path)`` and must contain a
    top-level ``'companies'`` mapping of company -> date -> row mapping.
    Each innermost mapping is copied and extended with ``'company'`` and
    ``'date'`` entries taken from the keys it was nested under.

    Args:
        path: Location of the JSON file, handed to ``relative_path``.

    Returns:
        A ``pandas.DataFrame`` with one row per (company, date) pair.
    """
    with open(relative_path(path)) as handle:
        payload = json.load(handle)

    records = []
    for company, rows_by_date in payload['companies'].items():
        for date, values in rows_by_date.items():
            # Copy so the loaded structure itself is not modified.
            record = values.copy()
            record['company'] = company
            record['date'] = date
            records.append(record)

    return pd.DataFrame(records)
def __get_companies_related_reddit_submissions(companies):
    """Fetch Reddit submissions for each company and write them to disk.

    For every entry in *companies*, the result of
    ``RedditInterface.get_all_submissions_for`` is passed to
    ``DataFileInterface.write_to_file`` with the target
    ``data/files/stock_submissions.csv`` (resolved via ``relative_path``).

    NOTE(review): every company is written to the same file; whether
    successive writes append or overwrite depends on ``write_to_file``,
    which is not visible here -- confirm.

    Args:
        companies: Iterable of company identifiers.
    """
    for company in companies:
        submissions = RedditInterface.get_all_submissions_for(company)
        output_path = relative_path('data/files/stock_submissions.csv')
        DataFileInterface.write_to_file(output_path, submissions)
def __get_historical_data_for_all_tech_companies(companies):
    """Fetch historical data for each company and write it to disk.

    For every entry in *companies*, a progress line is printed, then the
    result of ``IEXCloudInterface.get_historical_data(company, '1m')`` is
    passed to ``DataFileInterface.write_to_file`` with the target
    ``data/files/stock_data.csv`` (resolved via ``relative_path``).

    NOTE(review): ``'1m'`` is presumably a one-month range -- confirm
    against ``IEXCloudInterface``.

    Args:
        companies: Iterable of company identifiers.
    """
    for company in companies:
        print(f"getting historical data for company: {company}")
        history = IEXCloudInterface.get_historical_data(company, '1m')
        output_path = relative_path('data/files/stock_data.csv')
        DataFileInterface.write_to_file(output_path, history)
def __get_stats_for_all_tech_companies(companies):
    """Fetch stat data for each company and write it to disk.

    For every entry in *companies*, a progress line is printed, then the
    result of ``IEXCloudInterface.get_stats(company)`` is passed to
    ``DataFileInterface.write_to_file`` with the target
    ``data/files/stock_stats.csv`` (resolved via ``relative_path``).

    Args:
        companies: Iterable of company identifiers.
    """
    for company in companies:
        print(f"getting stat data for company: {company}")
        stats = IEXCloudInterface.get_stats(company)
        output_path = relative_path('data/files/stock_stats.csv')
        DataFileInterface.write_to_file(output_path, stats)
def __get_news_data_for_all_tech_companies(companies):
    """Fetch news for each company and write it to disk.

    For every entry in *companies*, the result of
    ``IEXCloudInterface.get_news(company, 'last-week')`` is passed to
    ``DataFileInterface.write_to_file`` with the target
    ``data/files/news.csv`` (resolved via ``relative_path``).

    Args:
        companies: Iterable of company identifiers.
    """
    for company in companies:
        news_items = IEXCloudInterface.get_news(company, 'last-week')
        output_path = relative_path('data/files/news.csv')
        DataFileInterface.write_to_file(output_path, news_items)
row['company'] = company row['date'] = date dict_collection.append(row) return pd.DataFrame(dict_collection) if __name__ == '__main__': if '--get-data' in sys.argv: companies = __get_companies() __retrieve_data(companies) if '--analyze-text' in sys.argv: reddit_submissions = pd.read_csv(relative_path('data/files/stock_submissions.csv')) ssm = SubmissionSentimentModel(reddit_submissions) ssm.analyze(relative_path('data/files/reddit_submissions_analyzed.json')) if '--train' in sys.argv: prediction_column_names = ['high', 'low', 'close'] stock_stats_df = pd.read_csv(relative_path('data/files/stock_stats.csv')) spm = StockPriceModel() for prediction_column_name in prediction_column_names: stock_price_movement_df = pd.read_csv(relative_path('data/files/stock_data.csv')) stock_price_movement_df = stock_price_movement_df[['company', 'date', 'open', prediction_column_name]].copy() reddit_df = __get_reddit_df('data/files/reddit_submissions_analyzed.json')