def backtest(strategy, start="2014-1-1", end="2015-11-02", log=False, correct=True):
    """Run *strategy* over S&P price data and report its percentage return.

    :param strategy: strategy object exposing ``run``, ``value`` and a ``portfolio``
    :param start: starting date in %Y-%m-%d
    :param end: ending date in %Y-%m-%d
    :param log: flag to turn on logging (also enables the strategy's own logging)
    :param correct: forwarded to ``strategy.value``
    :return: return relative to first stock purchase, as a percentage
    :raises ValueError: if no price data is available for the date range
    """
    # df = get_data(strategy.tickers, start, end)
    prices = load_s_and_p_data(start=start, end=end, only_close=False)
    if prices.empty:
        raise ValueError("No stock data found")

    if log:
        print(prices.describe())
        strategy._log = True

    balance_before = strategy.portfolio.balance
    strategy.run(prices)
    balance_after = strategy.value(correct=correct)

    if log:
        for entry in strategy.portfolio.transactions:
            print(entry)
        print(balance_before, balance_after)

    return (balance_after - balance_before) * 100. / balance_before
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from TimeSeriesEstimator import TimeSeriesRegressor, time_series_split, cascade_cv
from utils import load_s_and_p_data, cache

__author__ = 'Mark'

# Module-level data load: closing prices for S&P constituents since 2009.
X = load_s_and_p_data(start="2009-1-1", only_close=True)
names = list(X.columns.values)  # ticker symbols (column labels)


def _column_changes(arr, start=0, end=-1):
    """Per-column change ``arr[end] - arr[start]`` as a 1-D array.

    Vectorized replacement for the original element-wise comprehension;
    hoisted to module level so it is not redefined on every fold.
    """
    return arr[end, :] - arr[start, :]


@cache('data/data_dependence_cache.pkl')
def get_data_dependece(X, data_sizes, folds=20, test_size=30, n_prev=2, log=True):
    """Measure how forecast-ranked stock picks perform vs. training-set size.

    For each training size, runs ``folds`` cascaded train/test splits, fits a
    linear time-series regressor, forecasts over the test window, ranks stocks
    by forecasted change, and records each ranked stock's actual test-window
    change relative to the mean change.

    :param X: DataFrame of prices (rows = time, columns = stocks)
    :param data_sizes: iterable of training-window lengths to evaluate
    :param folds: number of cross-validation folds per data size
    :param test_size: length of each test window
    :param n_prev: number of lagged observations fed to the regressor
    :param log: print progress per fold when True
    :return: array of shape (len(data_sizes), folds, n_stocks) of mean-relative
             actual changes, ordered by forecasted rank (best first)
    """
    bests = np.empty((len(data_sizes), folds, X.shape[1]))
    for i, data_size in enumerate(data_sizes):
        pairs = cascade_cv(len(X), folds, data_size=data_size, test_size=test_size, number=True)
        for j, pair in enumerate(pairs):
            if log:
                print('data size: {} trial {} '.format(data_size, j))
            X_train = np.array(X.iloc[pair[0], :])
            X_test = np.array(X.iloc[pair[1], :])

            tsr = TimeSeriesRegressor(LinearRegression(), n_prev=n_prev)
            tsr.fit(X_train)
            fc = tsr.forecast(X_train, len(X_test))

            # Stocks ranked best-first by forecasted change.
            best_is = _column_changes(fc).argsort()[::-1]

            # Hoisted out of the per-stock loop: the original recomputed
            # changes(X_test) and its mean once per stock (O(n^2) work).
            test_changes = _column_changes(X_test)
            mean_change = np.mean(test_changes)
            bests[i, j, :] = test_changes[best_is] - mean_change
    return bests
# NOTE(review): this whole block appears to be a truncated duplicate of the
# get_data_dependece module immediately above it — same imports, same
# module-level data load, same function signature — but the function body
# cuts off right after the regressor is constructed (no fit/forecast/return).
# Presumably a bad paste or merge artifact; confirm and remove the duplicate.
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from TimeSeriesEstimator import TimeSeriesRegressor, time_series_split, cascade_cv
from utils import load_s_and_p_data, cache

__author__ = 'Mark'

# Module-level data load: closing prices for S&P constituents since 2009.
X = load_s_and_p_data(start="2009-1-1", only_close=True)
names = list(X.columns.values)  # ticker symbols (column labels)


@cache('data/data_dependence_cache.pkl')
def get_data_dependece(X, data_sizes, folds=20, test_size=30, n_prev=2, log=True):
    # Incomplete from this view: the body ends after constructing the
    # regressor, never fitting it or filling/returning `bests`.
    bests = np.empty((len(data_sizes), folds, X.shape[1]))
    for i, data_size in enumerate(data_sizes):
        pairs = cascade_cv(len(X), folds, data_size=data_size, test_size=test_size, number=True)
        for j, pair in enumerate(pairs):
            if log:
                print('data size: {} trial {} '.format(data_size, j))
            X_train, X_test = np.array(X.iloc[pair[0], :]), np.array(
                X.iloc[pair[1], :])
            tsr = TimeSeriesRegressor(LinearRegression(), n_prev=n_prev)
def main():
    """Command-line driver for the RNN stock-forecasting experiment.

    Loads or generates a dataset (sp500 / synthetic / jigsaw), trains or loads
    a Keras model, runs prediction + multi-step forecast on the test split,
    and optionally saves and plots the results. All behavior is controlled by
    command-line flags; paths are rooted at ``--base_path``.
    """
    p = optparse.OptionParser()
    p.add_option('--load_data', action="store_true", default=False)
    p.add_option('--save_data', action="store_true", default=False)
    p.add_option('--load_model', action="store_true", default=False)
    p.add_option('--no_run_model', action="store_false", dest="run_model", default=True)
    p.add_option('--no_save_model', action="store_false", dest="save_model", default=True)
    p.add_option('--load_results', action="store_true", default=False)
    p.add_option('--no_save_results', action="store_false", dest="save_results", default=True)
    p.add_option('--no_plot_results', action="store_false", dest="plot_results", default=True)
    p.add_option('--model_name', default='shallow_RNN', type="string",
                 help='Options: shallow_RNN,shallow_LSTM,shallow_GRU,'
                      'deep_RNN, deep_LSTM, deep_GRU, seq2seq')
    p.add_option('--base_path', default="~/machine_learning/stock_sandbox/")
    p.add_option('--dataset', default='jigsaw', type="string",
                 help='Options: jigsaw, synthetic, sp500')
    p.add_option('--n_samples', default=100, type="int")
    p.add_option('--n_ahead', default=50, type="int")
    p.add_option('--patience', default=5, type="int")
    p.add_option('--batch_size', default=20, type="int")
    p.add_option('--max_epochs', default=1000, type="int")
    ops, args = p.parse_args()

    if (not ops.load_results and not ops.run_model) and ops.save_results:
        raise ValueError("Cannot save what has not been loaded or run ")

    # BUG FIX: makedirs previously received the un-expanded "~/..." path while
    # the existence check used the expanded one, so a literal "~" directory was
    # created. Expand once and use the same path for both.
    results_dir = os.path.expanduser(ops.base_path + 'results')
    data_dir = os.path.expanduser(ops.base_path + 'data')
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    base_name = ops.dataset + '_' + ops.model_name
    data_fname = os.path.expanduser(ops.base_path + 'data/' + ops.dataset + "_data.pkl")
    arch_fname = os.path.expanduser(ops.base_path + 'results/' + base_name + '_model_architecture.json')
    weights_fname = os.path.expanduser(ops.base_path + 'results/' + base_name + '_model_weights.h5')
    plot_fname = os.path.expanduser(ops.base_path + 'results/' + base_name + '_results.png')
    results_fname = os.path.expanduser(ops.base_path + 'results/' + ops.model_name + '_results.pkl')

    #########################BEGIN CODE#######################################
    # tickers = ['AAPL','VZ','NKE','KMI','M','MS','WMT','DOW','MPC']
    tickers = None
    if not ops.load_results:
        if ops.load_data:
            print('Loading data...')
            # BUG FIX: pickles must be read in binary mode ('rb', not 'r') on
            # Python 3; also close the handle deterministically.
            with open(data_fname, 'rb') as f:
                data = pickle.load(f)
            if tickers:
                # BUG FIX: the original `data.loc(tickers)` called the indexer
                # and discarded the result (a no-op); subset the columns.
                data = data.loc[:, tickers]
        else:
            if ops.dataset == "sp500":
                # Real stock data.
                print('Using sp500 data')
                data = load_s_and_p_data(start="2014-1-1", tickers=tickers)
            elif ops.dataset == "synthetic":
                # Synthetic data for testing purposes.
                print('Using Synthetic data')
                values = 10000
                s = pd.Series(range(values))
                noise = pd.Series(np.random.randn(values))
                s = s / 1000  # + noise / 100
                d = {'one': s * s * 100 / values,
                     'two': np.sin(s * 10.),
                     'three': np.cos(s * 10),
                     'four': np.sin(s * s / 10) * np.sqrt(s)}
                data = pd.DataFrame(d)
            elif ops.dataset == "jigsaw":
                # Easy synthetic data for testing purposes.
                print('Using jigsaw data')
                flow = (list(range(1, 10, 1)) + list(range(10, 1, -1))) * 1000
                pdata = pd.DataFrame({"a": flow, "b": flow})
                pdata.b = pdata.b.shift(9)
                data = pdata.iloc[10:] * random.random()  # some noise
            else:
                raise ValueError('Not a legal dataset name')

        if ops.save_data:
            print('Saving data...')
            with open(data_fname, 'wb+') as f:
                pickle.dump(data, f)

        if ops.model_name == 'seq2seq':
            (X_train, y_train), (X_test, y_test) = test_train_split(data,
                                                                    splitting_method='seq2seq',
                                                                    n_samples=ops.n_samples,
                                                                    n_ahead=ops.n_ahead)
            print(X_train.shape, y_train.shape)
        else:
            (X_train, y_train), (X_test, y_test) = test_train_split(data,
                                                                    n_samples=ops.n_samples,
                                                                    n_ahead=ops.n_ahead)

        if not ops.load_model:
            print('compiling model')
            if ops.model_name == "shallow_RNN":
                model = make_RNN(X_train.shape, [300], SimpleRNN, dropout=0)
            elif ops.model_name == "shallow_LSTM":
                model = make_RNN(X_train.shape, [300], LSTM, dropout=0)
            elif ops.model_name == "shallow_GRU":
                model = make_RNN(X_train.shape, [300], GRU, dropout=0)
            elif ops.model_name == "deep_RNN":
                model = make_RNN(X_train.shape, [300, 500, 200], SimpleRNN, dropout=.2)
            elif ops.model_name == "deep_LSTM":
                model = make_RNN(X_train.shape, [300, 500, 200], LSTM, dropout=.2)
            elif ops.model_name == "deep_GRU":
                model = make_RNN(X_train.shape, [300, 500, 200], GRU, dropout=.2)
            elif ops.model_name == "seq2seq":
                maxlen = 100  # length of input sequence and output sequence
                hidden_dim = 500  # memory size of seq2seq
                seq2seq = Seq2seq(input_length=X_train.shape[1],
                                  input_dim=X_train.shape[2],
                                  hidden_dim=hidden_dim,
                                  output_dim=X_train.shape[2],
                                  output_length=y_train.shape[1],
                                  batch_size=ops.batch_size,
                                  depth=4)
                model = Sequential()
                model.add(seq2seq)
                # NOTE: no compile here — every branch is compiled once below
                # (the original compiled the seq2seq model twice).
            else:
                raise ValueError('Not a legal model name')
            model.compile(loss="mean_squared_error", optimizer="rmsprop")

            print('Training model...')
            early_stopping = EarlyStopping(monitor='val_loss', patience=ops.patience, verbose=0)
            model.fit(X_train, y_train,
                      batch_size=ops.batch_size,
                      nb_epoch=ops.max_epochs,
                      validation_split=0.1,
                      callbacks=[early_stopping])
        else:
            print('Loading model...')
            with open(arch_fname) as f:
                model = model_from_json(f.read())
            model.load_weights(weights_fname)

        if ops.save_model:
            print("Saving model...")
            json_string = model.to_json()
            # BUG FIX: the original leaked the file handle opened for writing.
            with open(arch_fname, 'w+') as f:
                f.write(json_string)
            model.save_weights(weights_fname, overwrite=True)

        if ops.run_model:
            print('Running forecast...')
            forecasted = forecast(model, X_train[-1, :, :], n_ahead=len(y_test[0]))
            predicted = model.predict(X_test)
            rmse = np.sqrt(((predicted - y_test) ** 2).mean(axis=0)).mean()
            print("RMSE:", rmse)

        if ops.save_results:
            print('Saving results...')
            with open(results_fname, 'wb+') as f:
                pickle.dump((predicted, forecasted, y_test), f)
    else:
        print('Loading results...')
        # BUG FIX: binary mode for unpickling, handle closed deterministically.
        with open(results_fname, 'rb') as f:
            predicted, forecasted, y_test = pickle.load(f)

    if ops.plot_results:
        print('Plotting results...')
        print(predicted.shape, y_test.shape, forecasted.shape)
        fig = plt.figure()
        # Plot at most the first 4 series: forecast (red), one-step
        # predictions (green), ground truth (blue).
        for i in range(min(4, predicted.shape[2])):
            ax = fig.add_subplot(2, 2, i + 1)
            ax.plot(forecasted[:, i], color='r')
            ax.plot(predicted[0, :, i], color='g')
            ax.plot(y_test[0, :, i], color='b')
            if tickers:
                ax.set_title(tickers[i])
        fig.savefig(plot_fname)