Example 1
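The five snippets below come from one module and share imports and globals that the excerpts do not show. A minimal sketch of that assumed context follows; the helper signatures and the placeholder values for `power` and `random_state` are inferred from how the examples call them, not taken from the original source.

# Assumed module-level context for all five examples (inferred, not shown
# in the original excerpts).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error as mse

power = 1.0        # target-transform exponent used by to_matrix (tuned in Example 4)
random_state = 0   # shared seed for reproducible splits

# Project-specific helpers referenced but not defined in these snippets:
#   retrieve_feature_data(n, year, rand=...) -> raw datapoints from a server
#   to_matrix(data, power)                   -> (X, Y) design matrix and targets
#   position(pred, threshold=...)            -> per-stock position sizes
#   benchmark(Y), returns(pos, Y)            -> benchmark and realized returns
#   sharpe(pos, Y)                           -> Sharpe ratio of the positions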
def debug():
    # Fit OLS on half of the retrieved data, then print coefficients,
    # test-set error, and trading diagnostics on the other half.
    year = None
    total = 2000
    train = 1000

    np.set_printoptions(precision=2, linewidth=150, suppress=True)
    data = retrieve_feature_data(total, year)
    assert total == len(data), 'only got {} out of {} datapoints from server'.format(len(data), total)

    print('Random state = {random_state}'.format(**globals()))
    X, Y = to_matrix(data, power)
    trainX, testX, trainY, testY = train_test_split(X, Y, train_size=train/total, random_state=random_state)
    # print('train data = \n{!s}'.format(np.hstack((trainX, trainY))))
    reg = LinearRegression()
    reg.fit(trainX, trainY)

    # predicted price increase factors
    predY = reg.predict(testX)
    # print('test data = \n{!s}'.format(np.hstack((testX, testY, predY))))
    print('Coefficients (x1e3) = {!s}'.format(reg.coef_ * 1e3))

    print('Mean Squared Error = {:.4f}'.format(mse(predY**(1/power), testY**(1/power))))
    pos = position(predY, threshold=1.1)
    print('Taking {} stocks'.format(np.sum(pos > 0)))
    print('Prediction, actual, benchmark, position, return = \n{!s}'.format(np.hstack((predY, testY, benchmark(testY), pos, returns(pos, testY)))))
    print('Return = {:.5f}%, Adjusted return = {:.5f}%, with a Sharpe ratio of {:.2f}'
          .format((np.sum(returns(pos, testY)) - 1) * 100,
                  np.sum(returns(pos, testY) - benchmark(testY)) * 100,
                  sharpe(pos, testY)))
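The `**(1/power)` exponent in the MSE line undoes the power transform that `to_matrix` apparently applies to the targets, so errors are measured on the original price scale. A minimal sketch of that round trip, assuming the transform is a plain elementwise power:

# Assuming to_matrix stores the target as y_raw ** power (an inference from
# the code above), raising predictions to 1/power recovers the price scale
# before the MSE is computed.
import numpy as np

y_raw = np.array([1.05, 0.98, 1.20])
p = 3.0                                # stand-in for the module-level `power`
y_transformed = y_raw ** p             # what the regression is trained on
assert np.allclose(y_transformed ** (1 / p), y_raw)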
Example 2
def convergence(minN, maxN, spaceN, testN, mult):
    # Plot how the fitted coefficients, held-out R^2, and test MSE evolve
    # as the training set grows from minN to maxN points.
    ns = np.arange(minN, maxN, spaceN)
    plt.figure()
    plt.xlabel('Number of data points')
    plt.title('Convergence of Ordinary Least Squares Regression')

    data = retrieve_feature_data(maxN + testN, None)
    X, Y = to_matrix(data, power)
    testX, testY = X[-testN:], Y[-testN:]
    params = np.zeros_like(ns, dtype=np.float64)
    r2 = np.zeros_like(ns, dtype=np.float64)
    m = np.zeros_like(ns, dtype=np.float64)
    reg = LinearRegression()
    final = reg.fit(X[:maxN], Y[:maxN]).coef_
    for i, n in enumerate(ns):
        trainX, trainY = X[:n], Y[:n]
        reg.fit(trainX, trainY)
        params[i] = np.sum(((reg.coef_ - final) * mult)**2)  # squared distance to the full-data coefficients
        r2[i] = reg.score(testX, testY)  # R^2 on the held-out tail
        m[i] = mse(np.clip(reg.predict(testX), 0, 10)**(1/power), testY**(1/power))
    plt.ylim(0, 1)
    # Each curve is rescaled by an ad-hoc factor so all three fit on one axis;
    # the axis is therefore unitless and its tick labels are hidden below.
    plt.plot(ns, params / params.max() * 2.2, 'g', label=r'Parameter distance')
    plt.plot(ns, -np.arctan(r2) / np.pi * 2, 'r', label=r'$R^2$ score')
    plt.plot(ns, m / m.max() * 0.1, 'y', label=r'Mean-squared error')
    plt.gca().set_yticklabels([])
    plt.legend()
    plt.savefig('convergence.png')
    plt.close()
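The `params` curve above is the squared, rescaled Euclidean distance between the coefficients fitted on the first n points and those of the full maxN fit. A tiny illustration with hypothetical coefficient vectors:

import numpy as np

coef_n = np.array([0.9, 1.8])   # hypothetical fit on the first n points
final = np.array([1.0, 2.0])    # hypothetical full-data coefficients
mult = 1e3                      # rescaling factor, as passed to convergence()
print(np.sum(((coef_n - final) * mult) ** 2))  # the quantity stored in params[i]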
Example 3
def cross_val(N=4000, M=1000, k=60):
    # Score the regression over k shuffle-split rounds (random M-of-N
    # training subsets) and histogram the per-round metrics.
    X, Y = to_matrix(retrieve_feature_data(N, None), power)
    reg = LinearRegression()

    def my_cv(scoring):
        # http://scikit-learn.org/stable/modules/cross_validation.html
        return cross_val_score(reg, X, Y, cv=ShuffleSplit(n_splits=k, train_size=M/N, random_state=random_state), scoring=scoring)

    MSE = my_cv(lambda f, X, Y: mse(np.clip(f.predict(X), 0, 10)**(1/power), Y**(1/power)))
    print(np.mean(MSE), np.median(MSE))
    r = my_cv(lambda f, X, Y: 100 * (np.sum(returns(position(f.predict(X)), Y) - benchmark(Y))))
    print(np.mean(r), np.median(r))
    s = my_cv(lambda f, X, Y: sharpe(position(f.predict(X), threshold=1.1), Y))
    print(np.mean(s), np.median(s))
    n = my_cv(lambda f, X, Y: np.sum(f.predict(X) > 0))
    print(np.mean(n), np.median(n))

    plt.figure()
    plt.title('Mean squared error in {k}-round shuffle-split cross-validation'.format(**locals()))
    plt.ylabel('Frequency')
    plt.xlabel('MSE')
    plt.hist(MSE, bins=7, density=True)  # matplotlib removed 'normed'; 'density' is the replacement
    plt.savefig('mse.png')
    plt.close()

    plt.figure()
    plt.title('Adjusted return in {k}-round shuffle-split cross-validation'.format(**locals()))
    plt.ylabel('Frequency')
    plt.xlabel('Adjusted Return (%)')
    plt.hist(r, bins=10, density=True)
    plt.savefig('adj_return.png')
    plt.close()

    plt.figure()
    plt.title('Sharpe ratio in {k}-round shuffle-split cross-validation'.format(**locals()))
    plt.ylabel('Frequency')
    plt.xlabel('Sharpe')
    plt.hist(s, bins=10, density=True)
    plt.savefig('sharpe.png')
    plt.close()
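The lambdas passed to `my_cv` work because `cross_val_score` accepts any callable scorer with the signature `(estimator, X, y)` returning a float. A self-contained sketch on synthetic data (all names and numbers here are illustrative):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=200)

def neg_mse(estimator, X, y):
    # Custom scorer: scikit-learn calls it as scorer(estimator, X, y).
    return -mean_squared_error(y, estimator.predict(X))

cv = ShuffleSplit(n_splits=10, train_size=0.25, random_state=0)
scores = cross_val_score(LinearRegression(), X, y, cv=cv, scoring=neg_mse)
print(scores.mean(), np.median(scores))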
Example 4
def parameter_tuning(end, step, N=4000, M=1000, k=30):
    # Sweep the target-transform exponent over arange(1, end, step) and
    # compare cross-validated MSE and Sharpe ratio at each value.
    reg = LinearRegression()
    powers = np.arange(1, end, step)
    MSE = np.zeros(len(powers), dtype=np.float64)
    sharpe2 = np.zeros(len(powers), dtype=np.float64)
    data = retrieve_feature_data(N, None)
    for i, power in enumerate(powers):
        X, Y = to_matrix(data, power=power)
        def my_cv(scoring):
            # http://scikit-learn.org/stable/modules/cross_validation.html
            return cross_val_score(reg, X, Y, cv=ShuffleSplit(n_splits=k, train_size=M/N, random_state=random_state), scoring=scoring)
        MSE[i] = np.mean(my_cv(lambda f, X, Y: mse(np.clip(f.predict(X), 0, 10)**(1/power), Y**(1/power))))
        sharpe2[i] = np.mean(my_cv(lambda f, X, Y: sharpe(position(f.predict(X), threshold=1.1), Y)))
    print(MSE)
    print(sharpe2)
    plt.figure()
    plt.xlabel('$p$')
    plt.ylim(0, 1)
    plt.plot(powers, MSE / MSE.max(), 'b', label=r'Mean Squared Error')
    plt.plot(powers, sharpe2 / sharpe2.max(), 'y', label='Sharpe Ratio')
    plt.legend()
    plt.savefig('parameter.png')
    plt.close()
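Reading the sweep off the plot: MSE is better when low and the Sharpe ratio when high, so the grid value to keep differs per metric. A hypothetical follow-up with made-up sweep results:

import numpy as np

powers = np.arange(1, 4, 0.5)                       # example grid, as in parameter_tuning(4, 0.5)
MSE = np.array([0.9, 0.7, 0.6, 0.65, 0.8, 0.95])    # made-up sweep results
sharpe2 = np.array([0.5, 0.8, 1.1, 1.0, 0.7, 0.4])  # made-up sweep results
print('power minimizing MSE:', powers[np.argmin(MSE)])
print('power maximizing Sharpe:', powers[np.argmax(sharpe2)])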
Example 5
def fit(total):
    # Fit on `total` datapoints retrieved with rand=False and return the model.
    data = retrieve_feature_data(total, rand=False)
    X, Y = to_matrix(data, power)
    reg = LinearRegression()
    reg.fit(X, Y)
    return reg
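A hedged usage sketch: the counts and the second retrieval are illustrative, assuming `to_matrix` produces the same feature layout at prediction time as at training time:

# Illustrative only: fit on the full history, then score fresh datapoints.
model = fit(4000)
newX, _ = to_matrix(retrieve_feature_data(100, None), power)
print(model.predict(newX))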