Example #1
def linear_regression_sklearn(df, xcols):
    y = df['target_proxy']
    X = df[list(xcols)[0]]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    X = np.transpose(np.array([X]))
    slr = LinearRegression()
    slr.fit(X, y.values)
    y_pred = slr.predict(X)
    print('Slope: %.3f' % slr.coef_[0])
    print('Intercept: %.3f' % slr.intercept_)

    lin_regplot(X, y.values, slr)
    plt.xlabel('x val')
    plt.ylabel('Return')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'scikit_lr_fit.png', dpi=300)
    plt.close()

    # Closed-form solution
    Xb = np.hstack((np.ones((X.shape[0], 1)), X))
    w = np.zeros(X.shape[1])
    z = np.linalg.inv(np.dot(Xb.T, Xb))
    w = np.dot(z, np.dot(Xb.T, y))
    print('Slope: %.3f' % w[1])
    print('Intercept: %.3f' % w[0])
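
The closed-form block above just solves the normal equations: with a bias column prepended to form Xb, the least-squares weights are w = (Xb^T Xb)^(-1) Xb^T y, so w[0] is the intercept and w[1] the slope, matching the scikit-learn fit.

Every example in this file also calls a standardize helper that is not shown in the listing. A minimal sketch of what it presumably does (plain per-column z-scoring; this is an assumption, not the project's actual implementation):

import numpy as np

def standardize(X):
    # Assumed helper: center each column and scale it to unit variance.
    X = np.asarray(X, dtype=float)
    return (X - X.mean(axis=0)) / X.std(axis=0)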
Example #2
def kfold_cross_validation(df, xcols, folds=10):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('pca', PCA(n_components=2)),
                        ('clf', LogisticRegression(random_state=1))])

    kfold = StratifiedKFold(y=y_train, n_folds=folds, random_state=1)

    scores = []
    for k, (train, test) in enumerate(kfold):
        pipe_lr.fit(X_train[train], y_train.values[train])
        score = pipe_lr.score(X_train[test], y_train.values[test])
        scores.append(score)
        print('Fold: %s, Class dist.: %s, Acc: %.3f' %
              (k + 1, np.bincount(y_train.values[train]), score))
    print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

    scores = cross_val_score(estimator=pipe_lr,
                             X=X_train,
                             y=y_train.values,
                             cv=10,
                             n_jobs=1)
    print('CV accuracy scores: %s' % scores)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
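
The StratifiedKFold(y=..., n_folds=...) call above uses the pre-0.18 scikit-learn API. On current versions the splitter takes n_splits and you iterate over split(X, y) instead; a hedged sketch of the equivalent loop, reusing the same pipe_lr, X_train and y_train:

from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)
scores = []
for k, (train, test) in enumerate(kfold.split(X_train, y_train)):
    # Fit on the fold's training indices, score on its held-out indices.
    pipe_lr.fit(X_train[train], y_train.values[train])
    scores.append(pipe_lr.score(X_train[test], y_train.values[test]))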
Example #3
def heat_map(df, xcols):
    y = df['target']
    X = df[list(xcols)]
    cols = ['target_proxy'] + list(xcols)

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    sns.set(style='whitegrid', context='notebook')
    sns.pairplot(df[cols], size=2.5)
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'corr_mat.png', dpi=300)
    plt.close()

    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=1.5)
    hm = sns.heatmap(cm,
                     cbar=True,
                     annot=True,
                     square=True,
                     fmt='.2f',
                     annot_kws={'size': 15},
                     yticklabels=cols,
                     xticklabels=cols)
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'heat_map.png', dpi=300)
    plt.close()
def lda_scikit(df, xcols):
    y = df['target']
    X = df[list(xcols)]
    
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)
    
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    
    plot_decision_regions(X_train_lda, y_train.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit.png', dpi=300)
    plt.close()
    
    X_test_lda = lda.transform(X_test)
    
    plot_decision_regions(X_test_lda, y_test.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit_test.png', dpi=300)
def random_forest_feature_importance(df, xcols):
    y_s = df['target']
    x_s = df[list(xcols)]

    # Standardize and split the training and test data
    x_std = standardize(x_s)
    t_s = 0.3
    x_train, x_test, y_train, y_test = train_test_split(x_std, y_s, test_size=t_s, random_state=0)
    
    feat_labels = df[list(xcols)].columns
    forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    forest.fit(x_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    for f in range(x_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    
    plt.title('Feature Importances')
    plt.bar(range(x_train.shape[1]), importances[indices], color='lightblue', align='center')

    plt.xticks(range(x_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, x_train.shape[1]])
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'random_forest_{}.png'
                "".format(dt.datetime.now().strftime("%Y%m%d")), dpi=300)
    plt.close()

    x_selected = forest.transform(x_train, threshold=0.05)
    print(x_selected.shape)

    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(forest.predict_proba(x_test[0:1])))
    print('Training accuracy:', forest.score(x_train, y_train))
    print('Test accuracy:', forest.score(x_test, y_test))
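
The forest.transform(x_train, threshold=0.05) call above relies on an old scikit-learn API that has since been removed. A hedged sketch of the modern equivalent, wrapping the already-fitted forest in SelectFromModel with the same importance threshold:

from sklearn.feature_selection import SelectFromModel

# Keep only the features whose importance is at least 0.05.
sfm = SelectFromModel(forest, threshold=0.05, prefit=True)
x_selected = sfm.transform(x_train)
print(x_selected.shape)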
def support_vector_machines(df, xcols, C=100):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=t_s,
                                                        random_state=0)

    svm = SVC(kernel='linear', C=C, random_state=0)
    svm.fit(X_train, y_train)

    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))

    # plot_decision_regions(X.values, y.values, classifier=svm, test_break_idx=int(len(y)*(1-ts)))
    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_C' + str(C) + '.png', dpi=300)
    plt.close()
    return svm
def adalineGD(df, xcols, eta=0.1, n_iter=10):
    t0 = time.time()
    # Replace 0 with -1 to match the -1/1 labels expected by the perceptron binary classifier
    y = df['target'].replace(0, -1)
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    ada = AdalineGD(n_iter=15, eta=0.001)
    ada.fit(X_std, y)

    plot_decision_regions(X_std, y.values, classifier=ada)
    plt.title('Adaline - Gradient Descent')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_2.png', dpi=300)
    plt.close()

    plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Sum-squared-error')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_3.png', dpi=300)
    plt.close()
def decision_tree(df, xcols, md=3):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    tree = DecisionTreeClassifier(criterion='entropy',
                                  max_depth=md,
                                  random_state=0)
    tree.fit(X_train, y_train)

    print('Training accuracy:', tree.score(X_train, y_train))
    print('Test accuracy:', tree.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=tree)
    plt.title('Decision Tree')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'dec_tree' + '.png', dpi=300)
    plt.close()

    export_graphviz(tree, out_file='tree.dot', feature_names=list(xcols))
def k_nearest_neighbors(df, xcols, k=5):
    """
    Run k-nearest neighbors algo
    """
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    knn = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
    knn.fit(X_train, y_train)

    print('Training accuracy:', knn.score(X_train, y_train))
    print('Test accuracy:', knn.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=knn)
    plt.title('K-Nearest Neighbors')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/kkn.png', dpi=300)
    plt.close()

    return knn
Example #10
def linear_regressor(df, xcols):
    y = df['target_proxy']
    X = df[list(xcols)[0]]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    lr = LinearRegressionGD()
    lr.fit(np.transpose(np.array([X_train])), y_train)
    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'cost.png', dpi=300)
    plt.close()

    lin_regplot(np.transpose(np.array([X_train])), y_train, lr)
    plt.savefig(IMG_PATH + 'lin_reg_cost.png', dpi=300)
    plt.close()

    # Find the average return of a stock with PE = 20
    # Note: will give odd results if x values are standardized and input is not
    y_val_std = lr.predict([20.0])
    print("Estimated Return: %.3f" % y_val_std)
    print('Slope: %.3f' % lr.w_[1])
    print('Intercept: %.3f' % lr.w_[0])
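
LinearRegressionGD is not defined in this listing; judging by its w_, cost_ and n_iter attributes it is a batch gradient-descent regressor on the sum-of-squared-errors cost. A minimal sketch of such a class, offered as an assumption rather than the project's actual implementation:

import numpy as np

class LinearRegressionGD(object):
    """Assumed implementation: batch gradient descent on the SSE cost."""

    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta          # learning rate
        self.n_iter = n_iter    # number of passes over the training set

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            errors = y - self.predict(X)
            # Gradient step on weights and bias, then record the SSE cost.
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            self.cost_.append((errors ** 2).sum() / 2.0)
        return self

    def predict(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]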
Example #11
def pml_knn_test():
    """
    Test our knn vs sklearn
    """
    # Get Data
    iris = datasets.load_iris()
    x_vals = iris.data[:, [2, 3]]
    y_vals = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x_vals,
                                                        y_vals,
                                                        test_size=0.3,
                                                        random_state=0)
    x_train_std = standardize(x_train)
    x_test_std = standardize(x_test)
    # x_combined = np.vstack((x_train, x_test))
    x_combined_std = np.vstack((x_train_std, x_test_std))
    y_combined = np.hstack((y_train, y_test))
    iris_data = np.concatenate((x_train_std, np.array([y_train]).T), axis=1)

    # Sklearn KNN
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    knn.fit(x_train_std, y_train)
    # x_combined = np.vstack((x_train, x_test))
    y_combined = np.hstack((y_train, y_test))
    plot_decision_regions(x_combined_std,
                          y_combined,
                          classifier=knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_sklearn.png', dpi=300)
    plt.close()

    # Custom KNN
    cust_knn = KNN(iris_data, k_nbrs=5, dont_div=True)
    plot_decision_regions(x_combined_std,
                          y_combined,
                          classifier=cust_knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_cust.png', dpi=300)
    plt.close()
Example #12
def validation_curves(df, xcols):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('clf', LogisticRegression(penalty='l2',
                                                   random_state=0))])

    param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    train_scores, test_scores = validation_curve(estimator=pipe_lr,
                                                 X=X_train,
                                                 y=y_train,
                                                 param_name='clf__C',
                                                 param_range=param_range,
                                                 cv=10)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(param_range,
             train_mean,
             color='blue',
             marker='o',
             markersize=5,
             label='training accuracy')
    plt.fill_between(param_range,
                     train_mean + train_std,
                     train_mean - train_std,
                     alpha=0.15,
                     color='blue')
    plt.plot(param_range,
             test_mean,
             color='green',
             linestyle='--',
             marker='s',
             markersize=5,
             label='validation accuracy')
    plt.fill_between(param_range,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15,
                     color='green')
    plt.grid()
    plt.xscale('log')
    plt.legend(loc='best')
    plt.xlabel('Parameter C')
    plt.ylabel('Accuracy')
    plt.ylim([0.8, 1.0])
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'val_curve.png', dpi=300)
    plt.close()
Example #13
def learning_curves(df, xcols):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    pipe_lr = Pipeline([('scl', StandardScaler()),
                        ('clf', LogisticRegression(penalty='l2',
                                                   random_state=0))])

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=pipe_lr,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        n_jobs=1)

    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(train_sizes,
             train_mean,
             color='blue',
             marker='o',
             markersize=5,
             label='training accuracy')
    plt.fill_between(train_sizes,
                     train_mean + train_std,
                     train_mean - train_std,
                     alpha=0.15,
                     color='blue')
    plt.plot(train_sizes,
             test_mean,
             color='green',
             linestyle='--',
             marker='s',
             markersize=5,
             label='validation accuracy')
    plt.fill_between(train_sizes,
                     test_mean + test_std,
                     test_mean - test_std,
                     alpha=0.15,
                     color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.8, 1.0])
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'learning_curve.png', dpi=300)
    plt.close()
Example #14
def precision_vs_recall(df, xcols):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3

    # Only needed for specific cases: positive results must be labeled 1
    y = y.map({4: 1, 0: 0})

    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    pipe_svc.fit(X_train, y_train)
    y_pred = pipe_svc.predict(X_test)
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    print(confmat)

    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'confusion_matrix.png', dpi=300)
    plt.close()

    print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
    print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
    print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))

    scorer = make_scorer(f1_score, pos_label=1)
    c_gamma_range = [0.01, 0.1, 1.0, 10.0]
    param_grid = [{
        'clf__C': c_gamma_range,
        'clf__kernel': ['linear']
    }, {
        'clf__C': c_gamma_range,
        'clf__gamma': c_gamma_range,
        'clf__kernel': ['rbf'],
    }]
    gs = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring=scorer,
                      cv=10,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    print(gs.best_score_)
    print(gs.best_params_)
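
For reference, the printed metrics follow directly from the 2x2 confusion matrix (rows are true labels, columns are predicted labels):

# Equivalent metrics computed by hand from confmat.
tn, fp, fn, tp = confmat.ravel()
precision = tp / (tp + fp)   # of the predicted positives, how many were right
recall = tp / (tp + fn)      # of the true positives, how many were found
f1 = 2 * precision * recall / (precision + recall)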
Example #15
def run_perceptron(train_df, xcols, eta=0.1, n_iter=10):
    ''' Takes the pruned dataframe and runs it through the perceptron class

        Parameters
        ==========
        train_df : dataframe
            dataframe with the inputs and target
        xcols : iterable
            column names to use as model inputs
        eta : float
            learning rate between 0 and 1
        n_iter : int
            passes over the training dataset

        Return
        ======
        None
    '''
    time0 = time.time()
    # Replace 0 with -1 to match the -1/1 labels expected by the perceptron binary classifier
    y_df = train_df['target'].replace(0, -1)
    x_df = train_df[list(xcols)]

    # Standardize and split the training and test data
    x_std = standardize(x_df)
    t_s = 0.3
    x_train, x_test, y_train, y_test = train_test_split(x_std,
                                                        y_df,
                                                        test_size=t_s,
                                                        random_state=0)

    plt.figure(figsize=(7, 4))
    plt.legend()
    ppn = Perceptron(eta, n_iter)
    ppn.fit(x_train, y_train.values)

    print('Training accuracy:', ppn.score(x_train, y_train))
    print('Test accuracy:', ppn.score(x_test, y_test))

    plot_decision_regions(x_train, y_train.values, classifier=ppn)
    # plot_decision_regions(x_df.values, y_df.values, classifier=ppn)
    plt.xlabel(x_df.columns[0])
    plt.ylabel(x_df.columns[1])
    plt.savefig(IMG_ROOT + "perceptron_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()

    plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
    plt.xlabel('Iterations')
    plt.ylabel('Number of misclassifications')
    plt.savefig(IMG_ROOT + "perceptron_misses_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()
    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
Example #16
def random_forest_regression(df, xcols):
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    tree = DecisionTreeRegressor(max_depth=3)
    tree.fit(X, y)
    sort_idx = X.flatten().argsort()
    lin_regplot(X[sort_idx], y[sort_idx], tree)
    plt.xlabel('x-val')
    plt.ylabel('Return')
    plt.savefig(IMG_PATH + 'tree_regression.png', dpi=300)
    plt.close()

    forest = RandomForestRegressor(n_estimators=1000,
                                   criterion='mse',
                                   random_state=1,
                                   n_jobs=-1)
    forest.fit(X_train, y_train)
    y_train_pred = forest.predict(X_train)
    y_test_pred = forest.predict(X_test)
    print('MSE train: %.3f, test: %.3f' % (mean_squared_error(
        y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
    print('R^2 train: %.3f, test: %.3f' %
          (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    plt.scatter(y_train_pred,
                y_train_pred - y_train,
                c='black',
                marker='o',
                s=35,
                alpha=0.5,
                label='Training data')
    plt.scatter(y_test_pred,
                y_test_pred - y_test,
                c='lightgreen',
                marker='s',
                s=35,
                alpha=0.7,
                label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='best')
    plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
    plt.xlim([-10, 50])
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'slr_residuals.png', dpi=300)
Example #17
def grid_search_analysis(df, xcols):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

    param_grid = [{
        'clf__C': param_range,
        'clf__kernel': ['linear']
    }, {
        'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['rbf']
    }]

    gs = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    print(gs.best_score_)
    print(gs.best_params_)
    clf = gs.best_estimator_
    clf.fit(X_train, y_train)
    print('Test accuracy: %.3f' % clf.score(X_test, y_test))

    gs = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=2)
    scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

    gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                      param_grid=[{
                          'max_depth': [1, 2, 3, 4, 5, 6, 7, None]
                      }],
                      scoring='accuracy',
                      cv=2)
    scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
    print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
def eval_on_curr_companies(model, df, inputs):
    df_ind = df[['ticker', 'date', 'month']]
    df_trimmed = pd.DataFrame(standardize(df[inputs]), columns=inputs)
    df_combine = pd.concat([df_ind.reset_index(drop=True), df_trimmed], axis=1)
    predictions = {}
    for ix, row in df_combine.iterrows():
        print(row['ticker'] + "   " + row['date'] + "   " + str(row['month']), end="")
        pred = model.predict(row[inputs])[0]
        try:
            predictions[pred].append(row['ticker'])
        except KeyError:
            predictions[pred] = [row['ticker']]
        print("    Class Prediction: " + str(pred))
    return predictions
def run_perceptron_multi(df, xcols, eta=0.1, n_iter=15):
    time0 = time.time()
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    # pdb.set_trace()
    # strong_buy = df[df['target'] == 3][list(X.columns)].values
    # buy = df[df['target'] == 2][list(X.columns)].values
    # sell = df[df['target'] == 1][list(X.columns)].values
    # strong_sell = df[df['target'] == 0][list(X.columns)].values

    # plt.figure(figsize=(7,4))
    # plt.scatter(buy[:, 0], buy[:, 1], color='blue', marker='x', label='Buy')
    # plt.scatter(sell[:, 0], sell[:, 1], color='red', marker='s', label='Sell')
    # plt.scatter(strong_buy[:, 0], strong_buy[:, 1], color='blue', marker='*',
    #             label='Strong Buy')
    # plt.scatter(strong_sell[:, 0], strong_sell[:, 1], color='red', marker='^',
    #             label='Strong Sell')
    # plt.xlabel(list(X.columns)[0])
    # plt.ylabel(list(X.columns)[1])
    # plt.legend()

    ppn = perceptron_skl(n_iter=40, eta0=0.1, random_state=0)
    ppn.fit(X_train, y_train)
    y_pred = ppn.predict(X_test)

    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    plot_decision_regions(X_train, y_train.values, classifier=ppn)
    plt.savefig(IMG_ROOT + "dow/perceptron_multi.png")
    plt.close()

    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
def logisticRegression(df, xcols, C=100, penalty='l2'):
    # Note: xcols must be a tuple for the timeme method to work (hacky)
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    # Normalization of the data --> max = 1, min=0, etc
    # mms = MinMaxScaler()
    # X_train_norm = mms.fit_transform(X_train)
    # X_test_norm = mms.transform(X_test)

    # C: regularization parameter (C = 1/lambda)
    # smaller C = more regularization, smaller weights; higher C = less regularization, larger weights
    # penalty: type of regularization function used for weight shrinkage / decay to prevent overfitting
    # (see the note on the regularized objective after this function)
    lr = LogisticRegression(C=C, random_state=0, penalty=penalty)
    lr.fit(X_train, y_train)

    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(lr.predict_proba(X_test[0:1])))
    print('Training accuracy:', lr.score(X_train, y_train))
    print('Test accuracy:', lr.score(X_test, y_test))
    print("y-intercept:" + str(lr.intercept_))
    print("coeffs:" + str(lr.coef_))

    try:
        plot_decision_regions(X_train, y_train.values, classifier=lr)
        plt.title('Logistic Regression')
        plt.xlabel(list(X.columns)[0])
        plt.ylabel(list(X.columns)[1])
        plt.legend(loc='upper left')
        plt.tight_layout()
        plt.savefig(IMG_ROOT + 'dow/log_reg_1.png', dpi=300)
        plt.close()
    except Exception as e:
        print("May have more than 2 variables")
    return lr
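
As the comments above note, scikit-learn's C is the inverse regularization strength: with the l2 penalty the solver minimizes roughly 0.5 * ||w||^2 + C * sum_i log(1 + exp(-y_i * (x_i . w + b))) with labels y_i in {-1, +1}, so a small C shrinks the weights harder while a large C lets the data dominate.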
Example #21
def ransac(df, xcols):
    # function to deal with outliers
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    ransac = RANSACRegressor(
        LinearRegression(),
        max_trials=100,
        min_samples=50,
        residual_metric=lambda x: np.sum(np.abs(x), axis=1),
        residual_threshold=5.0,
        random_state=0)

    ransac.fit(X, y)
    inlier_mask = ransac.inlier_mask_
    outlier_mask = np.logical_not(inlier_mask)
    line_X = np.arange(3, 10, 1)
    line_y_ransac = ransac.predict(line_X[:, np.newaxis])
    plt.scatter(X[inlier_mask],
                y[inlier_mask],
                c='blue',
                marker='o',
                label='Inliers')
    plt.scatter(X[outlier_mask],
                y[outlier_mask],
                c='lightgreen',
                marker='s',
                label='Outliers')
    plt.plot(line_X, line_y_ransac, color='red')
    plt.xlabel('x-val')
    plt.ylabel('Returns')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'ransac_fit.png', dpi=300)
    plt.close()
def principal_component_analysis(df, xcols):
    y = df['target']
    X = df[list(xcols)]
    
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=ts, random_state=0)
          
    cov_mat = np.cov(X_train.T)
    eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
    print('Eigenvalues \n%s' % eigen_vals)
    tot = sum(eigen_vals)
    var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
    cum_var_exp = np.cumsum(var_exp)
    
    plt.bar(range(1, len(var_exp) + 1), var_exp, alpha=0.5, align='center',
            label='individual explained variance')
    plt.step(range(1, len(cum_var_exp) + 1), cum_var_exp, where='mid',
             label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'pca1.png', dpi=300)
    plt.close()
    # plt.show()
    
    eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]
    eigen_pairs.sort(reverse=True)
    w = np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs[1][1][:, np.newaxis]))
    # print('Matrix W:\n', w)
    
    X_train_pca = X_train.dot(w)
    colors = ['r', 'b', 'g']
    markers = ['s', 'x', 'o']
    
    for l, c, m in zip(np.unique(y_train), colors, markers):
        plt.scatter(X_train_pca[y_train.values==l, 0], X_train_pca[y_train.values==l, 1], c=c, label=l, marker=m)
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'pca2.png', dpi=300)
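
The explained-variance ratios plotted above are just each eigenvalue divided by the eigenvalue sum, lambda_j / sum_k lambda_k, and the projection stacks the top-2 eigenvectors into W so that X_train_pca = X_train.dot(w). A quick hedged cross-check with scikit-learn's PCA (results should agree up to sign flips of the components):

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train_pca_skl = pca.fit_transform(X_train)
print(pca.explained_variance_ratio_)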
Example #23
def polynomial_regression(df, xcols):
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    lr = LinearRegression()
    pr = LinearRegression()
    quadratic = PolynomialFeatures(degree=2)
    X_quad = quadratic.fit_transform(X)
    # fit linear features
    lr.fit(X, y)
    X_fit = np.arange(-2, 50, 1)[:, np.newaxis]
    y_lin_fit = lr.predict(X_fit)

    # fit quadratic features
    pr.fit(X_quad, y)
    y_quad_fit = pr.predict(quadratic.fit_transform(X_fit))

    # plot results
    plt.scatter(X, y.values, label='training points')
    plt.plot(X_fit, y_lin_fit, label='linear fit', linestyle='--')
    plt.plot(X_fit, y_quad_fit, label='quadratic fit')
    plt.legend(loc='best')

    plt.tight_layout()
    plt.savefig(IMG_PATH + 'poly_regression.png', dpi=300)
    plt.close()

    y_lin_pred = lr.predict(X)
    y_quad_pred = pr.predict(X_quad)
    print('Training MSE linear: %.3f, quadratic: %.3f' % (mean_squared_error(
        y, y_lin_pred), mean_squared_error(y, y_quad_pred)))
    print('Training R^2 linear: %.3f, quadratic: %.3f' %
          (r2_score(y, y_lin_pred), r2_score(y, y_quad_pred)))
def logistic_regression_feature_importance(df, xcols, C=100, penalty='l2'):
    y = df['target']
    X = df[list(xcols)]
    
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=ts, random_state=0)

    feat_labels = df[list(xcols)].columns
    lr = LogisticRegression(C=C, random_state=0, penalty=penalty)
    lr.fit(X_train, y_train)
    importances = lr.coef_[0]
    indices = np.argsort(abs(importances))[::-1]

    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    
    plt.title('Feature Importances')
    plt.bar(range(X_train.shape[1]), importances[indices],
            color='lightblue', align='center')
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/logistic_regression_feat.png', dpi=300)
    plt.close()

    X_selected = lr.transform(X_train, threshold=0.05)
    print(X_selected.shape)

    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(lr.predict_proba(X_test[0:1])))
    print('Training accuracy:', lr.score(X_train, y_train))
    print('Test accuracy:', lr.score(X_test, y_test))
    print("y-intercept:" + str(lr.intercept_))
    print("coeffs:" + str(lr.coef_))
def random_forest(df, xcols, estimators=5):
    """
    Run random forest algorithm
    """
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=t_s,
                                                        random_state=0)

    forest = RandomForestClassifier(criterion='entropy',
                                    n_estimators=estimators,
                                    random_state=1,
                                    n_jobs=3)
    forest.fit(X_train, y_train)

    # Shows the percentage of falling into each class
    print("Class breakdowns: " + str(forest.predict_proba(X_test[0:1])))
    print('Training accuracy:', forest.score(X_train, y_train))
    print('Test accuracy:', forest.score(X_test, y_test))
    print("Feature Importances :" + str(forest.feature_importances_))

    plot_decision_regions(X_std, y.values, classifier=forest)
    plt.title('Random Forest (Decision Tree Ensemble)')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/random_forest.png', dpi=300)
    plt.close()
def nonlinear_svm(df, xcols, C=100, gamma=0.10):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    svm = SVC(kernel='rbf', random_state=0, gamma=gamma, C=C)
    svm.fit(X_train, y_train)

    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines - Non Linear')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_nonlinear_C' + str(C) + '.png', dpi=300)
    plt.close()
def linear_discriminant_analysis(df, xcols):
    y = df['target']
    X = df[list(xcols)]
    
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)
    
    np.set_printoptions(precision=4)
    mean_vecs = []
    y_set = list(y.unique())
    for label in y_set:
        mean_vecs.append(np.mean(X_train[y_train.values==label], axis=0))
        # print('MV %s: %s\n' %(label, mean_vecs[label-1]))
    
    d = len(xcols) # number of features
    S_W = np.zeros((d, d))
    for label,mv in zip(y_set, mean_vecs):
        class_scatter = np.zeros((d, d)) # scatter matrix for each class
        for row in X_train[y_train.values == label]:
            row, mv = row.reshape(d, 1), mv.reshape(d, 1) # make column vectors
            class_scatter += (row-mv).dot((row-mv).T)
        S_W += class_scatter                             # sum class scatter matrices
    print('Within-class scatter matrix: %s' % (S_W))
    print('Class label distribution: %s' % np.bincount(y_train))
    
    S_W = np.zeros((d, d))
    for label,mv in zip(y_set, mean_vecs):
        class_scatter = np.cov(X_train[y_train.values==label].T)
        S_W += class_scatter
    print('Scaled within-class scatter matrix: %s' % (S_W))
    
    mean_overall = np.mean(X_train, axis=0)
    d = len(xcols) # number of features
    S_B = np.zeros((d, d))
    for label, mean_vec in zip(y_set, mean_vecs):
        n = X_train[y_train.values == label].shape[0]
        mean_vec = mean_vec.reshape(d, 1) # make column vector
        mean_overall = mean_overall.reshape(d, 1) # make column vector
        S_B += n * (mean_vec - mean_overall).dot((mean_vec - mean_overall).T)
    print('Between-class scatter matrix: %s' % (S_B))
    
    eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]
    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)
    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    print('Eigenvalues in decreasing order:\n')
    for eigen_val in eigen_pairs:
        print(eigen_val[0])
    
    tot = sum(eigen_vals.real)
    discr = [(i / tot) for i in sorted(eigen_vals.real, reverse=True)]
    cum_discr = np.cumsum(discr)
    
    plt.bar(range(0, d), discr, alpha=0.5, align='center',
            label='individual \"discriminability\"')
    plt.step(range(0, d), cum_discr, where='mid',
             label='cumulative \"discriminability\"')
    plt.ylabel('\"discriminability\" ratio')
    plt.xlabel('Linear Discriminants')
    plt.ylim([-0.1, 1.1])
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda1.png', dpi=300)
    plt.close()
    
    w = np.hstack((eigen_pairs[0][1][:, np.newaxis].real,
                          eigen_pairs[1][1][:, np.newaxis].real))
    print('Matrix W:\n', w)
    
    X_train_lda = X_train.dot(w)
    colors = ['r', 'b', 'g']
    markers = ['s', 'x', 'o']
    
    for l, c, m in zip(np.unique(y_train), colors, markers):
        plt.scatter(X_train_lda[y_train.values==l, 0] * (-1), 
                    X_train_lda[y_train.values==l, 1] * (-1), 
                    c=c, label=l, marker=m)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda2.png', dpi=300)
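
In matrix form, the function builds the within-class scatter S_W = sum_c sum_{x in class c} (x - m_c)(x - m_c)^T (or, in the scaled variant, the sum of per-class covariance matrices), the between-class scatter S_B = sum_c n_c (m_c - m)(m_c - m)^T, and then takes the leading eigenvectors of inv(S_W).dot(S_B) as the linear discriminants; the plotted "discriminability" ratios are the sorted eigenvalues divided by their sum.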
def sbs_run(train_df, xcols, k_feats=1, est=KNeighborsClassifier(n_neighbors=3), test=pd.DataFrame(), name=None):
    """
    Starting from the full set, sequentially remove the feature x whose
    removal least reduces (or most increases) the predictive score.

    k_feats = number of chosen columns
    est = is the learning algorithm used to rank the features

    """
    y_val = train_df['target']
    x_val = train_df[list(xcols)]

    # Standardize and split the training and test data
    x_std = standardize(x_val)
    if test.empty:
        test_sz = 0.3
        x_train, x_test, y_train, y_test = train_test_split(x_std, y_val,
                                                            test_size=test_sz, random_state=0)
    else:
        x_train = x_std
        y_train = train_df['target']
        test = test[list(xcols)]
        x_test = standardize(test)
        y_test = test['target']

    # selecting features
    sbs = SBS(est, k_features=k_feats)
    sbs.fit(x_train, y_train)
    order = []
    if k_feats == 1:
        print("Removed Order, first to last: "
              "" + str(list(x_val.columns[sbs.removed_order + list(sbs.subsets_[-1])])))
        order = list(x_val.columns[sbs.removed_order + 
                                   list(sbs.subsets_[-1])])[::-1]
    else:
        print("Removed Order, first to last:" + str(list(x_val.columns[sbs.removed_order])))
        print("Chosen columns: " + str(list(x_val.columns[list(sbs.subsets_[-1])])))

    # plotting performance of feature subsets
    # This will chart the accuracy of each model as we remove features
    k_feat = [len(k) for k in sbs.subsets_]
    plt.plot(k_feat, sbs.scores_, marker='o')
    plt.ylim([0.0, 1.1])
    plt.ylabel('Accuracy')
    plt.xlabel('Number of features')
    plt.grid()
    plt.tight_layout()
    dt_time = dt.datetime.now().strftime("%Y%m%d_%H_%M")
    plt.savefig(IMG_ROOT + 'sbs_{}_{}.png'.format(name, dt_time), dpi=300)
    plt.close()

    # Training and test accuracy with all variables
    ks5 = list(sbs.subsets_[-1])
    est.fit(x_train, y_train)
    print("With all variables:")
    print('Training accuracy:', est.score(x_train, y_train))
    print('Test accuracy:', est.score(x_test, y_test))

    # Training and test accuracy with only chosen variables for model
    print("With only chosen (no:{}) variables:".format(k_feats))
    est.fit(x_train[:, ks5], y_train)
    print('Training accuracy:', est.score(x_train[:, ks5], y_train))
    print('Test accuracy:', est.score(x_test[:, ks5], y_test))
    return order
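
The SBS class used above is not shown in this listing. A minimal sketch of the sequential-backward-selection idea described in the docstring (greedily drop the feature whose removal hurts a held-out score the least), offered as an illustration rather than the project's SBS implementation:

from itertools import combinations

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def sbs_select(estimator, X, y, k_features):
    # Hold out a validation slice to score candidate feature subsets.
    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.25, random_state=1)
    dims = tuple(range(X.shape[1]))
    while len(dims) > k_features:
        scores, subsets = [], []
        for subset in combinations(dims, len(dims) - 1):
            estimator.fit(X_tr[:, subset], y_tr)
            scores.append(accuracy_score(y_va, estimator.predict(X_va[:, subset])))
            subsets.append(subset)
        # Keep the best-scoring subset, i.e. drop the least useful feature.
        dims = subsets[int(np.argmax(scores))]
    return dims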
def adaboost(df, xcols):
    y = df['target']
    X = df[list(xcols)]

    # Only needed for specific cases: positive results must be labeled 1
    y = y.map({4: 1, 0: 0})

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    tree = DecisionTreeClassifier(criterion='entropy',
                                  max_depth=1,
                                  random_state=0)

    ada = AdaBoostClassifier(base_estimator=tree,
                             n_estimators=500,
                             learning_rate=0.1,
                             random_state=0)

    tree = tree.fit(X_train, y_train)
    y_train_pred = tree.predict(X_train)
    y_test_pred = tree.predict(X_test)

    tree_train = accuracy_score(y_train, y_train_pred)
    tree_test = accuracy_score(y_test, y_test_pred)
    print('Decision tree train/test accuracies %.3f/%.3f' %
          (tree_train, tree_test))

    ada = ada.fit(X_train, y_train)
    y_train_pred = ada.predict(X_train)
    y_test_pred = ada.predict(X_test)

    ada_train = accuracy_score(y_train, y_train_pred)
    ada_test = accuracy_score(y_test, y_test_pred)
    print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))
    x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))

    f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3))

    for idx, clf, tt in zip([0, 1], [tree, ada],
                            ['Decision Tree', 'AdaBoost']):
        clf.fit(X_train, y_train)

        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        axarr[idx].contourf(xx, yy, Z, alpha=0.3)
        axarr[idx].scatter(X_train[y_train.values == 0, 0],
                           X_train[y_train.values == 0, 1],
                           c='blue',
                           marker='^')
        axarr[idx].scatter(X_train[y_train.values == 1, 0],
                           X_train[y_train.values == 1, 1],
                           c='red',
                           marker='o')
        axarr[idx].set_title(tt)

    axarr[0].set_ylabel(xcols[0], fontsize=12)
    plt.text(10.2, -1.2, s=xcols[1], ha='center', va='center', fontsize=12)
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'adaboost.png', bbox_inches='tight', dpi=300)
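
For context, each boosting round in the classic discrete AdaBoost formulation fits a stump to weighted samples, measures its weighted error err_m, gives it a vote alpha_m = ln((1 - err_m) / err_m), and multiplies the weights of misclassified samples by exp(alpha_m) before renormalizing; scikit-learn's AdaBoostClassifier follows this scheme up to its choice of algorithm (discrete 'SAMME' vs the probability-based 'SAMME.R') and the learning_rate scaling applied to alpha_m.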
Example #30
def nonlinear(df, xcols):
    y = df['target_proxy']
    X = df[list(xcols)[0]]
    X = np.transpose(np.array([X]))

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    regr = LinearRegression()

    # create quadratic features
    quadratic = PolynomialFeatures(degree=2)
    cubic = PolynomialFeatures(degree=3)
    X_quad = quadratic.fit_transform(X)
    X_cubic = cubic.fit_transform(X)

    # fit features
    X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]

    regr = regr.fit(X, y)
    y_lin_fit = regr.predict(X_fit)
    linear_r2 = r2_score(y, regr.predict(X))

    regr = regr.fit(X_quad, y)
    y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
    quadratic_r2 = r2_score(y, regr.predict(X_quad))

    regr = regr.fit(X_cubic, y)
    y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
    cubic_r2 = r2_score(y, regr.predict(X_cubic))

    # plot results
    plt.scatter(X, y, label='training points', color='lightgray')

    plt.plot(X_fit,
             y_lin_fit,
             label='linear (d=1), $R^2=%.2f$' % linear_r2,
             color='blue',
             lw=2,
             linestyle=':')

    plt.plot(X_fit,
             y_quad_fit,
             label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2,
             color='red',
             lw=2,
             linestyle='-')

    plt.plot(X_fit,
             y_cubic_fit,
             label='cubic (d=3), $R^2=%.2f$' % cubic_r2,
             color='green',
             lw=2,
             linestyle='--')

    plt.xlabel('x-val')
    plt.ylabel('Return')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'nonlinear_regr.png', dpi=300)
    plt.close()

    # transform features
    X_log = np.log(X)
    y_sqrt = np.sqrt(y)

    # fit features
    X_fit = np.arange(X_log.min() - 1, X_log.max() + 1, 1)[:, np.newaxis]
    regr = regr.fit(X_log, y_sqrt)
    y_lin_fit = regr.predict(X_fit)
    linear_r2 = r2_score(y_sqrt, regr.predict(X_log))

    # plot results
    plt.scatter(X_log, y_sqrt, label='training points', color='lightgray')
    plt.plot(X_fit,
             y_lin_fit,
             label='linear (d=1), $R^2=%.2f$' % linear_r2,
             color='blue',
             lw=2)

    plt.xlabel('x-val')
    plt.ylabel('Return')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'sqrt_log.png', dpi=300)
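
The final transform fits sqrt(y) = w0 + w1 * ln(x), i.e. it assumes the untransformed relationship is roughly y = (w0 + w1 * ln(x))^2; note that the R^2 reported for this fit is measured in the transformed sqrt(y) space rather than on the original returns, and np.log(X) will produce NaNs or -inf if any x values are non-positive.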