def do_ml(ticker):
    """Fit a k-nearest-neighbours classifier for ``ticker`` and report accuracy.

    Features and labels come from ``extract_featuresets(ticker)``.  A random
    25% of the samples is held out; the classifier is fitted on the remainder,
    and the held-out score plus the distribution of predicted classes are
    printed.

    Args:
        ticker: Passed through unchanged to ``extract_featuresets``.

    Returns:
        The value of ``clf.score`` on the held-out split.
    """
    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; the
    # splitter is provided by ``sklearn.model_selection``.
    from sklearn.model_selection import train_test_split

    # The third element returned by extract_featuresets is not needed here.
    X, y, _ = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    print('Predicted class counts:', Counter(clf.predict(X_test)))

    return confidence
Beispiel #2
0
forecast_col = 'Adj. Close'
# Replace missing values with a sentinel so no rows are lost at this stage.
data.fillna(-9999, inplace=True)
# Forecast horizon: 1% of the number of rows, rounded up.
forecast_out = int(math.ceil(0.01 * len(data)))

# Each row's label is the forecast column ``forecast_out`` rows later, so the
# last ``forecast_out`` rows end up with a NaN label.
data['label'] = data[forecast_col].shift(-forecast_out)

# Features are every column except the label.
X = np.array(data.drop(['label'], axis=1))
X = preprocessing.scale(X)
# Take the unlabeled tail first, then trim it off the training features;
# doing it the other way round would pick rows that do have labels.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Dropping the NaN-labelled rows keeps y aligned with the trimmed X.
data.dropna(inplace=True)
y = np.array(data['label'])

# ``cross_validate`` is a function and has no ``train_test_split`` attribute;
# the splitter itself is provided by ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)

forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)

data['Forecast'] = np.nan
last_date = data.iloc[-1].name
last_unix = last_date.timestamp()
one_day = 86400  # seconds
next_unix = last_unix + one_day

# NOTE(review): the loop body looks truncated -- ``next_unix`` is never
# advanced and ``next_date`` is never used in the visible code.
for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unix)
Beispiel #3
0
def XGB_native(train, test, features, features_non_numeric):
    """Train an XGBoost model on a local split, export predictions, plot importances.

    The target is modelled as ``log(goal + 1)``; predictions are clipped at
    zero and mapped back with ``exp(p) - 1``.  A 5% hold-out taken from
    ``train`` is used for early stopping and for the printed ``rmspe`` error.
    Predictions for ``test`` are written to a CSV file under ``result/``.
    When the module-level ``plot`` flag is truthy, a feature-map file and a
    feature-importance chart are written as well.

    Depends on names defined outside this function: ``goal``, ``myid``,
    ``plot``, ``rmspe`` and ``rmspe_xg``.

    Args:
        train: Training data indexed by ``features`` and ``goal``
            (presumably a pandas DataFrame -- confirm against the caller).
        test: Data to predict, indexed by ``features`` and ``myid``.
        features: Column names fed to the model.
        features_non_numeric: Not used by this function.

    Returns:
        None.
    """
    # ``cross_validate`` is a function and has no ``train_test_split``
    # attribute; the splitter is provided by ``sklearn.model_selection``.
    from sklearn.model_selection import train_test_split

    depth = 13
    eta = 0.01
    ntrees = 8000
    mcw = 3
    params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": eta,
        "max_depth": depth,
        "min_child_weight": mcw,
        "subsample": 0.9,
        "colsample_bytree": 0.7,
        "silent": 1
    }
    print("Running with params: " + str(params))
    print("Running with ntrees: " + str(ntrees))
    print("Running with features: " + str(features))

    # Train model with local split
    tsize = 0.05
    X_train, X_test = train_test_split(train, test_size=tsize)
    dtrain = xgb.DMatrix(X_train[features], np.log(X_train[goal] + 1))
    dvalid = xgb.DMatrix(X_test[features], np.log(X_test[goal] + 1))
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm = xgb.train(params,
                    dtrain,
                    ntrees,
                    evals=watchlist,
                    early_stopping_rounds=100,
                    feval=rmspe_xg,
                    verbose_eval=True)

    # Hold-out error, computed after clipping negative log-space predictions.
    train_probs = gbm.predict(xgb.DMatrix(X_test[features]))
    train_probs[train_probs < 0] = 0
    error = rmspe(np.exp(train_probs) - 1, X_test[goal].values)
    print(error)

    # Predict and Export
    test_probs = gbm.predict(xgb.DMatrix(test[features]))
    test_probs[test_probs < 0] = 0
    submission = pd.DataFrame({myid: test[myid], goal: np.exp(test_probs) - 1})
    os.makedirs('result/', exist_ok=True)
    submission.to_csv(
        "./result/dat-xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.csv" %
        (str(depth), str(eta), str(ntrees), str(mcw), str(tsize)),
        index=False)

    # Feature importance: everything below only makes sense when plotting.
    if not plot:
        return

    # One line per feature: "<index>\t<name>\tq".
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    # Normalise so the scores sum to 1.
    df['fscore'] = df['fscore'] / df['fscore'].sum()

    # Plotitup
    plt.figure()
    df.plot()
    df.plot(kind='barh',
            x='feature',
            y='fscore',
            legend=False,
            figsize=(25, 15))
    plt.title('XGBoost Feature Importance')
    plt.xlabel('relative importance')
    plt.gcf().savefig(
        'Feature_Importance_xgb_d%s_eta%s_ntree%s_mcw%s_tsize%s.png' %
        (str(depth), str(eta), str(ntrees), str(mcw), str(tsize)))
Beispiel #4
0
def do_ml(ticker):
    """Load the feature sets for ``ticker`` and split them 75/25 into train/test.

    NOTE(review): the visible body stops after the split, so nothing is
    fitted and the function returns None -- the rest appears to be missing.

    Args:
        ticker: Passed through unchanged to ``extract_featuresets``.
    """
    # ``cross_validate`` is a function and has no ``train_test_split``
    # attribute; the splitter is provided by ``sklearn.model_selection``.
    from sklearn.model_selection import train_test_split

    # Use the ``ticker`` parameter; ``tickers`` is not defined in this function.
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
def splitTestTrain(data, labels, percentage):
    """Split ``data`` and ``labels`` into train and test subsets.

    Args:
        data: Samples to split.
        labels: Targets aligned with ``data``.
        percentage: Forwarded as ``test_size`` to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    # ``cross_validate`` is a function and has no ``train_test_split``
    # attribute; the splitter is provided by ``sklearn.model_selection``.
    from sklearn.model_selection import train_test_split

    # The fixed seed keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=percentage, random_state=42)

    return X_train, X_test, y_train, y_test
Beispiel #6
0
# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Reduce the task to binary classification: the boolean mask ``y != 2``
# keeps only the samples whose class is not 2.
X, y = X[y != 2], y[y != 2]

# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# shuffle and split training and test sets
# ``cross_validate`` is a function and has no ``train_test_split`` attribute;
# the splitter is provided by ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=0)

# Learn to predict each class against the other.  The estimator gets its own
# name so the imported ``svm`` module is not overwritten.
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)

# The scores returned by decision_function() are the input to roc_curve().
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute ROC curve and ROC area
fpr, tpr, threshold = roc_curve(y_test, y_score)  # false/true positive rates
roc_auc = auc(fpr, tpr)  # area under the ROC curve

lw = 2
# A single figure is enough; a bare plt.figure() beforehand would only leave
# an empty extra figure behind.
plt.figure(figsize=(10, 10))
plt.plot(fpr,
# In[33]:

# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; the splitter
# is provided by ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=5)

# In[34]:

from sklearn import cross_validation

# In[35]:

from sklearn.model_selection import cross_validate

# In[36]:

# ``cross_validate`` is a function and has no ``train_test_split`` attribute;
# import the splitter itself from ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=5)

# In[37]:

from sklearn.model_selection import train_test_split

# In[38]:

# Hold out 33% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=5)

# In[39]:

# Shape of the training feature array.
print(X_train.shape)
Beispiel #8
0
print(modelSVM.score(X_test, Y_test))

# Linear SVM fitted on the full data set and scored on that same data.
# ``score`` compares each prediction with its own label, instead of comparing
# every prediction with the single label ``Y[1]`` and dividing by a
# hard-coded sample count.
modelSVMRaw = LinearSVC(C=0.1)
modelSVMRaw = modelSVMRaw.fit(X_new, Y)
print("Linear SVC score without split")
print(modelSVMRaw.score(X_new, Y))

# RBF-kernel SVM evaluated on a held-out split.
modelSVM2 = SVC(C=0.1, kernel='rbf')

# ``cross_validate`` is a function and has no ``train_test_split`` attribute;
# the splitter is provided by ``sklearn.model_selection``.
from sklearn.model_selection import train_test_split

# NOTE(review): train_size=0.1 with test_size=0.2 leaves most samples unused
# -- confirm this is intended.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    X_new, Y, test_size=0.2, train_size=0.1, random_state=0)
modelSVM2 = modelSVM2.fit(X_train1, Y_train1)
print("RBF score with split")
print(modelSVM2.score(X_test1, Y_test1))

# RBF-kernel SVM fitted on the full data set and scored on that same data.
modelSVMRaw2 = SVC(C=0.1, kernel='rbf')
modelSVMRaw2 = modelSVMRaw2.fit(X_new, Y)
print("RBF score without split")
print(modelSVMRaw2.score(X_new, Y))
# Only perform 2 algorithms