# Imports assumed available in the surrounding notebook (sklearn, pandas,
# numpy, IPython.display, and the course-provided renders module).
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.decomposition import PCA
import renders as rs


def do_pca(good_data, log_samples):
    # Apply PCA to the good data with the same number of dimensions as features (4 here)
    pca = PCA(n_components=4)
    pca.fit(good_data)

    # Apply a PCA transformation to the sample log-data
    pca_samples = pca.transform(log_samples)

    # Generate PCA results plot
    pca_results = rs.pca_results(good_data, pca)

    # Display sample log-data after having a PCA transformation applied
    print "Principal Components..."
    display(pd.DataFrame(np.round(pca_samples, 4), columns=pca_results.index.values))
    print "Explained variance ratios for all principal components: ", pca.explained_variance_ratio_

    # Fit PCA to the good data using only two dimensions
    pca = PCA(n_components=2)
    pca.fit(good_data)

    # Apply a PCA transformation to the good data
    reduced_data = pca.transform(good_data)

    # Apply a PCA transformation to the sample log-data
    pca_samples = pca.transform(log_samples)

    # Create a DataFrame for the reduced data
    reduced_data = pd.DataFrame(reduced_data, columns=['Dimension 1', 'Dimension 2'])

    # Display sample log-data after applying a PCA transformation in two dimensions
    display(pd.DataFrame(np.round(pca_samples, 4), columns=['Dimension 1', 'Dimension 2']))
    print "Explained variance ratios for top 2 principal components: ", pca.explained_variance_ratio_

    return pca, reduced_data, pca_samples
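# Minimal usage sketch for do_pca (synthetic stand-in data; in the original
# notebook the arguments are the cleaned log-scaled customer data and the
# log-scaled samples). Column names here are illustrative only, and running
# this requires the course-provided renders module.
_demo = pd.DataFrame(np.random.rand(50, 4), columns=['f1', 'f2', 'f3', 'f4'])
pca, reduced, samples_2d = do_pca(_demo, _demo.iloc[:3])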
ax.invert_yaxis()
ax.set_xlabel('Outlier points')
ax.set_title('Outlier points per category')
plt.show()

# Select the points that should be removed
outliers = list(set(outliersList))

# Remove the outliers
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop=True)

# Apply PCA with as many components as feature categories (6)
pca = PCA(n_components=6)
pca.fit(good_data)

# Generate the plot of the PCA dimensions
pca_results = rs.pca_results(good_data, pca)

# Apply PCA with two dimensions
pca = PCA(n_components=2)
pca.fit(good_data)

# Reduce the dimensionality of the data
reduced_data = pca.transform(good_data)

# DataFrame with the reduced data
reduced_data = pd.DataFrame(reduced_data, columns=['Dimension 1', 'Dimension 2'])

# Find a good number of clusters with the Elbow Method
from sklearn.cluster import KMeans  # needed for the elbow loop below
wcss = []
for i in range(1, 11):
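    # A minimal completion of the truncated elbow loop (sketch): fit KMeans
    # for each candidate k and record the within-cluster sum of squares.
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(reduced_data)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the bend ("elbow") suggests a good cluster count.
plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (inertia)')
plt.show()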
X_variables = df[['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']]
Y_variable = df['class']

from sklearn.preprocessing import StandardScaler
X_std1 = StandardScaler().fit_transform(X_variables)

from sklearn.decomposition import PCA
sklearn_pca = PCA(n_components=4)
Y_sklearn = sklearn_pca.fit_transform(X_std1)

import renders as rs

# Generate PCA results plot; rs.pca_results only takes a DataFrame,
# but X_std1 is a numpy array, so wrap it first
X_df = pd.DataFrame(X_std1, columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'])
pca_results = rs.pca_results(X_df, sklearn_pca)

print pca_results['Explained Variance'].cumsum()
# The first two components are enough to explain most of the variance

# DIMENSION REDUCTION
sklearn_pca = PCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std1)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(Y_sklearn, columns=['Dimension 1', 'Dimension 2'])
reduced_data['class'] = Y_variable
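# A quick visual check (sketch, not from the original cell): scatter the two
# retained dimensions, colored by iris class, to see how well they separate.
import matplotlib.pyplot as plt
for label, group in reduced_data.groupby('class'):
    plt.scatter(group['Dimension 1'], group['Dimension 2'], label=label, alpha=0.6)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend()
plt.show()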
print "Original data = ", log_data.shape[0] print "Data without outliers = ", good_data.shape[0] # TODO: Apply PCA to the good data with the same number of dimensions as features from sklearn.decomposition import PCA pca = PCA(n_components=data.shape[1]) pca_all = pca.fit(good_data) # TODO: Apply a PCA transformation to the sample log-data pca_samples = pca_all.transform(log_samples) # Generate PCA results plot # print type(rs) pca_results = rs.pca_results(good_data, pca_all) pca_results print "Cumulative explained variance\n" print pca_results['Explained Variance'].cumsum() print "\n" ## Check that data is de-meaned #good_data_demean = good_data - good_data.mean() #pca_results_demean = rs.pca_results(good_data_demean, pca) #pca_results_demean # The parameter 'whiten' in PCA will control the rescaling of data by their variance (really n_samples * singular values) # source: http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# Imports assumed by this function (the originals live elsewhere in the
# notebook): datetime for the default date range, pandas/numpy for the
# feature pipeline, and pandas_datareader for the Yahoo Finance download.
import datetime as dt
from datetime import datetime
import numpy as np
import pandas as pd
from pandas_datareader.data import get_data_yahoo


def predictStock(company, forcast=5, endTime=None, startTime=None, debug=False):
    """
    Predict [company]'s stock trend in [forcast] days by [endTime].
    Returns a RISE/FALL trend and a confidence estimate.
    """
    if startTime is None:
        startTime = datetime(2000, 1, 1)
    if endTime is None:
        endTime = dt.date.today()  # today() lives on dt.date

    # In[21]:

    # Introduce stock indices: nasdaq, dji, frankfurt, london, tokyo, hk, australia
    def getStockData(symbol, start, end, getVolume=False):
        """
        Downloads stock data from Yahoo Finance.
        Returns a pandas DataFrame with daily Adj Close (and Volume if requested).
        """
        print "retrieving data of", symbol
        df = get_data_yahoo(symbol, start, end)
        print "done"
        df.rename(columns={'Adj Close': 'AdjClose' + '_' + symbol}, inplace=True)
        ed = -2 if getVolume else -1
        return df.drop(df.columns[:ed], axis=1)

    # In[22]:

    ## Introduce indices:
    # NASDAQ Composite (^IXIC Yahoo Finance)
    # Dow Jones Industrial Average (^DJI Quandl)
    # Frankfurt DAX (^GDAXI Yahoo Finance)
    # London FTSE-100 (^FTSE Yahoo Finance)
    # Paris CAC 40 (^FCHI Yahoo Finance)
    # Tokyo Nikkei-225 (^N225 Yahoo Finance)
    # Hong Kong Hang Seng (^HSI Yahoo Finance)
    # Australia ASX-200 (^AXJO Yahoo Finance)
    ## -----------------------------------------------
    indices_list = [company, '^IXIC', '^DJI', '^GSPC', '^GDAXI', '^FTSE', '^N225', '^HSI', '^AXJO']
    raw_data = []
    raw_data.append(getStockData(indices_list[0], startTime, endTime, getVolume=True))
    volume_data = raw_data[0].Volume
    raw_data[0] = raw_data[0].drop(raw_data[0].columns[0], axis=1)
    for indice in indices_list[1:]:
        raw_data.append(getStockData(indice, startTime, endTime, getVolume=False))

    if debug:
        print '\n\n\n================GENERATE FEATURE DATA==============================\n'
        print 'no. of indicators (prices/indices) retrieved: ', len(raw_data)
        print '\n A glimpse of retrieved data: \n', raw_data[0].head(), '\n', raw_data[0].tail()

    # In[23]:

    ## define parameters
    ## -----------------------------------------------
    N_history = 13                               # look back N_history * forcast days' data to design features
    return_eval_list = [1, 2, 3, 5, 8, 13]       # calculate return w.r.t. the periods in this list
    avg_return_span_list = return_eval_list[1:]  # calculate avg return across the periods in this list
    volume_rolling_win = N_history * forcast     # volume statistics use this rolling window size
    PCA_n = 10                                   # no. of features for PCA

    ## merge stock prices and indices and generate past N_history day records as features
    ## -----------------------------------------------
    meg_data = pd.concat(raw_data, axis=1, join='inner')
    #print meg_data.head()
    for i in range(len(raw_data)):
        for j in range(1, 10):
            t = forcast * j
            meg_data[meg_data.columns.values[i] + str(t)] = meg_data.iloc[:, i].shift(t)

    if debug:
        print "\n Merged data with past N_history day records as features: \n", meg_data.tail()

    ## Generate return and average return over a list of periods to capture stock history movements
    ## -----------------------------------------------
    returnList = forcast * np.array(return_eval_list)
    avgReturnList = forcast * np.array(avg_return_span_list)
    if debug:
        print "\n return calculation period list \n", returnList
        print "\n avg. return calculation period list \n", avgReturnList
    return_data = pd.DataFrame()
    return_data['return'] = meg_data.iloc[:, 0].pct_change(returnList[0])
    for i in returnList[1:]:
        return_data['return' + str(i)] = meg_data.iloc[:, 0].pct_change(i)
    for i in avgReturnList:
        return_data['avgReturn' + str(i)] = return_data['return'].rolling(i).mean()
    if debug:
        print "\n return_data \n", return_data.tail()

    ## Normalize volume data against its rolling mean and standard deviation
    ## -----------------------------------------------
    volume_norm = (volume_data - volume_data.rolling(volume_rolling_win).mean()) / volume_data.rolling(volume_rolling_win).std()
    if debug:
        print "\n rolling normalized volume \n", volume_norm.tail()

    # In[24]:

    import renders as rs
    from IPython.display import display
    from sklearn.decomposition import PCA

    ## PCA
    ## -----------------------------------------------
    # TODO: Scale the data using the natural logarithm
    meg_data = meg_data.dropna()
    log_data = np.log(meg_data)
    #print "log data: ", log_data

    # TODO: Apply PCA by fitting the good data with the same number of dimensions as features
    pca = PCA(n_components=PCA_n)
    pca.fit(log_data)
    if debug:
        print '\n\n\n================PCA==============================\n'
        print "PCA explained variance ratio = ", pca.explained_variance_ratio_

    # Generate PCA results plot
    pca_results = rs.pca_results(log_data, pca)

    # TODO: Transform the data using the PCA fit above
    reduced_data = pca.transform(log_data)
    reduced_data = pd.DataFrame(reduced_data, index=log_data.index)
    #print reduced_data.shape, reduced_data

    ## Combine the overall data set
    ## -----------------------------------------------
    data = pd.concat([reduced_data, return_data, volume_norm], axis=1, join='inner')
    data = data.dropna()
    #print data

    ## Generate labels
    ## -----------------------------------------------
    y_raw = return_data['return'].shift(-returnList[0])
    comp = lambda x: (1 if x > 0 else -1)
    y_raw = y_raw.apply(comp)
    if debug:
        print "\n y_raw: \n", y_raw

    x_endtime = data.iloc[-1]  # features of the last day, used for the final stock prediction
    data = data.drop(data.index[-returnList[0]:], axis=0)  # drop the last forcast days, useless for model training
    y = pd.Series(y_raw, index=data.index)  # align labels to data indices
    if debug:
        print "\n data for train/test: ", data.shape, "\n", data.tail(volume_rolling_win)
        print "\n stock return:", data['return']
        print "\n labels: ", y.shape, "\n", y  #.tail(volume_rolling_win)

    # In[25]:

    ## Split the data into training and testing sets
    ## -----------------------------------------------
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=42)
    if debug:
        print "y_test: \n", y_test

    # In[26]:

    ### implement some useful functions
    ### ================================================
    #get_ipython().magic(u'matplotlib inline')
    from time import time
    from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt
    from sklearn.metrics import make_scorer
    from sklearn.grid_search import GridSearchCV

    def fa_score(ytrue, ypred, pos_label=1):
        # Conservative score: the worse of F1 and accuracy
        return min(f1_score(ytrue, ypred, pos_label=1), accuracy_score(ytrue, ypred))

    def train_classifier(clf, X_train, y_train, gridSearch=False, parameters=None):
        ''' Fits a classifier to the training data.
            Can configure CV GridSearch. '''
        # Start the clock, train the classifier, then stop the clock
        start = time()
        print X_train.shape, y_train.shape
        if not gridSearch:
            clf.fit(X_train, y_train)
        else:
            f1_scorer = make_scorer(f1_score, pos_label=1)
            accu_scorer = make_scorer(accuracy_score)
            fa_scorer = make_scorer(fa_score)
            grid_obj = GridSearchCV(clf, parameters, scoring=fa_scorer)
            grid_obj.fit(X_train, y_train)
            print "GridSearch Best Parameters: ", grid_obj.best_params_, '=', grid_obj.best_score_
            clf = grid_obj.best_estimator_
        end = time()

        # Print the results
        print "Trained model in {:.4f} seconds".format(end - start)
        return clf

    def predict_labels(clf, features, target):
        ''' Makes predictions using a fit classifier. '''
        # Start the clock, make predictions, then stop the clock
        start = time()
        y_pred = clf.predict(features)
        end = time()

        # Print and return results
        print "Made predictions in {:.4f} seconds.".format(end - start)
        #print "labels: \n", target.values
        #print "preds: \n", y_pred
        return f1_score(target.values, y_pred, pos_label=1), accuracy_score(target.values, y_pred), \
               recall_score(target.values, y_pred), precision_score(target.values, y_pred)

    def train_predict(clf, X_train, y_train, X_test, y_test, gridSearch=False, parameters=None):
        ''' Trains and predicts using a classifier, reporting the F1 score. '''
        # Indicate the classifier and the training set size
        print "\n\nTraining a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train))

        # Train the classifier
        clf = train_classifier(clf, X_train, y_train, gridSearch, parameters)

        # Print the results of prediction for both training and testing
        f1train, atrain, rtrain, ptrain = predict_labels(clf, X_train, y_train)
        print "For training set:", '\naccuracy =', atrain, '\nprecision =', ptrain, '\nrecall =', rtrain, '\nf1 =', f1train
        f1test, atest, rtest, ptest = predict_labels(clf, X_test, y_test)
        print "For testing set:", '\naccuracy =', atest, '\nprecision =', ptest, '\nrecall =', rtest, '\nf1 =', f1test
        return clf, min(f1test, atest)

    def plotROC(clf, X_test, y_test):
        # Determine the false positive and true positive rates
        print "output labels belong to: ", clf.classes_
        fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])

        # Calculate the AUC
        roc_auc = auc(fpr, tpr)
        print 'ROC AUC: %0.2f' % roc_auc

        # Plot the ROC curve
        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")
        plt.show()

    def compBestClf(clf_cand, score_cand, clf_best, score_best):
        # Keep whichever classifier scores higher
        if score_cand > score_best:
            score_best = score_cand
            clf_best = clf_cand
        return clf_best, score_best
    ### ================================================//

    # In[27]:

    ### train and fit a set of models and select the best
    ### ================================================
    # Variables to compare and find the best classifier model
    clf_best = None
    score_best = 0

    # In[28]:

    ## AdaBoost Classifier
    ## -----------------------------------------------
    from sklearn.ensemble import AdaBoostClassifier

    clf_Bst = AdaBoostClassifier(n_estimators=20)
    param_Bst = {'n_estimators': [2, 3, 5, 10, 20, 50], 'learning_rate': [0.1, 0.5, 1]}
    clf_Bst, f1_Bst = train_predict(clf_Bst, X_train, y_train, X_test, y_test,
                                    gridSearch=True, parameters=param_Bst)
    #plotROC(clf_Bst, X_test, y_test)
    # compare against best clf
    clf_best, score_best = compBestClf(clf_Bst, f1_Bst, clf_best, score_best)

    # In[29]:

    ## Random Forest Classifier
    ## -----------------------------------------------
    from sklearn.ensemble import RandomForestClassifier

    clf_RF = RandomForestClassifier(n_estimators=10, n_jobs=-1)
    param_RF = {'n_estimators': [5, 10, 20, 40, 80, 160], 'max_depth': [2, 3, 4, 8]}
    clf_RF, f1_RF = train_predict(clf_RF, X_train, y_train, X_test, y_test,
                                  gridSearch=True, parameters=param_RF)
    #plotROC(clf_RF, X_test, y_test)
    # compare against best clf
    clf_best, score_best = compBestClf(clf_RF, f1_RF, clf_best, score_best)

    # In[30]:

    ## Naive Bayes Classifier
    ## -----------------------------------------------
    from sklearn.naive_bayes import GaussianNB

    clf_NB = GaussianNB()
    clf_NB, f1_NB = train_predict(clf_NB, X_train, y_train, X_test, y_test)
    #plotROC(clf_NB, X_test, y_test)
    # compare against best clf
    clf_best, score_best = compBestClf(clf_NB, f1_NB, clf_best, score_best)

    # In[31]:

    ## Decision Tree Classifier
    ## -----------------------------------------------
    from sklearn.tree import DecisionTreeClassifier

    clf_DT = DecisionTreeClassifier(random_state=0, min_samples_split=80)
    param_DT = {'min_samples_split': [2, 5, 10, 50, 100, 200], 'max_depth': [2, 3, 5, 8]}
    clf_DT, f1_DT = train_predict(clf_DT, X_train, y_train, X_test, y_test,
                                  gridSearch=True, parameters=param_DT)
    #plotROC(clf_DT, X_test, y_test)
    # compare against best clf
    clf_best, score_best = compBestClf(clf_DT, f1_DT, clf_best, score_best)

    # In[32]:

    ## SVM Classifier
    ## -----------------------------------------------
    from sklearn import svm

    clf_SVM = svm.SVC(kernel='poly', probability=True)
    param_SVM = {'C': [1, 6, 36], 'gamma': [0.5, 2, 4], 'degree': [2, 3]}
    #clf_SVM, f1_SVM = train_predict(clf_SVM, X_train, y_train, X_test, y_test, gridSearch=True, parameters=param_SVM)  # [SLOW!!!]
    #plotROC(clf_SVM, X_test, y_test)
    # compare against best clf
    #clf_best, score_best = compBestClf(clf_SVM, f1_SVM, clf_best, score_best)  # [SLOW!!!]

    # In[33]:

    ## KNN Classifier
    ## -----------------------------------------------
    from sklearn.neighbors import KNeighborsClassifier

    clf_KNN = KNeighborsClassifier(n_neighbors=4)
    param_KNN = {'n_neighbors': [2, 4, 8, 16, 32]}
    clf_KNN, f1_KNN = train_predict(clf_KNN, X_train, y_train, X_test, y_test,
                                    gridSearch=True, parameters=param_KNN)
    #plotROC(clf_KNN, X_test, y_test)
    # compare against best clf
    clf_best, score_best = compBestClf(clf_KNN, f1_KNN, clf_best, score_best)

    # In[34]:

    ## Output classifier model selection summary and the best prediction result
    ## -----------------------------------------------
    print "\n\n===================MODEL TRAIN END======================="
    print "Best Classifier is: \n", clf_best
    print "Best Classifier's Fa Score on test: ", score_best
    if debug:
        plotROC(clf_best, X_test, y_test)

    x_endtime_row = x_endtime.values.reshape(1, -1)
    pred_v = clf_best.predict(x_endtime_row)
    pred_p = clf_best.predict_proba(x_endtime_row)
    pred_str = "RISE" if pred_v[0] > 0 else "FALL"
    print "Predict: \n", company, "stock after", forcast, "days of", x_endtime.name, \
        "\nis going to (with confidence FALL/RISE =", pred_p, "):", pred_str

    return pred_str, pred_p, score_best
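# Minimal usage sketch (assumption: network access to Yahoo Finance and the
# course-provided renders module; 'AAPL' is just an illustrative ticker).
if __name__ == '__main__':
    trend, confidence, test_score = predictStock('AAPL', forcast=5, debug=False)
    print "Predicted trend:", trend
    print "Class probabilities [FALL, RISE]:", confidence
    print "Model score on held-out test data:", test_score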