def pca_predict(self, sample_name):
    """Project a sample CSV onto the PCA model saved by a previous fit.

    Reads ``self.local_path + sample_name`` (a header-less CSV), encodes the
    categorical columns with the mapping stored in ``user.dict``, standardises
    the features, applies the pickled PCA model from ``pca.model`` and writes
    the projection to ``self.feature_path + sample_name``.

    Side effects: sets ``self.df_user_server``, ``self.pca``,
    ``self.pca_result`` and ``self.pca_summary``.

    Args:
        sample_name: file name of the sample, relative to ``self.local_path``;
            the same name is reused for the output file.

    Raises:
        KeyError: if a ``sex``/``state``/``education`` value is missing from
            the stored mapping.
    """
    feature_columns = ["sex", "age", "state", "education",
                       "transitDuration", "fulfillDuration",
                       "green", "blue", "black", "yellow", "red", "white",
                       "amount"]
    csv_columns = ["orderid", "age", "sex", "state", "education",
                   "transitDuration", "fulfillDuration",
                   "black", "blue", "green", "yellow", "red", "white",
                   "amount"]

    raw = pd.read_csv(self.local_path + sample_name, names=csv_columns)
    # Selecting the feature columns both discards ``orderid`` and puts the
    # columns in the order the model was fitted with.  (The original called
    # ``drop(columns=['orderid'])`` and threw the result away.)
    self.df_user_server = raw[feature_columns].copy()

    # Pickle data is binary, so the files are opened in 'rb' mode.
    # NOTE(review): pickle.load executes arbitrary code from the file; these
    # files must only ever come from a trusted location.
    with open(self.output_path + "user.dict", 'rb') as dict_file:
        dic = pickle.load(dict_file)

    # Replace each categorical value by its stored code; unknown values
    # raise KeyError, exactly as a plain dictionary lookup does.
    for column in ("sex", "state", "education"):
        mapping = dic[column]
        self.df_user_server[column] = self.df_user_server[column].apply(
            lambda value, mapping=mapping: mapping[value])

    with open(self.output_path + "pca.model", 'rb') as model_file:
        self.pca = pickle.load(model_file)

    # NOTE(review): the scaler is fitted on the sample being predicted, not
    # on the data the PCA model was fitted on, so the standardisation differs
    # between fit and predict.  Fixing this needs the training scaler to be
    # persisted alongside the model.
    scaler = StandardScaler()
    standardized = scaler.fit_transform(self.df_user_server.values)
    self.pca_result = self.pca.transform(standardized)
    print(self.pca_result)

    self.pca_summary = vs.pca_results(self.df_user_server, self.pca,
                                      self.plot_path)
    np.savetxt(self.feature_path + sample_name, self.pca_result,
               delimiter=",", header="pca1,pca2", comments='')
def pca(self):
    """Fit a two-component PCA on the standardised user features and save it.

    Reads the feature columns from ``self.df_user_server``, standardises
    them, fits ``PCA(n_components=2)`` and stores:

    * ``self.pca`` - the fitted model, also pickled to ``pca.model``;
    * ``self.pca_result`` - the projected data, also written to ``pca.csv``
      under ``self.feature_path``;
    * ``self.pca_summary`` - the value returned by ``vs.pca_results``, also
      pickled to ``pca_summary.csv``.

    NOTE(review): assigning ``self.pca`` replaces this method on the
    instance, so it cannot be called twice on the same object.  The attribute
    name is kept because ``pca_predict`` sets the same attribute.
    """
    feature_columns = ["sex", "age", "state", "education",
                       "transitDuration", "fulfillDuration",
                       "green", "blue", "black", "yellow", "red", "white",
                       "amount"]
    features = self.df_user_server[feature_columns]

    standardized = StandardScaler().fit_transform(features.values)
    self.pca = PCA(n_components=2)
    self.pca_result = self.pca.fit_transform(standardized)
    self.pca_summary = vs.pca_results(features, self.pca, self.plot_path)

    # Pickle data is binary, so the files are opened in 'wb' mode; the
    # context managers close them even if dumping fails.
    with open(self.output_path + "pca.model", 'wb') as model_file:
        pickle.dump(self.pca, model_file)
    # Despite its ".csv" name this file holds a pickle; the name is kept so
    # existing readers still find it.
    with open(self.output_path + "pca_summary.csv", 'wb') as summary_file:
        pickle.dump(self.pca_summary, summary_file)

    np.savetxt(self.feature_path + "pca.csv", self.pca_result,
               delimiter=",", header="pca1,pca2", comments='')
# Drop the rows listed in `outliers` (if any) and renumber what remains
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop=True)

from sklearn.decomposition import PCA

# Six-component PCA (one component per feature), fitted on the cleaned data
pca = PCA(n_components=6, copy=True).fit(good_data)

# Project the sample rows with that six-component fit
pca_samples = pca.transform(log_samples)

# Plot/tabulate the components
pca_results = vs.pca_results(good_data, pca)

# Show the projected samples, labelled with the index of the results table
display(pd.DataFrame(np.round(pca_samples, decimals=4),
                     columns=pca_results.index.values))

# Two-component PCA, again fitted on the cleaned data
pca = PCA(n_components=2, copy=True).fit(good_data)

# Project the sample rows and the cleaned data with the two-component fit
pca_samples = pca.transform(log_samples)
reduced_data = pca.transform(good_data)
# Import sklearn.decomposition.PCA and assign the result of fitting PCA in
# six dimensions with good_data to pca.
# Apply a PCA transformation of the sample log-data log_samples using
# pca.transform, and assign the result to pca_samples.
from sklearn.decomposition import PCA

# Apply PCA by fitting the good data with the same number of dimensions as features
pca = PCA(n_components=6)
pca = pca.fit(good_data)

# Transform the sample log-data using the PCA fit above.
# (The original transformed good_data here, contradicting this step's own
# description; the samples are what must be projected.)
pca_samples = pca.transform(log_samples)

# Generate PCA results plot
pca_results = vs.pca_results(good_data, pca)

# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2)
pca = pca.fit(good_data)

# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)

# Transform the sample log-data using the PCA fit above
pca_samples = pca.transform(log_samples)

# Create a DataFrame for the reduced data
# Independent variables fed to PCA, in the order used for fitting.
_FEATURE_COLUMNS = [
    'Open', 'High', 'Low', 'Range', 'Adj Close', 'Volume', 'MA5 Adj Close',
    'MA5 Volume', 'MA5 Adj Close pct_change', 'MA5 Volume pct_change'
]

# Prediction targets carried through next to the PCA dimensions.
_TARGET_COLUMNS = [
    'Adj Close 1day', 'Adj Close 5day', 'Adj Close 1day pct_change',
    'Adj Close 5day pct_change', 'Adj Close 1day pct_change cls',
    'Adj Close 5day pct_change cls'
]

# Skewness measurements recorded by the original author (rounded to three
# decimals here).  They appear to be the triples printed by the diagnostic
# plots: skew of the raw data, then skew after boxcox1p / after log, for
# each choice of scaler.  The raw skew is the same for every scaler.
#
#   column                       raw   | none: boxcox     log | MinMax: boxcox    log | Standard: boxcox   log
#   Adj Close 1day              1.217 |       -0.066  -0.249 |          0.991 -0.609 |            0.727 -0.646
#   Adj Close 5day              1.215 |       -0.068  -0.251 |          0.989 -0.612 |            0.726 -0.649
#   Adj Close 1day pct_change  -1.651 |       -3.042 -55.255 |         -3.721 -56.219 |         -23.430 -61.023
#   Adj Close 5day pct_change  -0.941 |       -1.833 -29.740 |         -1.929 -30.171 |          -7.476 -39.192
#   Open                       -5.812 |       -9.146 -62.030 |        -11.082 -62.609 |         -43.763 -65.001
#   High                        2.712 |        2.570  -0.925 |          2.110  -1.501 |           0.959  -2.388
#   Low                        -2.566 |       -2.748 -33.436 |         -3.292 -39.978 |          -7.669 -47.996
#   Range                       2.071 |        1.939   0.114 |          1.640  -0.559 |           0.695  -0.730
#   Adj Close                   1.217 |       -0.065  -0.248 |          0.991  -0.609 |           0.728  -0.646
#   Volume                      2.964 |       -0.053  -0.054 |          2.163  -0.447 |           0.746  -0.604
#   MA5 Adj Close               1.216 |       -0.063  -0.246 |          0.990  -0.619 |           0.728  -0.660
#   MA5 Volume                  1.612 |       -0.132  -0.132 |          1.221  -0.875 |           0.479  -0.976
#   MA5 Adj Close pct_change   -1.560 |       -2.560 -37.907 |         -2.934 -39.231 |         -11.379 -47.849
#   MA5 Volume pct_change       6.266 |        1.370  -1.539 |          4.067  -0.656 |           0.934  -1.872
#
# Choices the author derived from them:
#   no scaler, no transform : all pct_change targets, both 'cls' targets,
#                             Open, Low, MA5 Adj Close pct_change
#   no scaler, boxcox       : Adj Close 1day, Adj Close 5day, Adj Close,
#                             MA5 Adj Close, MA5 Volume, Volume
#   no scaler, log          : Range
#   StandardScaler, boxcox  : High
#   MinMaxScaler, log       : MA5 Volume pct_change


def _scale_column(scaler, series):
    """Fit `scaler` on one column and return the scaled values as a 1-D array."""
    # scikit-learn scalers require 2-D input, so the column is reshaped to
    # (n_samples, 1) and flattened again afterwards.
    return scaler.fit_transform(series.values.reshape(-1, 1)).ravel()


def _plot_transform_diagnostics(series, col_label, lam, scaler=None):
    """Plot raw, Box-Cox and log distributions of one column and print their skew."""
    data = series if scaler is None else _scale_column(scaler, series)
    # Shift so the minimum is zero; boxcox1p and log need non-negative input.
    if np.min(data) < 0:
        data = data - np.min(data)

    panels = [
        (data, ' Distribution'),
        (boxcox1p(data, lam), ' Box-Cox Transformed'),
        (np.log(data + lam), ' Log Transformed'),
    ]
    fig, axes = plt.subplots(ncols=3, figsize=(15, 6))
    for ax, (values, title_suffix) in zip(axes, panels):
        sns.distplot(values, fit=norm, ax=ax)
        mu, sigma = norm.fit(values)
        ax.legend([
            r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(
                mu, sigma), 'Skewness: {:.2f}'.format(skew(values))
        ],
                  loc='best')
        ax.set_title(col_label + title_suffix)
    axes[0].set_ylabel('Frequency')
    print(skew(panels[0][0]), skew(panels[1][0]), skew(panels[2][0]))
    plt.show()


def _plot_correlation_heatmap(frame, title, figsize=None, **heatmap_kwargs):
    """Draw an annotated correlation heatmap of `frame` and show it."""
    if figsize is not None:
        plt.subplots(figsize=figsize)
    g = sns.heatmap(frame.corr(), annot=True, annot_kws={'size': 8},
                    **heatmap_kwargs)
    g.set_yticklabels(g.get_yticklabels(), rotation=0, fontsize=8)
    g.set_xticklabels(g.get_xticklabels(), rotation=90, fontsize=8)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def _plot_pca_diagnostics(train, pca, pca_frame):
    """Show the PCA component plot, cumulative explained variance and heatmap."""
    vs.pca_results(train, pca)
    plt.show()

    cumulative = np.cumsum(pca.explained_variance_ratio_)
    xs = np.arange(1, len(cumulative) + 1)
    plt.plot(xs, cumulative, '-o')
    for x, y in zip(xs, cumulative):
        plt.annotate('{:.2f}%'.format(y * 100),
                     xy=(x, y),
                     xytext=(30, -20),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     bbox=dict(boxstyle='round,pad=0.5', fc='yellow',
                               alpha=0.5),
                     arrowprops=dict(arrowstyle='->',
                                     connectionstyle='arc3,rad=0'))
    plt.ylabel('Cumulative Explained Variance')
    plt.xlabel('Dimensions')
    plt.title('PCA - Total Explained Variance by # fo Dimensions')
    plt.tight_layout()
    plt.show()

    # The original referenced an undefined name `temp` here; the PCA frame
    # (dimensions plus targets) is presumably what was meant - TODO confirm.
    _plot_correlation_heatmap(pca_frame, 'PCA Correlation Matrix/Heatmap')


def extract_data(ticker, pca=None, show_diagnostics=False):
    """Load one ticker's Yahoo CSV and build features, targets and a PCA view.

    Reads ``'Data/' + ticker + '-yahoo.csv'`` (indexed by its ``Date``
    column, using ``Open``, ``High``, ``Low``, ``Close``, ``Adj Close`` and
    ``Volume``), derives intraday and 5-day moving-average features plus
    1-day / 5-day future-price targets, drops rows containing NaN, applies
    the per-column transforms listed in the comment table above and projects
    the features with PCA.

    Args:
        ticker: symbol used to build the CSV file name.
        pca: an already fitted PCA to reuse; when ``None`` a new
            ``PCA(n_components=7)`` is fitted on this ticker's features.
        show_diagnostics: when true, display the exploratory distribution,
            correlation and PCA plots that the original kept behind
            ``if False:`` switches.  Defaults to ``False`` (no plots).

    Returns:
        ``(pca, PCA_data_yahoo, data_yahoo)``: the PCA object, a frame with
        one ``'Dimension i'`` column per component plus the target columns,
        and the transformed original frame.
    """
    # None of the feature engineering depends on whether a PCA was supplied.
    data_yahoo = pd.read_csv('Data/' + ticker + '-yahoo.csv',
                             index_col='Date')
    data_yahoo.index = pd.to_datetime(data_yahoo.index)

    # --- independent features describing the trading day -----------------
    # Statement order matters: each column is derived from the still
    # untouched raw columns it needs before those are overwritten.
    # Range: high/low distance relative to the open price.
    data_yahoo['Range'] = (data_yahoo['High'] -
                           data_yahoo['Low']) / data_yahoo['Open']
    # High and Low as a fraction of the open.
    data_yahoo['High'] = data_yahoo['High'] / data_yahoo['Open'] - 1
    data_yahoo['Low'] = data_yahoo['Low'] / data_yahoo['Open'] - 1
    # Open as a fraction of the previous day's close.
    data_yahoo['Open'] = data_yahoo['Open'] / data_yahoo['Close'].shift(1) - 1
    # Moving averages over the previous 5 days (shifted to exclude today).
    data_yahoo['MA5 Adj Close'] = data_yahoo['Adj Close'].rolling(
        window=5).mean().shift(1)
    data_yahoo['MA5 Volume'] = data_yahoo['Volume'].rolling(
        window=5).mean().shift(1)
    # Today's value relative to those moving averages.
    data_yahoo['MA5 Adj Close pct_change'] = data_yahoo[
        'Adj Close'] / data_yahoo['MA5 Adj Close'] - 1
    data_yahoo['MA5 Volume pct_change'] = data_yahoo['Volume'] / data_yahoo[
        'MA5 Volume'] - 1

    # --- targets -----------------------------------------------------------
    for horizon in (1, 5):
        price = 'Adj Close {}day'.format(horizon)
        change = price + ' pct_change'
        # Future price, its relative change and the direction (1 = up/flat).
        data_yahoo[price] = data_yahoo['Adj Close'].shift(-horizon)
        data_yahoo[change] = data_yahoo[price] / data_yahoo['Adj Close'] - 1
        data_yahoo[change + ' cls'] = data_yahoo[change].apply(
            lambda x: 1 if x >= 0 else 0)

    data_yahoo.dropna(axis=0, how='any', inplace=True)

    lam = 0.0001

    if show_diagnostics:
        # Distributions of the targets (unscaled) and of the features
        # (standard-scaled), each before the transforms below are applied.
        for col_label in _TARGET_COLUMNS:
            _plot_transform_diagnostics(data_yahoo[col_label], col_label, lam)
        for col_label in _FEATURE_COLUMNS:
            _plot_transform_diagnostics(data_yahoo[col_label], col_label, lam,
                                        scaler=StandardScaler())

    # --- per-column transforms (none of these depend on PCA) --------------
    # Columns not mentioned here are deliberately left untouched.
    # No scaler, Box-Cox transform:
    for col_name in [
            'Adj Close 1day', 'Adj Close 5day', 'Adj Close', 'MA5 Adj Close',
            'MA5 Volume', 'Volume'
    ]:
        data_yahoo[col_name] = boxcox1p(data_yahoo[col_name], lam)

    # No scaler, log transform:
    data_yahoo['Range'] = np.log(data_yahoo['Range'] + lam)

    # StandardScaler, Box-Cox transform.
    # NOTE(review): unlike the diagnostic plots, the scaled values are not
    # shifted to be non-negative here, so any standardised value below -1
    # would turn into NaN under boxcox1p - confirm this cannot occur.
    data_yahoo['High'] = boxcox1p(
        _scale_column(StandardScaler(), data_yahoo['High']), lam)

    # MinMaxScaler, log transform:
    data_yahoo['MA5 Volume pct_change'] = np.log(
        _scale_column(MinMaxScaler(), data_yahoo['MA5 Volume pct_change']) +
        lam)

    if show_diagnostics:
        print(data_yahoo.head(20))
        _plot_correlation_heatmap(
            data_yahoo,
            'Correlation Matrix/Heatmap Numerical Features vs. Targets',
            figsize=(12, 9),
            vmax=0.9,
            square=True)

    # --- PCA ---------------------------------------------------------------
    train = data_yahoo[_FEATURE_COLUMNS]
    if pca is None:
        pca = PCA(n_components=7)
        projected = pca.fit_transform(train)
    else:
        projected = pca.transform(train)

    # Label one column per component actually produced, so a caller-supplied
    # PCA with a different number of components also works.
    PCA_data_yahoo = pd.DataFrame(
        projected,
        columns=['Dimension ' + str(i)
                 for i in range(1, projected.shape[1] + 1)])
    # The PCA frame has a fresh 0..n-1 index, so targets are copied by
    # position rather than by date.
    for target in _TARGET_COLUMNS:
        PCA_data_yahoo[target] = data_yahoo[target].values

    if show_diagnostics:
        _plot_pca_diagnostics(train, pca, PCA_data_yahoo)

    # Export the PCA, the PCA dataset and the transformed original dataset.
    return pca, PCA_data_yahoo, data_yahoo
tpvafs[recurrentvafs3] = l3 else: tpvafs[recurrentvafs3] = [0, 0, vaf3] df = pd.DataFrame.from_items(tpvafs.items(), orient='index', columns=['1', '2', '3']) display(df.describe()) df = df.astype(float) display(df.describe()) pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(14, 8), diagonal='kde') pca = PCA(n_components=3) # random_state only available from 0.18.0 onwards pca.fit(df) pca_results = vs.pca_results(df, pca) #vs.biplot(df, reduced_data, pca) #pca = PCA(n_components=2) #pca.fit(df) reduced_data = pca.transform(df) #reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2']) #components=[2,3,4,5,6,7,8,9,10,11,12,14,20] components = [9] All_Scores = [] for i in range(len(components)): clusterer = GMM(n_components=components[i], init_params='random').fit(reduced_data) # TODO: Predict the cluster for each data point preds = clusterer.predict(reduced_data)
# Everything from `october_1` onwards is held out for testing
testing_features = features[october_1:]
testing_labels = labels[october_1:]

# Generate reduced features using PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
reduced_training_features = pca.fit_transform(training_features)
# Project the test set with the components learned from the training set.
# (The original called fit_transform here, which re-fitted PCA on the test
# data and put the two reduced sets in different coordinate systems.)
reduced_testing_features = pca.transform(testing_features)

# Visualize first 10 principal components of features
import visuals as vs
featuresDF = pd.DataFrame(data=features)
pca = PCA(n_components=10)
pca.fit(featuresDF)
pca_samples = pca.transform(featuresDF)
pca_results = vs.pca_results(featuresDF, pca)

# Benchmark Model: always predict the mean of the training labels
from sklearn.metrics import mean_absolute_error
preds = [np.mean(training_labels)] * len(testing_labels)
# Single-argument print calls produce the same text on Python 2 and 3.
print('Benchmark Results:')
print('MAE: %s' % mean_absolute_error(testing_labels, preds))

# Linear Regression on the full (unreduced) feature set
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(training_features, training_labels)
preds = lin_reg.predict(testing_features)
score = lin_reg.score(testing_features, testing_labels)
print('Linear Regression Results:')
print('R2 score: %s' % score)
# Remove the data points listed in `outliers`.  `drop` returns a new frame,
# so `log_data` itself is left untouched.
good_data = log_data.drop(log_data.index[outliers])

#####################################################################################################
# Apply PCA by fitting the good data with the same number of dimensions as features
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
# Fit only: good_data must stay the original feature frame.  The original
# overwrote it with its own PCA projection, so every later step worked on
# rotated data without feature labels.
pca.fit(good_data)

# Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)

# Generate PCA results plot from the feature frame the PCA was fitted on,
# as the other PCA snippets in this file do.
pca_results = vs.pca_results(good_data, pca)

#####################################################################################################
# Display sample log-data after having a PCA transformation applied
display(
    pd.DataFrame(np.round(pca_samples, 4),
                 columns=pca_results.index.values))

#####################################################################################################
# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2)

# Transform the good data using the PCA fit above
reduced_data = pca.fit_transform(good_data)