def PCA_f(X, soglia):
    """Transform a data set via principal component analysis.

    Inputs:
    X      -- the data set as a DataFrame
    soglia -- the minimum fraction of the original explained variance that the
              reduced data set must retain

    Outputs:
    X_df_transformed -- DataFrame of principal components; the number of
                        components is determined by the soglia threshold
    """
    # Standardize, then rescale every feature to [0, 1]
    X_std = StandardScaler().fit_transform(X)
    X_std = (X_std - X_std.min(axis=0)) / (X_std.max(axis=0) - X_std.min(axis=0))
    X = pd.DataFrame(data=X_std, index=None, columns=X.columns)

    pca = PCA(n_components=len(X.columns))  # keep the maximum number of components
    pca.fit(X)
    X_transformed = pca.transform(X)

    # Cumulative explained-variance ratio for 1..n components
    var_cumulata = np.array([
        pca.explained_variance_ratio_[:i].sum()
        for i in range(1, len(X.columns) + 1)
    ]).round(2)

    # Number of components needed to reach the threshold
    idx_ok = np.argmax(var_cumulata >= soglia) + 1

    pca_names = ["PCA " + str(i + 1) for i in range(idx_ok)]
    X_df_transformed = pd.DataFrame(data=X_transformed[:, :idx_ok],
                                    index=None, columns=pca_names)
    return X_df_transformed
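# A minimal usage sketch for PCA_f. The DataFrame below is hypothetical and the
# snippet assumes the names PCA_f relies on (pandas as pd, numpy as np, and
# sklearn's StandardScaler / PCA) are importable at module level.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(100, 5)), columns=list("abcde"))
reduced = PCA_f(demo, soglia=0.9)  # keep enough components for 90% of the variance
print(reduced.shape, list(reduced.columns))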
def analyze_regression(X, y, ax, cPoint, cSLine, metrics):
    try:
        _X = StandardScaler().fit_transform(X.values.reshape(-1, 1))
        _y = StandardScaler().fit_transform(y.values.reshape(-1, 1))
        _linear = LinearRegression().fit(_X, _y)
        _predict = _linear.predict(_X)

        ax.scatter(_X, _y, color=cPoint, marker='.', alpha=.6, label='Scatter')

        # Draw the regression line between the leftmost and rightmost points
        # (ravel so that index() searches a flat list of floats)
        _pn = _X.ravel().tolist().index(_X.min())
        _px = _X.ravel().tolist().index(_X.max())
        ax.plot((_X.min(), _X.max()), (_predict[_pn], _predict[_px]),
                color=cSLine, label='Regression line')

        ax.set_title('Linear Regression')
        ax.set_xticks(())
        ax.set_yticks(())
        ax.legend()

        if metrics:
            _print_regressionMetrics(_linear, _X, _y, _predict)
    except Exception as e:
        return e
def qt_analysis(res):
    # Keep only the quantitative columns
    qt = res.drop(res.columns.intersection(['N2', 'dt', 'f']), axis=1)
    qt.hist()
    plt.show()

    qt_std = StandardScaler().fit_transform(qt)
    qt_max = qt_std.max(0)  # per-column max/min (currently unused; renamed to avoid shadowing built-ins)
    qt_min = qt_std.min(0)

    # Fraction of standardized values falling outside one standard deviation
    skew = sum(qt_std > 1) / qt_std.shape[0] + \
           sum(qt_std < -1) / qt_std.shape[0]
    return skew
def analyze_classification(X, y, ax, cContourf, cInlier, cOutlier, gamma, metrics):
    try:
        _X = StandardScaler().fit_transform(X.values.reshape(-1, 1))
        _y = StandardScaler().fit_transform(y.values.reshape(-1, 1))

        _padding = CONTOURF_CLASSIFICATION_MINING_PADDING
        _mesh_step_size = CONTOURF_CLASSIFICATION_MINING_MESH_STEP_SIZE
        _X_min, _X_max = _X.min() - _padding, _X.max() + _padding
        _y_min, _y_max = _y.min() - _padding, _y.max() + _padding
        _mapx, _mapy = meshgrid(arange(_X_min, _X_max, _mesh_step_size),
                                arange(_y_min, _y_max, _mesh_step_size))

        if gamma == 0:
            gamma = 'auto'
        # Note: recent scikit-learn versions no longer accept random_state for OneClassSVM
        _classifier = OneClassSVM(kernel='rbf', gamma=gamma,
                                  random_state=0).fit(c_[_X, _y])
        _Z = _classifier.decision_function(c_[_mapx.ravel(), _mapy.ravel()])
        _predict = _classifier.predict(c_[_X, _y])

        ax.contourf(_mapx, _mapy, _Z.reshape(_mapx.shape), cmap=cContourf, alpha=.7)

        # Split points into inliers / outliers according to the SVM prediction
        _sub_XIn, _sub_XOut = list(), list()
        _sub_yIn, _sub_yOut = list(), list()
        for i in range(_predict.size):
            if _predict[i] == 1:
                _sub_XIn.append(_X[i])
                _sub_yIn.append(_y[i])
            else:
                _sub_XOut.append(_X[i])
                _sub_yOut.append(_y[i])

        ax.scatter(_sub_XIn, _sub_yIn, c=cInlier, marker='.', alpha=.6, label='Inliers')
        ax.scatter(_sub_XOut, _sub_yOut, c=cOutlier, marker='.', alpha=.6, label='Outliers')
        ax.set_title('SVM')
        ax.set_xticks(())
        ax.set_yticks(())
        ax.legend()

        if metrics:
            _print_classificationMetrics(_classifier, _predict)
    except Exception as e:
        return e
def standardize(array, name):
    """Receives a DataFrame or Series (from pandas) and returns a numpy array
    with zero mean and unit variance."""
    # Transform to a numpy array (as_matrix() was removed in recent pandas; use to_numpy())
    nparray = array.to_numpy().reshape(array.shape[0], 1).astype('float32')
    print('------------')
    print(name)
    print('Different values before:', np.unique(nparray).shape[0])

    # Standardize the data
    nparray = StandardScaler().fit_transform(nparray)

    # Print some information
    print('Mean:', nparray.mean())
    print('Max:', nparray.max())
    print('Min:', nparray.min())
    print('Std:', nparray.std())
    print('Different values after:', np.unique(nparray).shape[0])
    return nparray
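# A minimal usage sketch for standardize. The Series below is hypothetical and the
# snippet assumes numpy, pandas and StandardScaler are imported as in the function above.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

ages = pd.Series([22, 35, 58, 41, 29], name='age')
ages_std = standardize(ages, 'age')  # prints summary stats, returns a (5, 1) float32 array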
# Load the ratings file (user id, movie id, rating)
df = pd.read_table('u.data', names=col_names, usecols=col_names[0:3], dtype=np.int32)

# Process ratings and save to file
user_ratings = np.zeros([N, M])
for i in range(0, N):  # for each user in the dataset
    # For each rating of a single user, centre and normalise the data
    u = df.loc[df['user id'] == (i + 1)]
    temp = np.array([k for j, k in zip(u['movie id'], u['rating'])])
    temp = temp.reshape(-1, 1)
    temp = StandardScaler(with_std=False).fit_transform(X=temp)  # mean-centre only
    temp = temp.reshape(len(temp))
    min_r = temp.min()
    max_r = temp.max()
    x = 0
    for j, k in zip(u['movie id'], u['rating']):
        # Store (existing) ratings in the array row, rescaled to [0, 1]; empty cells stay 0
        user_ratings[i, (j - 1)] = np.interp(temp[x], [min_r, max_r], [0, 1])
        x += 1
np.save('user_ratings.npy', user_ratings)
df = df.drop(columns=col_names[0:3])  # drop() returns a copy, so reassign

# Process user ids
# A one-hot encoded set of ids equals the NxN identity matrix
user_ids = np.identity(N, dtype=np.int32)
np.save('user_ids.npy', user_ids)

# Split train data into 5 folds and save results to file
train_ind = []
def Plot_Decision_Boundaries_2D(X1, X2, y, Estimators, Test_Size=0.3,
                                Random_State=None, Scale=True,
                                Colour_Map=plt.cm.coolwarm,
                                Bright_Colour_Map=plt.cm.afmhot,
                                Alpha_Train=1, Alpha_Test=0.6,
                                Certainty_Threshold=None,
                                Variable_Names=("Variable1", "Variable2"),
                                Delta=0.02):

    def Return_Most_Certain_Classification_Data(X, y, Model, Certainty_Thresh=0,
                                                Fit_First=False):
        if Fit_First:
            Model = Model.fit(X, y)
        if hasattr(Model, "predict_proba"):
            probabilities = Model.predict_proba(X)
        elif hasattr(Model, "decision_function"):
            probabilities = Model.decision_function(X)
        certainty_bool = np.amax(probabilities, axis=1) > Certainty_Thresh
        certain_predictors, certain_response = X[certainty_bool], y[certainty_bool]
        print("Old number of samples:", len(y))
        print("New number of samples:", len(certain_response))
        return certain_predictors, certain_response

    if Certainty_Threshold is not None:
        X_Combined = np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1)))
        # Note: the original referenced an undefined name `Estimator` here; use the
        # first estimator when a list is passed.
        First_Estimator = Estimators[0] if isinstance(Estimators, (list, np.ndarray)) else Estimators
        X, y = Return_Most_Certain_Classification_Data(
            X_Combined, y, Model=First_Estimator,
            Certainty_Thresh=Certainty_Threshold, Fit_First=True)
        X1, X2 = X[:, 0], X[:, 1]

    # Define a class bijection for class colour mapping
    unique_classes, y_bijection = np.unique(y, return_inverse=True)

    # Sort the data so colour labels match up with actual labels
    X1, X2 = X1.reshape((-1, 1)), X2.reshape((-1, 1))
    y_bijection = y_bijection.reshape((-1, 1))
    Full_combined = np.hstack((X1, X2, y_bijection))
    Full_combined = Full_combined[Full_combined[:, 2].argsort()]
    X1, X2 = Full_combined[:, 0].reshape((-1, 1)), Full_combined[:, 1].reshape((-1, 1))
    y_bijection = Full_combined[:, 2].reshape((-1, 1))

    # Preprocess the data if needed
    X1, X2 = StandardScaler().fit_transform(X1), StandardScaler().fit_transform(X2)

    delta = Delta  # step size in the mesh

    figure = plt.figure(figsize=(12, 8))
    x1_min, x1_max = X1.min() - 0.5, X1.max() + 0.5
    x2_min, x2_max = X2.min() - 0.5, X2.max() + 0.5
    xx, yy = np.meshgrid(np.arange(x1_min, x1_max, delta),
                         np.arange(x2_min, x2_max, delta))

    # Plot the given data (colourmap)
    col_map = Colour_Map
    col_map_bright = Bright_Colour_Map

    # Ready a train/test split
    Full_combined = np.hstack((X1, X2, y_bijection))
    X_train, X_test, y_train, y_test = train_test_split(
        Full_combined[:, [0, 1]], Full_combined[:, 2],
        test_size=Test_Size, random_state=Random_State)

    # Get a figure and axes based on how many estimators there are (one or multiple)
    # Multiple estimators
    if isinstance(Estimators, (list, np.ndarray)):
        n_rows = len(Estimators)
        fig, axes = plt.subplots(nrows=n_rows, ncols=2, sharex=True, sharey=True,
                                 figsize=(12, n_rows * 4))
    # One estimator
    else:
        Estimators = np.array([Estimators])
        fig, axes = plt.subplots(1, 2, figsize=(12, 8))
        axes = np.array([axes])

    for axs, Estimator in zip(axes[:], Estimators):
        ax1, ax2 = axs[0], axs[1]
        ax1.set_title("Input Data")

        # Plot training data
        scat = ax1.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                           cmap=col_map_bright, edgecolors='k', alpha=Alpha_Train)
        # And testing data
        ax1.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                    cmap=col_map_bright, edgecolors='k', alpha=Alpha_Test)
        ax1.set_xlim(xx.min(), xx.max())
        ax1.set_ylim(yy.min(), yy.max())
        ax1.set_xlabel(Variable_Names[0])
        ax1.set_ylabel(Variable_Names[1])

        # Now for the classifier
        model = Estimator.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a colour to each
        # point in the mesh [x1_min, x1_max] x [x2_min, x2_max].
        if hasattr(model, "decision_function"):
            Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
        elif hasattr(model, "predict_proba"):
            Z = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])
        else:
            print("This Estimator doesn't have a decision_function attribute "
                  "and can't predict probabilities")

        Z = np.argmax(Z, axis=1)
        Z_uniques = np.unique(Z)
        unique_predictions = unique_classes[Z_uniques]

        # Put the result in a colour plot
        Z = Z.reshape(xx.shape)
        contour = ax2.pcolormesh(xx, yy, Z, vmin=Z.min(), vmax=Z.max(),
                                 cmap=col_map, alpha=0.7)

        # Plot also the training data
        ax2.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                    cmap=col_map_bright, edgecolors='k', alpha=Alpha_Train)
        # And testing data
        ax2.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                    cmap=col_map_bright, edgecolors='k', alpha=Alpha_Test)

        ax2.set_xlim(xx.min(), xx.max())
        ax2.set_ylim(yy.min(), yy.max())
        ax2.set_xlabel(Variable_Names[0])
        ax2.set_ylabel(Variable_Names[1])
        ax2.set_title(str(Estimator))
        ax2.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                 size=15, horizontalalignment='right')

        cb1 = plt.colorbar(scat, spacing="proportional", ax=ax1,
                           ticks=np.arange(len(unique_classes)))
        cb1.ax.set_yticklabels(unique_classes)

        print("Unique Predictions: {}".format(unique_classes[Z_uniques]),
              "for: {}".format(Estimator))

        ticks = np.linspace(Z.min(), Z.max(), len(unique_predictions))
        cb2 = plt.colorbar(contour, spacing="proportional", ax=ax2, ticks=ticks)
        cb2.ax.set_yticklabels(unique_predictions)

        # Also print the score of the model
        print("Model Score:", score, "\n")

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.suptitle("Data and Classification Boundaries", fontsize=20)
    return fig
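# A hypothetical usage sketch for Plot_Decision_Boundaries_2D. The toy data set and
# the two classifiers below are illustrative assumptions, not part of the original
# code; the snippet also assumes the numpy / matplotlib / sklearn imports the
# function itself relies on are present at module level.
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_blobs(n_samples=300, centers=3, random_state=0)
fig = Plot_Decision_Boundaries_2D(
    X_demo[:, 0], X_demo[:, 1], y_demo,
    Estimators=[LogisticRegression(), DecisionTreeClassifier(max_depth=4)],
    Variable_Names=("feature 1", "feature 2"))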
def heatmap(x, row_header, column_header, row_method, column_method,
            row_metric, column_metric, color_gradient, html_folder):
    """
    This code is based in large part on the prototype methods:
    http://old.nabble.com/How-to-plot-heatmap-with-matplotlib--td32534593.html
    http://stackoverflow.com/questions/7664826/how-to-get-flat-clustering-corresponding-to-color-clusters-in-the-dendrogram-cre

    x is an m by n ndarray, m observations, n genes
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        ### Define the color gradient to use based on the provided name
        n = len(x[0])
        m = len(x)
        if color_gradient == 'red_white_blue':
            cmap = plt.cm.bwr
        if color_gradient == 'red_black_sky':
            cmap = RedBlackSkyBlue()
        if color_gradient == 'red_black_blue':
            cmap = RedBlackBlue()
        if color_gradient == 'red_black_green':
            cmap = RedBlackGreen()
        if color_gradient == 'yellow_black_blue':
            cmap = YellowBlackBlue()
        if color_gradient == 'seismic':
            cmap = plt.cm.seismic
        if color_gradient == 'green_white_purple':
            cmap = plt.cm.PiYG_r
        if color_gradient == 'coolwarm':
            cmap = plt.cm.coolwarm

        ### Scale the max and min colors so that 0 is white/black
        x = StandardScaler().fit_transform(x)
        vmin = x.min()
        vmax = x.max()
        vmax = max([vmax, abs(vmin)])
        vmin = vmax * -1
        norm = mpl.colors.Normalize(vmin, vmax)  ### adjust the max and min to scale these colors

        ### Scale the Matplotlib window size
        default_window_height = 7
        default_window_width = 11
        fig = plt.figure(figsize=(default_window_width, default_window_height))  ### could use m, n to scale here
        color_bar_w = 0.015  ### sufficient size to show

        ## Calculate positions for all elements
        # ax1, placement of dendrogram 1, on the left of the heatmap
        [ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6]  ### second value controls the position of the matrix relative to the bottom of the view
        width_between_ax1_axr = -0.004
        height_between_ax1_axc = -0.004  ### distance between the top color bar axis and the matrix

        # axr, placement of row side colorbar
        [axr_x, axr_y, axr_w, axr_h] = [0.31, 0.1, color_bar_w, 0.6]  ### second to last controls the width of the side color bar - 0.015 when showing
        axr_x = ax1_x + ax1_w + width_between_ax1_axr
        axr_y = ax1_y
        axr_h = ax1_h
        width_between_axr_axm = -0.004

        # axc, placement of column side colorbar
        [axc_x, axc_y, axc_w, axc_h] = [0.4, 0.63, 0.5, color_bar_w]  ### last one controls the height of the top color bar - 0.015 when showing
        axc_x = axr_x + axr_w + width_between_axr_axm
        axc_y = ax1_y + ax1_h + height_between_ax1_axc
        height_between_axc_ax2 = -0.004

        # axm, placement of heatmap for the data matrix
        [axm_x, axm_y, axm_w, axm_h] = [0.4, 0.9, 2.5, 0.5]
        axm_x = axr_x + axr_w + width_between_axr_axm
        axm_y = ax1_y
        axm_h = ax1_h
        axm_w = axc_w

        # ax2, placement of dendrogram 2, on the top of the heatmap
        [ax2_x, ax2_y, ax2_w, ax2_h] = [0.3, 0.72, 0.6, 0.15]  ### last one controls the height of the dendrogram
        ax2_x = axr_x + axr_w + width_between_axr_axm
        ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
        ax2_w = axc_w

        # axcb - placement of the color legend
        [axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.07]

        # Compute and plot the top dendrogram
        if column_method is not None:
            d2 = dist.pdist(x.T)
            D2 = dist.squareform(d2)
            ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=False)
            Y2 = sch.linkage(D2, method=column_method, metric=column_metric)  ### array-clustering metric - 'average', 'single', 'centroid', 'complete'
            Z2 = sch.dendrogram(Y2)
            ind2 = sch.fcluster(Y2, 0.7 * max(Y2[:, 2]), 'distance')  ### this is the default behavior of dendrogram
            ax2.set_xticks([])  ### hides ticks
            ax2.set_yticks([])
        else:
            ind2 = ['NA'] * len(column_header)  ### used for exporting the flat cluster data

        # Compute and plot the left dendrogram
        if row_method is not None:
            d1 = dist.pdist(x)
            D1 = dist.squareform(d1)  # full matrix
            ax1 = fig.add_axes([ax1_x + 0.005, ax1_y, ax1_w, ax1_h], frame_on=False)  # frame_on may be False
            Y1 = sch.linkage(D1, method=row_method, metric=row_metric)  ### gene-clustering metric - 'average', 'single', 'centroid', 'complete'
            Z1 = sch.dendrogram(Y1, orientation='right')
            ind1 = sch.fcluster(Y1, 0.7 * max(Y1[:, 2]), 'distance')  ### this is the default behavior of dendrogram
            ax1.set_xticks([])  ### hides ticks
            ax1.set_yticks([])
        else:
            ind1 = ['NA'] * len(row_header)  ### used for exporting the flat cluster data

        # Plot the distance matrix
        axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])  # axes for the data matrix
        xt = x
        if column_method is not None:
            idx2 = Z2['leaves']  ### apply the clustering for the array-dendrograms to the actual matrix data
            xt = xt[:, idx2]
            # ind2 = ind2[:, idx2]  ### reorder the flat cluster to match the order of the leaves of the dendrogram
        if row_method is not None:
            idx1 = Z1['leaves']  ### apply the clustering for the gene-dendrograms to the actual matrix data
            xt = xt[idx1, :]  # xt is transformed x
            # ind1 = ind1[idx1, :]  ### reorder the flat cluster to match the order of the leaves of the dendrogram

        ### taken from http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python/3011894#3011894
        im = axm.matshow(xt, aspect='auto', origin='lower', cmap=cmap, norm=norm)  ### norm=norm added to scale coloring of expression with zero = white or black
        axm.set_xticks([])  ### hides x-ticks
        axm.set_yticks([])

        # Add text
        new_row_header = []
        new_column_header = []
        for i in range(x.shape[0]):
            if row_method is not None:
                if len(row_header) < 200:  ### don't visualize gene associations when 200 rows or more
                    if len(row_header) < 20:
                        fontsize = 15
                    else:
                        fontsize = 200 / len(row_header)
                    axm.text(x.shape[1] - 0.5, i - 0.1, ' ' + row_header[idx1[i]], fontsize=fontsize)
                new_row_header.append(row_header[idx1[i]])
            else:
                if len(row_header) < 200:  ### don't visualize gene associations when 200 rows or more
                    if len(row_header) < 20:
                        fontsize = 8
                    else:
                        fontsize = 200 / len(row_header)
                    axm.text(x.shape[1] - 0.5, i - 0.1, ' ' + row_header[i], fontsize=fontsize)  ### when not clustering rows
                new_row_header.append(row_header[i])

        for i in range(x.shape[1]):
            if column_method is not None:
                axm.text(i, -0.55, ' ' + column_header[idx2[i]], rotation=315, verticalalignment="top")  # rotation could also be degrees
                new_column_header.append(column_header[idx2[i]])
            else:  ### when not clustering columns
                axm.text(i, -0.55, ' ' + column_header[i], rotation=315, verticalalignment="top")
                new_column_header.append(column_header[i])

        # Plot column side colors
        # axc --> axes for column side colorbar
        # if column_method != None:
        #     axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])  # axes for column side colorbar
        #     cmap_c = mpl.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
        #     dc = np.array(ind2, dtype=int)
        #     dc.shape = (1, len(ind2))
        #     im_c = axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
        #     axc.set_xticks([])  ### hides ticks
        #     axc.set_yticks([])

        # Plot row side colors
        # axr --> axes for row side colorbar
        # if row_method != None:
        #     axr = fig.add_axes([axr_x + 0.005, axr_y, axr_w - 0.005, axr_h])  # axes for row side colorbar
        #     dr = np.array(ind1, dtype=int)
        #     dr.shape = (len(ind1), 1)
        #     cmap_r = mpl.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
        #     im_r = axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
        #     axr.set_xticks([])  ### hides ticks
        #     axr.set_yticks([])

        # Plot the color legend
        axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False)  # axes for colorbar
        cb = mpl.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal')
        cb.set_ticks([vmin, 0, vmax])
        axcb.set_title("Normalized Expression")

        # Save figures
        plt.savefig(os.path.join(html_folder, 'Heatmap.png'), dpi=300)
        plt.savefig(os.path.join(html_folder, 'Heatmap.svg'))

        # Create the HTML output file
        html_file = """<!DOCTYPE html>
<html>
<head>
<title>Heatmap</title>
<style>
* { font-family:Arial, Helvetica, sans-serif; }
</style>
</head>
<body style="width:100%">
<img style="width:100%" src="Heatmap.png" alt="Heatmap">
<a href="index.html"><b>Back</b></a>
</body>
</html>
"""
        with open(os.path.join(html_folder, 'Heatmap.html'), 'w') as F:
            F.write(html_file)
if index == 0:
    plt.title(name, size=18)
plt.ylabel(str(k) + ' clusters', color='k', size=18)

colors = np.array(
    list(
        islice(
            cycle([
                '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                '#984ea3', '#999999', '#e41a1c', '#dede00'
            ]), int(max(y_pred) + 1))))
# plt.scatter(X, np.zeros_like(X) + 0., s=10, color=colors[y_pred])

minX, maxX = np.floor(X.min()), np.ceil(X.max())
bins_plot = np.linspace(minX, maxX, axisplot)
hist, bins = np.histogram(X, bins=bins_plot)
minY, maxY = hist.min(), hist.max()
width = np.diff(bins)
center = (bins[:-1] + bins[1:]) / 2
hist[np.abs(center).min() == np.abs(center)] = 0

color = []
print(name)
for p in range(len(hist)):
    c = y_pred[np.logical_and(X > bins[p], X < bins[p + 1])[:, 0]]
    if np.size(c) == 0:
        color.append(0)
        # print('-')
    final_df.loc[indicesToKeep, "LDA 1"],
    final_df.loc[indicesToKeep, "LDA 2"],
    c=color,
    s=50,
)
ax.legend(targets)
ax.grid()
# -

# <a name="3-3"></a>
# ## 3.3 Non-Negative Matrix Factorization (NMF)
# Source: [Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html)
#
# Find two non-negative matrices (W, H) whose product approximates the non-negative
# matrix X. This factorization can be used, for example, for dimensionality reduction,
# source separation or topic extraction.

print(X.max())
print(X.min())

# In order to use NMF, our data cannot contain negative values. For that reason, we will
# use `MinMaxScaler` from sklearn, which scales the data to a given range, for example (0, 1).
#
# `MinMaxScaler` is equivalent to the code below:
#
# ```python
# X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_scaled = X_std * (max - min) + min
# ```

# +
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
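# A minimal sketch of the next step (hypothetical: it assumes `X` is the numeric feature
# matrix inspected above and picks 2 components purely for illustration): scale to [0, 1]
# with the `scaler` defined above, then factorize with sklearn's NMF.
from sklearn.decomposition import NMF

X_scaled = scaler.fit_transform(X)  # now all values lie in [0, 1], so NMF is applicable
nmf = NMF(n_components=2, init='nndsvda', random_state=0, max_iter=500)
W = nmf.fit_transform(X_scaled)     # sample representation (n_samples x 2)
H = nmf.components_                 # component loadings (2 x n_features)
print(W.shape, H.shape, nmf.reconstruction_err_)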
new_rows_list = []
for row in csv_reader:
    row[1] = labels[i]
    i = i + 1
    new_rows_list.append(row)
# for data in new_rows_list:
#     data[i][1] = ''.join(labels[i])
#     i = i + 1
print(new_rows_list)

with open('Crops_MIR.csv', 'w', newline='') as write_csv:
    csv_writer = csv.writer(write_csv)
    csv_writer.writerows(new_rows_list)

cols = ['Name', 'Labels', 'Water Require', 'Temp', 'Moisture', 'Production']
data = pd.read_csv(r'Crops_MIR.csv', names=cols)
y = data['Labels']

# Min-max normalise the features before LDA
X_norm = (X - X.min()) / (X.max() - X.min())
lda = LDA(n_components=1)
lda_transformed = pd.DataFrame(lda.fit_transform(X_norm, y))
# print(lda_transformed)

for i in range(3):
    plt.scatter(lda_transformed[y == i], data[y == i]['Water Require'], color=colmap[i])
plt.show()
# min_required = min(data[y==2]['Water Require'])

# Get current data from the farm to predict the list of next possible crops
blob = bucket.get_blob("Current_Data.csv")
blob.download_to_filename("currentcropdata.csv")
cols1 = ['Humidity', 'Temperature', 'Distance', 'Moisture']
# Note: this still reads Crops_MIR.csv, not the currentcropdata.csv file just downloaded
current_data = pd.read_csv(r'Crops_MIR.csv', names=cols1)
def residual_plot(model, X, Y):
    """Plot residual plots for a regressor.

    X, Y : np.ndarray
    model : estimator object. Should have 'fit' and 'predict' methods.
    """
    x_train, x_test, y_train, y_test = split_data(X, Y)
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    res_train = y_train - y_pred_train
    res_test = y_test - y_pred_test

    fig, [ax0, ax1] = plt.subplots(2, 1, figsize=(14, 10))

    # Tableau 20 colour palette, rescaled to [0, 1] RGB
    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
                 (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
                 (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
    tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20]

    # %matplotlib inline  (IPython magic; not valid inside a function body)

    # Definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02
    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    ################################
    # Residual plot for the training set
    x = StandardScaler().fit_transform(y_pred_train.reshape(-1, 1))
    y = StandardScaler().fit_transform(res_train.reshape(-1, 1))

    fig1 = plt.figure(figsize=(14, 10))
    fig1.suptitle('Residual plot for training set')

    # Start with a rectangular figure
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # The scatter plot
    axScatter.scatter(x, y, color=tableau20[0], alpha=0.5)

    # Now determine nice limits by hand
    n_bins = 100
    x_limp = x.max() + x.std()
    x_limn = x.min() - x.std()
    y_limp = y.max() + y.std()
    y_limn = y.min() - y.std()
    axScatter.set_xlim((x_limn, x_limp))
    axScatter.set_ylim((y_limn, y_limp))
    axScatter.set_xlabel('Estimated output')
    axScatter.set_ylabel('Residuals')

    axHistx.hist(x, bins=n_bins, color=tableau20[1], alpha=0.75)
    axHisty.hist(y, bins=n_bins, orientation='horizontal', color=tableau20[2], alpha=0.75)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    ################################
    # Residual plot for the testing set
    x = StandardScaler().fit_transform(y_pred_test.reshape(-1, 1))
    y = StandardScaler().fit_transform(res_test.reshape(-1, 1))

    fig2 = plt.figure(figsize=(14, 10))
    fig2.suptitle('Residual plot for testing set')

    # Start with a rectangular figure
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # The scatter plot
    axScatter.scatter(x, y, color=tableau20[0], alpha=0.5)

    # Now determine nice limits by hand
    n_bins = 100
    x_limp = x.max() + x.std()
    x_limn = x.min() - x.std()
    y_limp = y.max() + y.std()
    y_limn = y.min() - y.std()
    axScatter.set_xlim((x_limn, x_limp))
    axScatter.set_ylim((y_limn, y_limp))
    axScatter.set_xlabel('Estimated output')
    axScatter.set_ylabel('Residuals')

    axHistx.hist(x, bins=n_bins, color=tableau20[1], alpha=0.75)
    axHisty.hist(y, bins=n_bins, orientation='horizontal', color=tableau20[2], alpha=0.75)
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    plt.show()
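# A hypothetical usage sketch for residual_plot. `split_data` is external to this
# snippet; the stand-in below (a thin wrapper around train_test_split) is an assumption
# used only to make the example self-contained, as is the synthetic regression data.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

def split_data(X, Y):
    return train_test_split(X, Y, test_size=0.25, random_state=0)

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(500, 3))
Y_demo = X_demo @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.3, size=500)
residual_plot(LinearRegression(), X_demo, Y_demo)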
    QuadraticDiscriminantAnalysis()
]

X = data_pca_tsne
figure = plt.figure(figsize=(27, 9))
i = 1

# iterate over datasets
# preprocess dataset, split into training and test part
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=.4, random_state=42)

# Build the mesh over the two feature axes (the original used y, the class labels,
# for the vertical range; the second feature column is what the plot needs)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# just plot the dataset first
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
ax.set_title("Input data")

# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
# and testing points
print(df.head())
print(df.describe())

from sklearn.preprocessing import StandardScaler

features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Separating out the features
x = df.loc[:, features].values
# Separating out the target
y = df.loc[:, ['target']].values  # access column by name
# Standardizing the features
x = StandardScaler().fit_transform(x)
print(x[:5, ])
print(x.min(axis=0))
print(x.max(axis=0))

#####
# PCA Projection to 2D
#####
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, df[['target']]], axis=1)
print(finalDf.head())
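# A small optional check, continuing from the PCA fit above: how much of the total
# variance the two retained components explain.
print(pca.explained_variance_ratio_)        # per-component fraction of variance
print(pca.explained_variance_ratio_.sum())  # total variance captured by the 2D projection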
# Fragment: this selection runs inside a loop over sample indices i, keeping only
# samples whose label is one of {1, 2, 4, 6, 7, 8, 9}
if (label[i] == 4 or label[i] == 1 or label[i] == 8 or label[i] == 7
        or label[i] == 9 or label[i] == 2 or label[i] == 6):
    X_train1.append(a[i])

X_train1 = np.array(X_train1)
y_train1 = np.array(y_train1)

from sklearn.utils import shuffle
X_train1, y_train1 = shuffle(X_train1, y_train1, random_state=0)

from sklearn.preprocessing import StandardScaler
X_train1 = StandardScaler().fit_transform(X_train1)

from sklearn.decomposition import PCA
pca = PCA(n_components=64)
X_train1 = pca.fit_transform(X_train1)
print(X_train1.shape)

print(X_train.max())
print(X_train1.max())

X_train = X_train.astype('float32')
X_train1 = X_train1.astype('float32')
X_train = X_train / 100
X_train1 = X_train1 / 100

X_test = X_train1[20000:39332, :]
y_test = y_train1[20000:39332]
X_train1 = X_train1[0:20000, :]
y_train1 = y_train1[0:20000]

print(X_train.shape)
print(X_train1.shape)
print(X_test.shape)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

for i in range(2):
    fig, axes = plt.subplots(3, 6)
    axes = axes.ravel()
    for j in range(len(axes)):
        feature = StandardScaler().fit_transform(X_train[y_train == i, j:j + 1])
        hist = axes[j].hist(feature, bins='auto', histtype='step',
                            linewidth=2, density=True)
        grid = np.linspace(feature.min(), feature.max(), num=1000)
        log_density = (GaussianMixture(n_components=10, reg_covar=0.03)
                       .fit(feature)
                       .score_samples(grid[:, None]))
        gmm = axes[j].plot(grid, np.exp(log_density))
        axes[j].set_title(f'var_{j}', **title_config)
        axes[j].set_ylim([0, 1])
    fig.suptitle(f'Histogram vs Gaussian Mixture Model for Class {i}', **title_config)
    fig.legend((hist[2][0], gmm[0]), ('Histogram', 'Gaussian mixture model'),
               loc='upper center', bbox_to_anchor=(0.5, 1), ncol=2, fontsize=14)
    fig.tight_layout()
    fig.subplots_adjust(top=0.88)