def result_visualization(x_test, y_test, result):
    """Show RadViz plots of the true and of the predicted iris classes.

    ``y_test`` and ``result`` are read one value per sample and each value is
    used as an index into the three species names.  ``x_test`` is transposed
    before it is joined with the labels, so it has to provide four feature
    rows with one column per sample.
    """
    species_names = ['setosa', 'versicolor', 'virginica']
    feature_columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
    all_columns = feature_columns + ['Species']
    palette = ['blue', 'green', 'red', 'yellow']
    sample_count = y_test.shape[0]

    # Turn the class indices 0, 1, 2 into setosa, versicolor, virginica.
    true_names = [species_names[y_test[k]] for k in range(sample_count)]
    predicted_names = [species_names[result[k]] for k in range(sample_count)]

    frames = []
    for names in (true_names, predicted_names):
        # Put the feature matrix and the class names side by side, then wrap
        # the result in a DataFrame with named columns.
        table = pd.DataFrame(np.column_stack((x_test.T, names)),
                             index=None, columns=all_columns)
        # The feature columns must be float, otherwise radviz raises.
        table[feature_columns] = table[feature_columns].astype(float)
        frames.append(table)
    df_real, df_prediction = frames

    # Draw one figure per labelling.
    plt.figure('真实分类')
    radviz(df_real, 'Species', color=palette)
    plt.figure('预测分类')
    radviz(df_prediction, 'Species', color=palette)
    plt.show()
def _decode_one_hot(encoded, count, unknown_label):
    """Translate the first *count* one-hot columns of *encoded* into names."""
    patterns = (
        ((0, 0, 1), 'setosa'),
        ((0, 1, 0), 'versicolor'),
        ((1, 0, 0), 'virginica'),
    )
    names = []
    for k in range(count):
        key = (encoded[0][k], encoded[1][k], encoded[2][k])
        for pattern, name in patterns:
            if key == pattern:
                names.append(name)
                break
        else:
            # Keep one entry per sample so the labels stay aligned with the
            # feature rows even when a column is not a valid one-hot vector.
            names.append(unknown_label)
    return names


def result_visualization(x_test, y_test, result):
    """Show RadViz plots of the true and of the predicted iris classes.

    ``y_test`` and ``result`` are indexed as ``[row][sample]`` with three
    one-hot rows; the number of samples is taken from ``y_test.shape[1]``.
    ``x_test`` is transposed before it is joined with the labels, so it has
    to provide four feature rows with one column per sample.

    Columns that do not match one of the three one-hot patterns are labelled
    ``'unknown'`` in both plots.  Previously only the predictions had this
    fallback; an undecodable true label was dropped, which made the label
    list shorter than the feature matrix and broke the column stacking.
    """
    feature_columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
    all_columns = feature_columns + ['Species']
    palette = ['blue', 'green', 'red', 'yellow']
    cols = y_test.shape[1]

    # Undo the one-hot encoding of the classes.
    y = _decode_one_hot(y_test, cols, 'unknown')
    pre = _decode_one_hot(result, cols, 'unknown')

    # Join the feature matrix with the class names.
    real = np.column_stack((x_test.T, y))
    prediction = np.column_stack((x_test.T, pre))

    # Convert to DataFrames with named columns.
    df_real = pd.DataFrame(real, index=None, columns=all_columns)
    df_prediction = pd.DataFrame(prediction, index=None, columns=all_columns)

    # The feature columns must be float, otherwise radviz raises.
    df_real[feature_columns] = df_real[feature_columns].astype(float)
    df_prediction[feature_columns] = df_prediction[feature_columns].astype(float)

    # Draw one figure per labelling.
    plt.figure('真实分类')
    radviz(df_real, 'Species', color=palette)
    plt.figure('预测分类')
    radviz(df_prediction, 'Species', color=palette)
    plt.show()
def test_radviz():
    """Load a data file, print its first rows and show a RadViz plot by ``Name``."""
    # Alternative input: pd.read_csv('e:/tmp/22/iris.csv')
    data = pd.read_csv('e:/tmp/22/201404.out')
    # Call form of print: valid on Python 3, and prints the same text on
    # Python 2 because a single parenthesised argument is just an expression.
    print(data.head())
    plt.figure()
    radviz(data, 'Name')
    plt.show()
def plot_radviz(dataset): """ Generates a RadViz plot of the provided DataSet. RadViz is useful for visualizing data with more than two dimensions. """ # radviz takes a pandas DataFrame and the name of the column which # contains class membership info. # therefore need to pass in the dataset's merged data and labels radviz(dataset.get_labelled_data_frame(), dataset.get_labels().name) plt.show()
def _decode_one_hot(encoded, count, unknown_label):
    """Translate the first *count* one-hot columns of *encoded* into names."""
    patterns = (
        ((0, 0, 1), 'setosa'),
        ((0, 1, 0), 'versicolor'),
        ((1, 0, 0), 'virginica'),
    )
    names = []
    for k in range(count):
        key = (encoded[0][k], encoded[1][k], encoded[2][k])
        for pattern, name in patterns:
            if key == pattern:
                names.append(name)
                break
        else:
            # Keep one entry per sample so the labels stay aligned with the
            # feature rows even when a column is not a valid one-hot vector.
            names.append(unknown_label)
    return names


def result_visualization(x_test, y_test, result):
    """Show RadViz plots (Chinese labels) of true and predicted iris classes.

    ``y_test`` and ``result`` are indexed as ``[row][sample]`` with three
    one-hot rows; the number of samples is taken from ``y_test.shape[1]``.
    ``x_test`` is transposed before it is joined with the labels, so it has
    to provide four feature rows with one column per sample.

    Columns that do not match one of the three one-hot patterns are labelled
    ``'未知种类'`` in both plots.  Previously only the predictions had this
    fallback; an undecodable true label was dropped, which made the label
    list shorter than the feature matrix and broke the column stacking.
    """
    feature_columns = ['萼片长度', '萼片宽度', '花瓣长度', '花瓣宽度']
    class_column = '种类'
    all_columns = feature_columns + [class_column]
    palette = ['blue', 'green', 'red', 'yellow']
    cols = y_test.shape[1]

    # Undo the one-hot encoding of the classes.
    y = _decode_one_hot(y_test, cols, '未知种类')
    pre = _decode_one_hot(result, cols, '未知种类')

    # Join the feature matrix with the class names.
    real = np.column_stack((x_test.T, y))
    prediction = np.column_stack((x_test.T, pre))

    df_real = pd.DataFrame(real, index=None, columns=all_columns)
    df_prediction = pd.DataFrame(prediction, index=None, columns=all_columns)
    # The feature columns must be float, otherwise radviz raises.
    df_real[feature_columns] = df_real[feature_columns].astype(float)
    df_prediction[feature_columns] = df_prediction[feature_columns].astype(float)

    # Select a font able to render the Chinese labels before anything is
    # drawn, so the setting is in place for every text element.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Draw one figure per labelling.
    plt.figure('真实分类')
    radviz(df_real, class_column, color=palette)
    plt.figure('预测分类')
    radviz(df_prediction, class_column, color=palette)
    plt.show()
def test_radviz(self, iris):
    """Run ``radviz`` with hex colours, colour names, a colormap and RGBA lists."""
    from pandas.plotting import radviz
    from matplotlib import cm

    def labelled_patches(axes):
        # skip Circle drawn as ticks: only patches with a non-empty label stay
        return [patch for patch in axes.patches[:20] if patch.get_label() != ""]

    df = iris
    first_names = df["Name"][:10]
    _check_plot_works(radviz, frame=df, class_column="Name")

    rgba = ("#556270", "#4ECDC4", "#C7F464")
    ax = _check_plot_works(radviz, frame=df, class_column="Name", color=rgba)
    self._check_colors(labelled_patches(ax)[:10], facecolors=rgba, mapping=first_names)

    cnames = ["dodgerblue", "aquamarine", "seagreen"]
    _check_plot_works(radviz, frame=df, class_column="Name", color=cnames)
    # NOTE(review): ``ax`` still refers to the axes returned for the ``rgba``
    # plot; the results of this and the next plot call are not captured.
    self._check_colors(labelled_patches(ax), facecolors=cnames, mapping=first_names)

    _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet)
    cmaps = list(map(cm.jet, np.linspace(0, 1, df["Name"].nunique())))
    self._check_colors(labelled_patches(ax), facecolors=cmaps, mapping=first_names)

    # Explicit RGBA lists are checked through the legend handles.
    colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]]
    df = DataFrame(
        {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]}
    )
    ax = radviz(df, "Name", color=colors)
    handles, _labels = ax.get_legend_handles_labels()
    self._check_colors(handles, facecolors=colors)
def radviz_quad_features(filename=None):
    """Save side-by-side RadViz plots of the quadrant features in a CSV file.

    The CSV is read with its first column as the index.  The left panel is
    grouped by the ``TEXTURES`` column (with ``angle`` dropped), the right
    panel by the ``angle`` column (with ``TEXTURES`` dropped).  The figure is
    written to ``output_dir`` as ``Radviz_<basename>.png``.

    Parameters
    ----------
    filename : str
        Path of the CSV file.  The ``None`` default is only a placeholder;
        a real path is required.

    Raises
    ------
    ValueError
        If ``filename`` is ``None``.
    """
    if filename is None:
        # Fail early with a clear message instead of deep inside pandas.
        raise ValueError("filename is required: pass the path of the CSV file to plot")

    df = pd.read_csv(filename, index_col=0)
    df_angle = df.drop(labels=['TEXTURES'], axis=1)
    df_texture = df.drop(labels=['angle'], axis=1)

    # Other styles that were tried: 'ggplot', 'classic', 'seaborn-dark'.
    plt.style.use(['bmh'])
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11, 6))
    # plt.suptitle("Visualization of quadrant features using Radviz ", fontsize=12)

    ax1 = radviz(df_texture, 'TEXTURES', ax=ax1, colormap='gist_rainbow')
    ax1.set_title('Textures - radviz', loc='center', fontsize=10)
    ax2 = radviz(df_angle, 'angle', ax=ax2, colormap='rainbow')
    ax2.set_title('Angles - radviz', loc='center', fontsize=10)
    # andrews_curves(df_angle, 'angle', ax=ax1, colormap='rainbow')
    # andrews_curves(df_texture, 'TEXTURES', ax=ax2, colormap='rainbow')

    fig.tight_layout()
    plt.subplots_adjust(left=0.05, wspace=0.15, top=0.9)
    plt.savefig(
        os.path.join(output_dir, 'Radviz_' + get_basename(filename) + '.png'))
def test_radviz(self, iris):
    """Run ``radviz`` with hex colours, colour names, a colormap and RGBA lists."""
    from pandas.plotting import radviz
    from matplotlib import cm

    def labelled_patches(axes):
        # skip Circle drawn as ticks: only patches with a non-empty label stay
        return [patch for patch in axes.patches[:20] if patch.get_label() != '']

    df = iris
    first_names = df['Name'][:10]
    _check_plot_works(radviz, frame=df, class_column='Name')

    rgba = ('#556270', '#4ECDC4', '#C7F464')
    ax = _check_plot_works(radviz, frame=df, class_column='Name', color=rgba)
    self._check_colors(labelled_patches(ax)[:10], facecolors=rgba, mapping=first_names)

    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    _check_plot_works(radviz, frame=df, class_column='Name', color=cnames)
    # NOTE(review): ``ax`` still refers to the axes returned for the ``rgba``
    # plot; the results of this and the next plot call are not captured.
    self._check_colors(labelled_patches(ax), facecolors=cnames, mapping=first_names)

    _check_plot_works(radviz, frame=df, class_column='Name', colormap=cm.jet)
    class_count = df['Name'].nunique()
    cmaps = lmap(cm.jet, np.linspace(0, 1, class_count))
    self._check_colors(labelled_patches(ax), facecolors=cmaps, mapping=first_names)

    # Explicit RGBA lists are checked through the legend handles.
    colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
    columns = {
        "A": [1, 2, 3],
        "B": [2, 1, 3],
        "C": [3, 2, 1],
        "Name": ['b', 'g', 'r'],
    }
    df = DataFrame(columns)
    ax = radviz(df, 'Name', color=colors)
    handles, _labels = ax.get_legend_handles_labels()
    self._check_colors(handles, facecolors=colors)
def test_radviz(self):
    """Run ``radviz`` on ``self.iris`` with several colour specifications."""
    from pandas.plotting import radviz
    from matplotlib import cm

    def labelled_patches(axes):
        # skip Circle drawn as ticks: only patches with a non-empty label stay
        return [patch for patch in axes.patches[:20] if patch.get_label() != '']

    df = self.iris
    first_names = df['Name'][:10]
    _check_plot_works(radviz, frame=df, class_column='Name')

    rgba = ('#556270', '#4ECDC4', '#C7F464')
    ax = _check_plot_works(radviz, frame=df, class_column='Name', color=rgba)
    self._check_colors(labelled_patches(ax)[:10], facecolors=rgba, mapping=first_names)

    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    _check_plot_works(radviz, frame=df, class_column='Name', color=cnames)
    # NOTE(review): ``ax`` still refers to the axes returned for the ``rgba``
    # plot; the results of this and the next plot call are not captured.
    self._check_colors(labelled_patches(ax), facecolors=cnames, mapping=first_names)

    _check_plot_works(radviz, frame=df, class_column='Name', colormap=cm.jet)
    class_count = df['Name'].nunique()
    cmaps = lmap(cm.jet, np.linspace(0, 1, class_count))
    self._check_colors(labelled_patches(ax), facecolors=cmaps, mapping=first_names)

    # Explicit RGBA lists are checked through the legend handles.
    colors = [[0., 0., 1., 1.], [0., 0.5, 1., 1.], [1., 0., 0., 1.]]
    columns = {
        "A": [1, 2, 3],
        "B": [2, 1, 3],
        "C": [3, 2, 1],
        "Name": ['b', 'g', 'r'],
    }
    df = DataFrame(columns)
    ax = radviz(df, 'Name', color=colors)
    handles, _labels = ax.get_legend_handles_labels()
    self._check_colors(handles, facecolors=colors)
# Draw a heatmap with the numeric values in each cell
sns.heatmap(data1, annot=True, fmt='f', linewidths=1)
plt.show()

import pickle

# Persist the fitted model and read it straight back from the same file.
# NOTE: pickle.load must only ever be used on files this script wrote itself.
with open('forest-riders.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('forest-riders.pkl', 'rb') as f:
    model = pickle.load(f)

# radviz lives in pandas.plotting on current pandas; fall back to the legacy
# pandas.tools.plotting location only when the new module is missing.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz

plt.figure(figsize=(12, 12))
radviz(dataset, 'Yield')
plt.show()

from yellowbrick.features.rankd import Rank2D

# Instantiate the visualizer with the Covariance ranking algorithm
visualizer = Rank2D(features=features, algorithm='covariance')
visualizer.fit(X, y)  # Fit the data to the visualizer
visualizer.transform(X)  # Transform the data
visualizer.poof()  # Draw/show/poof the data

################################################3
#######RANDOM FOREST REGRESSOR TREE######
# features = ['Year','Harvested','Value','Grow_total_p','Grow_avg_t','Price']
# target = 'Yield'
# Exploratory plots of the Iris CSV: scatter, joint, box, KDE, pair and RadViz.
# NOTE(review): the ``size=`` keyword used below was renamed ``height=`` in
# newer seaborn releases -- confirm against the installed version.
sns.set(style="white", color_codes=True)
iris = pd.read_csv("./Iris.csv")
print(iris.head())
print(iris["Species"].value_counts())
# Scatter plot
iris.plot(kind="scatter", x="SepalLengthCm", y="SepalWidthCm")
# Scatter plot combined with histograms
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, size=5)
# Scatter plot with a legend, one colour per species
sns.FacetGrid(iris, hue="Species", size=5) \
    .map(plt.scatter, "SepalLengthCm", "SepalWidthCm") \
    .add_legend()
# Box plot
sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
# Box plot with the individual points drawn on top
ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray")
# Kernel density plot
sns.FacetGrid(iris, hue="Species", size=6) \
    .map(sns.kdeplot, "PetalLengthCm") \
    .add_legend()
# Multi-variable (pair) plot
sns.pairplot(iris.drop("Id", axis=1), hue="Species", size=3)
# RadViz ("spring") plot
radviz(iris.drop("Id", axis=1), "Species")
plt.show()
def RadvizPlot():
    """Draw a RadViz plot of the loaded data, grouped by ``Species``.

    The ``Id`` column is dropped before plotting.  The figure is only drawn,
    not shown or saved here; nothing is returned.
    """
    # Dimensionality-reduction visualisation
    data = load_data()
    radviz(data.drop("Id", axis=1), "Species")
    return
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix, radviz
import pandas
from sklearn.utils import shuffle

# Load the data from the CSV file
data_frame = pandas.read_csv("spambase/spambase.csv")
# Randomize the dataframe
data_frame = shuffle(data_frame)
# sample(frac=1) draws every row again in random order; the ``::1`` column
# slice keeps all columns unchanged.
sample_frame = data_frame.sample(frac=1)
sample_frame = sample_frame.iloc[:, ::1]
print(sample_frame)
# RadViz of all columns, grouped by the "spam" column
radviz(sample_frame, "spam")
plt.show()
# scatter_matrix(sample_frame, alpha=0.2)
# plt.show()
# calculating the accuracies print("Training Accuracy :", model.score(x_train, y_train)) print("Testing Accuracy :", model.score(x_test, y_test)) # printing the confusion matrix from sklearn.metrics import confusion_matrix # creating a confusion matrix cm = confusion_matrix(y_test, y_pred) # printing the confusion matrix plt.rcParams['figure.figsize'] = (8, 8) sns.heatmap(cm, annot = True, cmap = 'Reds') plt.title('Confusion Matrix for Random Forest', fontweight = 30, fontsize = 20) plt.show() get_ipython().system('pip install yellowbrick') from pandas.plotting import radviz fig, ax = plt.subplots(figsize=(12, 12)) new_df = x.copy() new_df["status"] = y radviz(new_df, "status", ax=ax, colormap="rocket") plt.title('Radial Visualization for Target', fontsize = 20) plt.show() # * It gives a clear Idea that Students getting very low grades have high correlation on Lunch and Parental Education
# Replace the numeric targets 0, 1, 2 by the corresponding species names.
species_dict = {0: species[0], 1: species[1], 2: species[2]}
iris_pair_df["species"] = target_df
iris_pair_df["species"] = iris_pair_df["species"].map(species_dict)
# sns.pairplot(iris_df, hue='species', size=2.5)

# ECDF
#
# ecdf = sm.distributions.ECDF(iris_df["sepal width (cm)"])
# x = np.linspace(iris_df["sepal width (cm)"].min(), iris_df["sepal width (cm)"].max())
# y = ecdf(x)
# plt.step(x, y)
# plt.title("Empirical CDF for Iris attributes - Sepal Width")
# plt.show()

# Percentile Plot
# position, sepal_width = probscale.plot_pos(iris_df["sepal width (cm)"])
# position *= 100
# fig, ax = plt.subplots(figsize=(6, 3))
# ax.plot(position, sepal_width, marker='.', label='Sepal Width')
# ax.set_xlabel('Percentile')
# ax.set_ylabel('Sepal Width (cm)')
# sns.despine()

# parallel coordinates
# parallel_coordinates(iris_pair_df, "species")
# plt.title('Parallel Coordinates visualization for Iris Dataset')

# RadViz plot grouped by species
radviz(iris_pair_df, "species")
plt.title('Radvis multivarieate visualization')
def visualize_radial(df, col, title):
    """Show a RadViz plot of ``df`` grouped by the column ``col``.

    ``title`` is passed to ``plt.figure`` as the figure identifier; it is not
    set as an axes title.  The plot is shown immediately.
    """
    plt.figure(title)
    radviz(df, col)
    plt.show()
# We can visualize other features by substituting "meanfun"
sns.boxplot(x="label", y="meanfun", data=dataset)
plt.show()

# ------ Distribution of male and female(every feature)
# We can visualize other features by substituting "meanfun"
# NOTE(review): ``size=`` was renamed ``height=`` in newer seaborn releases
# -- confirm against the installed version.
sns.FacetGrid(dataset, hue="label", size=6) \
    .map(sns.kdeplot, "meanfun") \
    .add_legend()
plt.show()

# ------ Radviz circle
# Good to compare every feature
from pandas.plotting import radviz
radviz(dataset, "label")
plt.show()

#####################################################
#                                                   #
#      Starting with Sets and Pre-Processing        #
#                                                   #
#####################################################

# ------ Separating the Independent and Dependent Variables
# Getting all Columns, except the last one with the genders
X = dataset.iloc[:, :-1].values
# Getting the last column.  Indexing with -1 keeps y in step with X (which
# is "everything except the last column") instead of relying on the label
# sitting at the fixed position 20.
y = dataset.iloc[:, -1].values

# ------ Taking Care of Missing Data
import pandas
import matplotlib.pyplot as plt
from pandas.plotting import andrews_curves
from pandas.plotting import parallel_coordinates
from pandas.plotting import radviz

# Load the Iris data set from the UCI repository; the file has no header row.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# Each pandas plotting helper draws on the current axes when no ``ax`` is
# given, so every plot gets a figure of its own; otherwise the three plots
# would be drawn on top of each other.
plt.figure()
radviz(dataset, 'class')

plt.figure()
andrews_curves(dataset, 'class')

plt.figure()
parallel_coordinates(dataset, 'class')

plt.show()
def visualisation(df_final, visual_ctl, Y, X, scatter_x, features, test,
                  Top_n_counts, export_ctl=False, dim_ctl=3):
    """Build the raw scatter, RadViz and parallel-coordinates figures.

    Parameters
    ----------
    df_final : DataFrame
        Must contain the columns named in ``scatter_x`` plus
        ``Top_n_similar``, ``Class`` and ``similar_alpha``.
    visual_ctl : str
        ``'raw'``, ``'radviz'``, ``'paral_coor'`` or ``'all'``.  Any other
        value draws nothing.
    Y, X, features, test, Top_n_counts
        Not used by this function; kept so existing callers keep working.
    scatter_x : list of str
        Feature columns; the first two (2-D) or three (3-D) are the scatter
        axes, and all of them are used for the other two plots.
    export_ctl : bool or str, default False
        Images are written below ``Output/`` when this is ``True`` or the
        string ``'True'``.
    dim_ctl : int, default 3
        Dimensionality of the raw scatter plot, 2 or 3.

    Returns
    -------
    tuple
        ``(raw_fig, radviz_fig, par_fig)``; an entry is ``None`` when that
        plot was not requested.

    Raises
    ------
    ValueError
        If the raw plot is requested and ``dim_ctl`` is neither 2 nor 3.
    """
    # Accept the boolean as well as the legacy string form of the flag.
    export = export_ctl in (True, 'True')
    # Pre-set every result so the return statement works for any visual_ctl.
    raw_fig = radviz_fig = par_fig = None

    ### Raw
    if visual_ctl in ['all', 'raw']:
        if dim_ctl not in (2, 3):
            raise ValueError("dim_ctl must be 2 or 3, got %r" % (dim_ctl,))
        if dim_ctl == 2:
            raw_fig = px.scatter(df_final,
                                 x=scatter_x[0],
                                 y=scatter_x[1],
                                 color='Top_n_similar',
                                 size_max=18,
                                 symbol='Class',
                                 opacity=0.5)
        else:
            # 3d scatter plot
            raw_fig = px.scatter_3d(df_final,
                                    x=scatter_x[0],
                                    y=scatter_x[1],
                                    z=scatter_x[2],
                                    color='Top_n_similar',
                                    size_max=18,
                                    symbol='Class',
                                    opacity=0.5)
        # Layout
        raw_fig.update_layout(
            margin=dict(l=0, r=0, b=0, t=0),
            title="Scatter Plot",
        )
        # Output (the file name is the same for the 2-D and the 3-D plot)
        if export:
            print('Export Raw to Output/Raw-3D.png')
            raw_fig.write_image("Output/Raw-3D.png")

    ### Radviz
    if visual_ctl in ['all', 'radviz']:
        radviz_fig, ax = plt.subplots(nrows=1, ncols=1)
        # Turn off tick labels
        ax.set_yticklabels([])
        ax.set_xticklabels([])
        # radviz needs the class column inside the frame it receives, so it
        # is added when scatter_x lists feature columns only.
        radviz_columns = list(scatter_x)
        if "Top_n_similar" not in radviz_columns:
            radviz_columns.append("Top_n_similar")
        # Plot on the axes created above
        ax = radviz(df_final[radviz_columns],
                    "Top_n_similar",
                    ax=ax,
                    color=['Red', 'Orange', 'Blue', 'Green'],
                    alpha=0.5)
        ax.title.set_text('Radviz Plot of the Features')
        if export:
            print('Export Radviz to Output/MultiDimension_Radviz.png')
            radviz_fig.savefig('Output/MultiDimension_Radviz.png',
                               bbox_inches='tight')

    ### Parallel Coordinates
    if visual_ctl in ['all', 'paral_coor']:
        parallel_x = list(scatter_x) + ['similar_alpha']
        # parallel plot
        par_fig = px.parallel_coordinates(
            df_final[parallel_x],
            color="similar_alpha",
            dimensions=scatter_x,
            color_continuous_scale=px.colors.diverging.Tealrose,
            color_continuous_midpoint=2)
        # Layout
        par_fig.update_layout(
            title={
                'text': "Parrallel Plot for Iris Neighbors",
                'y': 0.1,
                'x': 0.5,
                'xanchor': 'center',
                'yanchor': 'top'
            })
        # Output
        if export:
            print(
                'Export Parallel Coordinates to Output/MultiDimension_ParrCoor.png'
            )
            par_fig.write_image("Output/MultiDimension_ParrCoor.png")

    return raw_fig, radviz_fig, par_fig
#numerical data. Parallel Coordinates Plots are ideal for comparing many variables together and #seeing the relationships between them. For example, if you had to compare an array of products with #the same attributes (comparing computer or cars specs across different models). from pandas.plotting import parallel_coordinates parallel_coordinates(iris_data, "species") # In[143]: #Radviz Plot : RadViz is a multivariate data visualization algorithm that #plots each feature dimension uniformly around the circumference of a #circle then plots points on the interior of the circle such that the #point normalizes its values on the axes from the center to each arc. from pandas.plotting import radviz radviz(iris_data, "species", color=['pink', 'green']) # In[150]: #Factorplot: Factor plot is informative when we have multiple groups to compare. sns.factorplot("species", "sepal_length", data=iris_data) plt.ioff() plt.show() # In[153]: #Boxen Plot: An enhanced box plot for larger datasets. fig = plt.gcf() fig.set_size_inches(10, 6)
def lp_genes(
    data,
    kind="scatter",
    hue="Pattern",
    sizes=(2, 100),
    gridsize=20,
    random_state=4,
    ax=None,
    fname=None,
    **kwargs,
):
    """
    Plot the pattern distribution of each gene in a RadViz plot.

    RadViz projects an N-dimensional data set into a 2D space where the
    influence of each dimension can be interpreted as a balance between the
    influence of all dimensions.

    Parameters
    ----------
    data : AnnData
        Spatial formatted AnnData
    kind : str
        'scatter' for scatter plot, 'hex' for hex plot, default "scatter"
    hue : str
        Name of the column used to color points, default "Pattern".  Any
        other name is looked up in data.var.
    sizes : tuple
        Minimum and maximum point size to scale points, default (2, 100)
    gridsize : int
        Number of hex bins along each axis, default 20
    random_state : int
        Seed for shuffling the drawing order of the points, default 4
    ax : matplotlib Axes, optional
        Axes to draw on; a new 6x6 inch figure is created when omitted
    fname : str, optional
        Save the figure to specified filename, by default None
    **kwargs
        Options to pass to matplotlib plotting method.
    """
    lp_stats(data)

    palette = dict(zip(PATTERN_NAMES, PATTERN_COLORS))

    # TODO move "pattern" computation to lp_stats
    col_names = [f"{p}_fraction" for p in PATTERN_NAMES]
    # Work on a copy so renaming and adding columns never touches data.var.
    gene_frac = data.var[col_names].copy()
    gene_frac.columns = PATTERN_NAMES
    gene_frac["Pattern"] = gene_frac.idxmax(axis=1)

    # Frame used only to lay out the RadViz frame and anchors: a single blank
    # class puts all genes into one collection, in row order.
    layout_frac = gene_frac.copy()
    layout_frac["Pattern"] = ""

    if hue and hue != "Pattern":
        gene_frac = gene_frac.join(data.var[hue])

    # RADVIZ plot: draw the "circular" axis and labels, hide the points (s=0).
    if ax is None:
        fig = plt.figure(figsize=(6, 6))
        existing = 0
        ax = radviz(layout_frac, "Pattern", s=0)
    else:
        # Needed later for the colorbar axes and for saving.
        fig = ax.get_figure()
        # Collections already on a caller-supplied axes are left alone.
        existing = len(ax.collections)
        radviz(layout_frac, "Pattern", s=0, ax=ax)
    del layout_frac
    radviz_collections = list(ax.collections)[existing:]

    ax.get_legend().remove()
    circle = plt.Circle((0, 0), radius=1, color="black", fill=False)
    ax.add_patch(circle)

    # Hide 2D axes
    ax.axis(False)

    # Get the projected point coordinates back from the hidden RadViz points
    pts = []
    for c in radviz_collections:
        pts.extend(c.get_offsets().data)
    pts = np.array(pts).reshape(-1, 2)

    xy = pd.DataFrame(pts, index=gene_frac.index)
    xy["Pattern"] = gene_frac["Pattern"]
    if hue and hue != "Pattern":
        # The coloring column has to be part of the frame that is plotted.
        xy[hue] = gene_frac[hue]

    # Plot points as scatter or hex
    if kind == "scatter":
        # Drop the invisible RadViz points.  Artist.remove() is used because
        # Axes.collections can no longer be edited like a list on current
        # matplotlib.
        for c in radviz_collections:
            c.remove()

        # Scale point size by the largest pattern fraction of each gene
        xy["Fraction of cells"] = gene_frac[list(PATTERN_NAMES)].max(axis=1)

        # Plot points in shuffled order; the pattern palette only applies
        # when the points are colored by pattern.
        sns.scatterplot(
            data=xy.sample(frac=1, random_state=random_state),
            x=0,
            y=1,
            size="Fraction of cells",
            hue=hue,
            sizes=sizes,
            linewidth=0,
            palette=palette if hue == "Pattern" else None,
            ax=ax,
            **kwargs,
        )
        # Attach the legend to the axes that was drawn on, not to whichever
        # axes happens to be current.
        ax.legend(bbox_to_anchor=(1.05, 0.5), loc="center left", frameon=False)

    elif kind == "hex":
        # Hexbin
        xy.plot.hexbin(
            x=0,
            y=1,
            gridsize=gridsize,
            extent=(-1, 1, -1, 1),
            cmap=sns.light_palette("lightseagreen", as_cmap=True),
            mincnt=1,
            colorbar=False,
            ax=ax,
            **kwargs,
        )
        # [left, bottom, width, height]
        plt.colorbar(
            ax.collections[-1],
            cax=fig.add_axes([1, 0.4, 0.05, 0.3]),
            label="genes",
        )

    if fname:
        fig.savefig(fname)
# NOTE(review): the next two statements look like the tail of a feature
# selection loop whose header lies outside this excerpt -- confirm their
# indentation against the full file.
list_of_best_features.append(df.columns[our_index])
copy_of_importance_list[our_index] = 0

#We create a new array containing only the n features
df_scaled_new = df_scaled
df_scaled_new = df_scaled[list_of_best_features]
X = df_scaled_new.values

# Examine the dispersion of the defaults/non defaulted amongst the features in a multivariate setting
from pandas.plotting import radviz
plt.figure(figsize=(8,8))
df_all_data = pd.read_csv(your_path)
# Every column of the file is standardised here, including 'Default', which
# is then used as the class column.
Scaler = StandardScaler()
scaled_data = Scaler.fit_transform(df_all_data)
df_all_data_scaled = pd.DataFrame(scaled_data , columns=df_all_data.columns)
radviz(df_all_data_scaled, 'Default', color='BGR');

# Same thing but this time dropping some of the features for clarity
# (the file is read again, so this plot uses the unscaled values)
plt.figure(figsize=(10,10))
df_all_data = pd.read_csv(your_path)
less_interesting_features=['Guarantors','Sex & Marital Status','Instalment per cent', 'Duration in Current address','Age (years)','Most valuable available asset', 'Concurrent Credits', 'Type of apartment', 'No of Credits at this Bank','Occupation', 'No of dependents', 'Telephone', 'Foreign Worker','Value Savings/Stocks']
df_all_data_dropped = df_all_data.drop(less_interesting_features, axis=1)
radviz(df_all_data_dropped, 'Default',color='BGR');

# //////////////////////////////////////////////////////////////
# ////////           Train - Test - Split                ///////
# //////////////////////////////////////////////////////////////
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=7)
def plot_radviz(plotdata, label):
    """Show a RadViz plot of ``plotdata`` grouped by the given labels.

    Parameters
    ----------
    plotdata : DataFrame
        Feature columns to plot.  The frame is not modified.
    label : sequence or Series
        One class label per row; attached as the column ``'label'``.
    """
    # Attach the labels to a copy so the caller's DataFrame does not
    # silently gain (or lose the previous content of) a 'label' column.
    labelled = plotdata.copy()
    labelled['label'] = label
    # NOTE(review): 100x50 inches is a very large canvas -- confirm intended.
    plt.figure('kmeans-radviz', figsize=(100, 50))
    plt.title('radviz')
    radviz(labelled, 'label')
    plt.show()
#plt.show()

#9 this shows the relationship between the various sepal/petal lengths/widths measures
g = sns.pairplot(iris, hue="name")
plt.savefig("../graphs/pairploth.jpg")
#plt.show()

# similar to above but with kde along the diag instead of histograms
# g = sns.pairplot(iris, hue="name", diag_kind="kde")
# NOTE(review): with the kde pairplot commented out, this saves the current
# figure again under a second name -- confirm that is still wanted.
plt.savefig("../graphs/pairplotk.jpg")
#plt.show()

#10 Andrews curves are a method for visualizing multidimensional data by
# mapping each observation onto a function. In the plot each colour used
# represents a class and we can easily note that the lines that represent
# samples from the same class have similar curves. Andrews curves that are
# represented by functions close together suggest that the corresponding data
# points will also be close together.
from pandas.plotting import andrews_curves
# The pandas helpers draw on the current axes, so each one gets a fresh
# figure; otherwise they would all land on the last panel of the pairplot
# and on top of each other.
plt.figure()
andrews_curves(iris, "name")
plt.savefig("../graphs/andrewcs.jpg")
#plt.show()

#11 Another multivariate visualization technique pandas has is parallel_coordinates
# Parallel coordinates plots each feature on a separate column & then draws lines
# connecting the features for each data sample
# Inselberg (Inselberg 1997) made a full review of how to visually read out
# parallel coords' relational patterns.[9] When most lines between two
# parallel axis are somewhat parallel to each others, that suggests a positive
# relationship between these two dimensions. When lines cross in a kind of
# superposition of X-shapes, that's negative relationship. When lines cross
# randomly or are parallel, that show there is no particular relationship.
from pandas.plotting import parallel_coordinates
plt.figure()
parallel_coordinates(iris, "name")
plt.savefig("../graphs/parac.jpg")
#plt.show()

#12 RadViz plot
from pandas.plotting import radviz
plt.figure()
radviz(iris, "name")
plt.savefig("../graphs/rad.jpg")
#plt.show()
# A [""] as last data = data[:-1] trans_dict = { "Iris-setosa": "0", "Iris-versicolor": "1", "Iris-virginica": "2" } for i in data: i[-1] = trans_dict[i[-1]] data = np.asarray(data, dtype="float32") numpy_data = data.copy() data = pd.DataFrame(data) # Change columns to string. data.rename(mapper=lambda x: str(x), axis=1, inplace=True) radviz(data, class_column="4") plt.show() x, y = np.split(numpy_data, (4, ), axis=1) # PCA # Col as a var, row as attr. M = np.mean(x, axis=0) x = x - M C = np.cov(x.T) eigenvalue, eigenvetcor = np.linalg.eig(C) # Get the index of reversed sorted eigenvalue sorted_index = np.argsort(-eigenvalue)[:P] # Get the first p vectors to get the projection matrix.
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix, andrews_curves, parallel_coordinates, radviz
import matplotlib.pyplot as plt

#data = pd.read_csv('test.csv')
#data = pd.read_csv('BostonHousing.csv')
#scatter_matrix(data, alpha=0.2, figsize=(10, 10), diagonal='kde')
#plt.suptitle('scatter-matrix')
#plt.show()

# data = pd.read_csv('BostonHousing.csv')
# data.plot.line(x='medv', y=['rad', 'indus', 'ptratio', 'lstat'], figsize=(10, 10))
# plt.show()

data = pd.read_csv('test.csv')
#data = data.cumsum()
# RadViz of every column in the file, with 'medv' as the class column.
radviz(data, 'medv')
plt.show()
# print (data.head())
print('################################')
print('Working Base :', Base, ' using ', sys.platform)
print('################################')
################################################################
sDataFile = Base + '/01-Vermeulen/00-RawData/irisdata.csv'
data = pd.read_csv(sDataFile)

# In each section below tight_layout() runs before savefig() so that the
# adjusted layout is part of the saved image, not only of the shown window.
from pandas.plotting import andrews_curves
plt.figure(figsize=(10, 10))
andrews_curves(data, 'Name')
sPicNameOut1 = Base + '/01-Vermeulen/06-Report/01-EDS/02-Python/andrews_curves.png'
plt.tight_layout()
plt.savefig(sPicNameOut1, dpi=600)
plt.show()

from pandas.plotting import parallel_coordinates
plt.figure(figsize=(10, 10))
parallel_coordinates(data, 'Name')
sPicNameOut2 = Base + '/01-Vermeulen/06-Report/01-EDS/02-Python/parallel_coordinates.png'
plt.tight_layout()
plt.savefig(sPicNameOut2, dpi=600)
plt.show()

from pandas.plotting import radviz
plt.figure(figsize=(10, 10))
radviz(data, 'Name')
sPicNameOut3 = Base + '/01-Vermeulen/06-Report/01-EDS/02-Python/radviz.png'
plt.tight_layout()
plt.savefig(sPicNameOut3, dpi=600)
plt.show()
# 六边形箱图(蜂窝图) # pandas绘图 # gridsize#: x轴方向分箱数目 默认100 salary.plot.hexbin(x='salary', y='begin_salary', gridsize=25) clf_cla_close(plt) # 描述类似气泡图的散点值 salary.plot.hexbin(x='salary', y='begin_salary', C='age', reduce_C_function=np.min, gridsize=25) clf_cla_close(plt) # 雷达坐标图(属性图) fig = plt.figure() # 1#: 要分析对象 2#: 分类变量 radviz(salary[['salary', 'begin_salary', 'age', 'education', 'jobtime', 'position']], 'position') clf_cla_close(plt) # 轮廓图(横坐标表示需要分析的变量 纵坐标各个指标的值) # 1#: 要分析对象 2#: 分类变量 parallel_coordinates(salary[['salary', 'begin_salary', 'jobtime', 'position']], 'position') clf_cla_close(plt) # 调和曲线图(根据三角变换方法将高维空间上的点映射到二维平面的曲线上) andrews_curves(salary[['salary', 'begin_salary', 'jobtime', 'position']], 'position') clf_cla_close(plt) # 等高线图
# scatter_matrix builds a figure of its own when no ``ax`` is passed, so the
# figure is taken from the axes it returns; a separately created
# plt.figure() would stay empty and be the one handed to iris_fig.
axes = scatter_matrix(iris_pd, alpha=0.3, diagonal='kde')
fig = axes[0, 0].get_figure()
iris_fig(fig, title)

title = 'Andrews_Curves'
# Very cool way to try to differentiate between classes. Some math is needed
# https://en.wikipedia.org/wiki/Andrews_plot
# http://sci-hub.cc/
fig = plt.figure()
andrews_curves(iris_pd, 'tgt')
iris_fig(fig, title)

title = 'Radviz'
# Springy area plots
fig = plt.figure()
radviz(iris_pd, 'tgt')
iris_fig(fig, title)

# %% Machine Learning - Classification Tree
# http://docs.python-guide.org/en/latest/scenarios/ml/
# Randomizes order of indices for splitting the data into train and test sets
ids = np.random.permutation(len(x))

# Gets n-10 data points for training
x_train = x[ids[:-10]]
y_train = y[ids[:-10]]

# Gets 10 data points for testing
x_test = x[ids[-10:]]
y_test = y[ids[-10:]]
plt.scatter(X_2[y==1,0], X_2[y==1,1], color='r') # 1分类的散点图 plt.savefig("./Pictures/raw_scatter_2.png") # 保存原始数据分布图 plt.cla() # 清除图片 # 原始数据可视化(平行坐标) data = pd.read_csv(r"iris.csv") # 读取数据 plt.figure('多维度-parallel_coordinates') plt.title('parallel_coordinates') # 添加标题 parallel_coordinates(data, 'Class', color=['b', 'g']) plt.savefig("./Pictures/raw_parallel_coordinates.png") # 保存原始数据分布图 plt.cla() # 清除图片 # 原始数据可视化(RadViz雷达图) plt.figure('多维度-radviz') plt.title('radviz') radviz(data, 'Class', color=['red', 'm']) plt.savefig("./Pictures/raw_radviz.png") # 保存原始数据分布图 plt.cla() # 清除图片 # 原始数据可视化(andrews_curves) plt.figure('多维度-andrews_curves') plt.title('andrews_curves') andrews_curves(data, 'Class', color=['pink', 'gold']) plt.savefig("./Pictures/raw_andrews_curves.png") # 保存原始数据分布图 plt.cla() # 清除图片 # 特征相关性热力图 data = pd.read_csv(r"iris.csv") data = data.iloc[:,[0,1,2,3]] def heat_map(df):
# Read the letter-recognition data; the file has no header row, so the column
# names come from ``columns``.
dataF = pd.read_csv("letter-recognition.data", sep=',', header=None, names=columns)
# Print max / mean / min of every column except the label column 'y1'.
for x in columns:
    if x == 'y1':
        continue
    print(x + " max: " + str(dataF[x].max()), end=" ")
    print("mean: " + str(dataF[x].mean()), end=" ")
    print("min: " + str(dataF[x].min()))
# The ``if 1:`` / ``if 0:`` blocks are manual on/off switches.
# NOTE(review): block boundaries were reconstructed from flattened source --
# confirm the indentation against the original file.
if 1:
    # Turn the letter labels into integers by subtracting 65 from the
    # character code (so 'A' becomes 0).
    dataF['y1'] = dataF['y1'].apply(lambda y: ord(y) - 65)
    scatter_matrix(dataF, alpha=0.2, figsize=(6, 6), diagonal='kde')
    plt.show()
if 0:
    # RadViz plot (disabled)
    plt.figure()
    radviz(dataF, 'y1')
    plt.show()
if 0:
    # Per-column kernel density plots (disabled)
    ser = dataF
    ser.plot.kde(subplots=True, layout=(2, 12), legend=False, Label=False, yticks=[], xticks=[])
    plt.show()