def test_andrews_curves(self):
    """Check ``andrews_curves`` called with ``frame=``/``class_column=`` keywords.

    Covers a default call, explicit colour sequences (hex tuple and named
    list), a colormap, legend handle colours on a three-row frame, and the
    ``FutureWarning`` expected when the frame is passed as ``data=``.
    """
    from matplotlib import cm
    from pandas.tools.plotting import andrews_curves

    iris = self.iris

    # Plain call: only has to render.
    _check_plot_works(andrews_curves, frame=iris, class_column='Name')

    # Explicit colour sequences: hex strings first, then named colours.
    palettes = (
        ('#556270', '#4ECDC4', '#C7F464'),
        ['dodgerblue', 'aquamarine', 'seagreen'],
    )
    for palette in palettes:
        ax = _check_plot_works(andrews_curves, frame=iris,
                               class_column='Name', color=palette)
        self._check_colors(ax.get_lines()[:10], linecolors=palette,
                           mapping=iris['Name'][:10])

    # Colormap: one sample of cm.jet per distinct class.
    ax = _check_plot_works(andrews_curves, frame=iris,
                           class_column='Name', colormap=cm.jet)
    expected = lmap(cm.jet, np.linspace(0, 1, iris['Name'].nunique()))
    self._check_colors(ax.get_lines()[:10], linecolors=expected,
                       mapping=iris['Name'][:10])

    # Legend handles must carry the requested colours.
    colors = ['b', 'g', 'r']
    small = DataFrame({"A": [1, 2, 3],
                       "B": [1, 2, 3],
                       "C": [1, 2, 3],
                       "Name": colors})
    ax = andrews_curves(small, 'Name', color=colors)
    handles, _ = ax.get_legend_handles_labels()
    self._check_colors(handles, linecolors=colors)

    # Passing the frame as ``data=`` is expected to warn.
    with tm.assert_produces_warning(FutureWarning):
        andrews_curves(data=small, class_column='Name')
def irisVisualization():
    """Draw a series of pandas and seaborn plots of the iris data set.

    Loads iris via ``load_iris``, builds a DataFrame with a categorical
    ``'Species'`` column, issues the plotting calls and finally shows
    every open figure.
    """
    sns.set(style="white", color_codes=True)
    irisdata = load_iris()
    iris = pd.DataFrame(irisdata.data, columns=irisdata.feature_names)
    iris['Species'] = pd.Categorical.from_codes(irisdata.target,
                                                irisdata.target_names)
    # Feature columns:
    # sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)

    # pandas plot
    # iris.plot(kind="scatter", x="sepal length (cm)", y="sepal width (cm)")
    # iris.boxplot(by="Species", figsize=(12, 6))
    from pandas.tools.plotting import andrews_curves, parallel_coordinates
    # NOTE(review): no new figure is opened between the axes-level calls
    # below, so they presumably share whatever axes is current - confirm
    # that the overlay is intended.
    andrews_curves(iris, "Species")
    parallel_coordinates(iris, "Species")

    # seaborn plot
    sns.jointplot(x="sepal length (cm)", y="sepal width (cm)", data=iris,
                  size=5)
    sns.FacetGrid(iris, hue="Species", size=5) \
        .map(plt.scatter, "sepal length (cm)", "sepal width (cm)") \
        .add_legend()
    # sns.kdeplot
    sns.boxplot(x="Species", y="sepal length (cm)", data=iris)
    sns.violinplot(x="Species", y="sepal length (cm)", data=iris, size=6)
    sns.pairplot(iris, hue="Species", size=3)

    # ``sns.plt`` was removed from seaborn (0.8); use the pyplot module that
    # this function already references as ``plt``.
    plt.show()
def multidimensional_plots(df, target_name, maxevents=10000, standardize=False):
    """Return a figure with Andrews-curves, parallel-coordinates and radviz panels.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the class column.
    target_name : name of the class column used to colour the plots.
    maxevents : at most this many (randomly ordered) rows are plotted.
    standardize : when true, every column other than ``target_name`` is
        rescaled to ``(x - mean) / std`` before plotting.  The default
        (``False``) plots the data unchanged.

    Returns
    -------
    The matplotlib figure containing the three panels.
    """
    frame = df
    if standardize:
        # Scale only the feature columns; the class column is copied back
        # untouched.  The arithmetic yields a new frame, so the caller's
        # ``df`` is not modified.
        feature_names = [name for name in df.columns if name != target_name]
        features = df[feature_names]
        frame = (features - features.mean()) / features.std()
        frame[target_name] = df[target_name]

    # randomize the data frame order and cap the number of rows
    df_random = frame.reindex(np.random.permutation(frame.index))[:maxevents]

    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    # One panel per multivariate plot
    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    #fig.tight_layout()
    return fig
def test_andrews_curves(self):
    """Check ``andrews_curves`` called with positional frame and class column.

    Runs a default call, two explicit colour sequences, a colormap, a
    legend-colour check on a three-row frame, and asserts that the
    ``data=`` keyword form produces a ``FutureWarning``.
    """
    from pandas.tools.plotting import andrews_curves
    from matplotlib import cm

    frame = self.iris
    _check_plot_works(andrews_curves, frame, 'Name')

    # Hex colour strings.
    hex_colors = ('#556270', '#4ECDC4', '#C7F464')
    axes = _check_plot_works(andrews_curves, frame, 'Name', color=hex_colors)
    drawn = axes.get_lines()[:10]
    self._check_colors(drawn, linecolors=hex_colors,
                       mapping=frame['Name'][:10])

    # Named colours.
    named_colors = ['dodgerblue', 'aquamarine', 'seagreen']
    axes = _check_plot_works(andrews_curves, frame, 'Name', color=named_colors)
    drawn = axes.get_lines()[:10]
    self._check_colors(drawn, linecolors=named_colors,
                       mapping=frame['Name'][:10])

    # Colormap sampled once per distinct class.
    axes = _check_plot_works(andrews_curves, frame, 'Name', colormap=cm.jet)
    class_count = frame['Name'].nunique()
    sampled = lmap(cm.jet, np.linspace(0, 1, class_count))
    drawn = axes.get_lines()[:10]
    self._check_colors(drawn, linecolors=sampled,
                       mapping=frame['Name'][:10])

    # Legend handles on a tiny frame whose class labels are the colours.
    colors = ['b', 'g', 'r']
    tiny = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3],
                      "Name": colors})
    axes = andrews_curves(tiny, 'Name', color=colors)
    legend_handles = axes.get_legend_handles_labels()[0]
    self._check_colors(legend_handles, linecolors=colors)

    # The ``data=`` keyword spelling is expected to warn.
    with tm.assert_produces_warning(FutureWarning):
        andrews_curves(data=tiny, class_column='Name')
def multidimensional_plots(df, target_name, maxevents=10000):
    """Return a figure with three multivariate views of standardized data.

    The frame is rescaled to ``(df - mean) / std``, the original
    ``target_name`` column is copied back, the rows are shuffled and at
    most ``maxevents`` of them are drawn as Andrews curves, parallel
    coordinates and a radviz plot.
    """
    # Standardize, then restore the unscaled class column.
    scaled = (df - df.mean()) / df.std()
    scaled[target_name] = df[target_name]

    # Shuffle the rows; truncate so plotting stays fast.
    shuffled = scaled.reindex(np.random.permutation(scaled.index))
    if len(shuffled) > maxevents:
        shuffled = shuffled[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # Panel 1: Andrews curves
    panel = fig.add_subplot(2, 2, 1)
    panel.set_title('Andrews Curves')
    andrews_curves(shuffled, target_name, ax=panel)

    # Panel 2: parallel coordinates
    panel = fig.add_subplot(2, 2, 2)
    panel.set_title('Parallel Coordinates')
    parallel_coordinates(shuffled, target_name, ax=panel,
                         colormap='gist_rainbow')

    # Panel 3: radviz
    panel = fig.add_subplot(2, 2, 3)
    panel.set_title('Radviz Spring Tension')
    radviz(shuffled, target_name, ax=panel, colormap='jet')

    #fig.tight_layout()
    return fig
def multidimensional_plots(df, target_name, maxevents=10000):
    """Build a 9x9 figure showing standardized data in three multivariate plots.

    Steps: rescale ``df`` to ``(df - mean) / std``; put the original
    ``target_name`` column back; shuffle the rows; keep at most
    ``maxevents`` rows; draw Andrews curves, parallel coordinates and
    radviz into subplots 1-3 of a 2x2 grid.  Returns the figure.
    """
    normalized = (df - df.mean()) / df.std()
    normalized[target_name] = df[target_name]  # class column stays unscaled

    df_random = normalized.reindex(np.random.permutation(normalized.index))
    if df_random.shape[0] > maxevents:
        # make sure this doesn't take too long
        df_random = df_random[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # (title, plotting function, extra keyword arguments) per panel
    panels = (
        ('Andrews Curves', andrews_curves, {}),
        ('Parallel Coordinates', parallel_coordinates,
         {'colormap': 'gist_rainbow'}),
        ('Radviz Spring Tension', radviz, {'colormap': 'jet'}),
    )
    for position, (title, plotter, options) in enumerate(panels, 1):
        axis = fig.add_subplot(2, 2, position)
        axis.set_title(title)
        plotter(df_random, target_name, ax=axis, **options)

    #fig.tight_layout()
    return fig
def andrew_curves():
    """Show an Andrews-curves plot of ``data`` grouped by its ``'Class'`` column.

    ``data``, ``X_norm`` and ``y`` are read from the enclosing scope.
    """
    # Complete feature list; not referenced further down.
    all_features = ['Alcohol', 'MalicAcid', 'Ash', 'AlcalinityOfAsh',
                    'Magnesium', 'TotalPhenols', 'Flavanoids',
                    'NonflavanoidPhenols', 'Proanthocyanins',
                    'ColorIntensity', 'Hue', 'OD280/OD315', 'Proline']
    selected_features = ['MalicAcid', 'Ash', 'OD280/OD315', 'Magnesium',
                         'TotalPhenols']

    # NOTE(review): this frame is built but the plot below uses ``data`` -
    # confirm which of the two was meant to be plotted.
    normalized_subset = pd.concat([X_norm[selected_features], y], axis=1)

    andrews_curves(data, 'Class')
    plt.show()
def draw_curve_plot2(dframe, cid, behav):
    """Show Andrews curves of ``dframe`` grouped by ``'avgIC'`` in a resized window.

    ``cid`` and ``behav`` only appear in the plot title; ``behav`` is
    concatenated directly and therefore has to be a string.
    """
    fig, axis = plt.subplots(nrows=1, ncols=1)

    # Andrews' curves
    andrews_curves(dframe, 'avgIC', ax=axis)
    plt.legend(loc='best')

    plt.title('Child: ' + str(cid) + ', Behaviour:' + behav)
    plt.xlabel('session number')
    plt.ylabel('average correct answers')

    # Resize the window to the size reported by ``window.maxsize()``.
    # NOTE(review): ``window.maxsize()`` looks backend specific - confirm
    # the backends this has to run on.
    manager = plt.get_current_fig_manager()
    manager.resize(*manager.window.maxsize())
    plt.show()
def module3():
    """Run the module 3 plotting walkthrough.

    Shows, in order: a histogram, a 2D scatter plot and a 3D scatter plot
    of the wheat data; parallel coordinates and Andrews curves of the iris
    data; and a correlation heat map of random data (drawn but not shown
    here).
    """
    # The Seven Basic Tools of Quality:
    # https://en.wikipedia.org/wiki/Seven_Basic_Tools_of_Quality

    path = "C:/Users/jbennett02/Documents/Magic Briefcase/classwork/edx/Microsoft/DAT210x.b/module3/Datasets/"
    wheat = pd.read_csv(path + "wheat.data")
    matplotlib.style.use('ggplot')  # Look Pretty

    # Histogram
    wheat.asymmetry.plot.hist(title='Asymmetry', bins=10)
    plt.show()

    # 2D scatterplot
    wheat.plot.scatter(x='area', y='perimeter')
    plt.show()

    # 3D scatterplot
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_xlabel('area')
    ax.set_ylabel('perimeter')
    ax.set_zlabel('asymmetry')
    ax.scatter(wheat.area, wheat.perimeter, wheat.asymmetry, c='r', marker='.')
    plt.show()

    # Higher dimensionality visualizations use the iris data
    bunch = load_iris()
    iris = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    iris['target_names'] = [bunch.target_names[label] for label in bunch.target]

    # Parallel coordinates
    plt.figure()
    parallel_coordinates(iris, 'target_names')
    plt.show()

    # Andrews curve
    plt.figure()
    andrews_curves(iris, 'target_names')
    plt.show()

    # Correlation plot of five random columns
    noise = pd.DataFrame(np.random.randn(1000, 5),
                         columns=['a', 'b', 'c', 'd', 'e'])
    correlations = noise.corr()
    print(correlations)
    plt.imshow(correlations, cmap=plt.cm.Blues, interpolation='nearest')
    plt.colorbar()
    tick_marks = list(range(len(noise.columns)))
    plt.xticks(tick_marks, noise.columns, rotation='vertical')
    plt.yticks(tick_marks, noise.columns)
def draw_curve_plot(dframe, cid, behav):
    """Show Andrews curves and per-session answering times for ``dframe``.

    Parameters
    ----------
    dframe : DataFrame; the code reads its ``'is_correct'``, ``'session'``,
        ``'date'`` and ``'answering_time'`` columns.
    cid : identifier placed in the title (converted with ``str``).
    behav : behaviour label placed in the title; must be a string because
        it is concatenated directly.
    """
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)

    # Andrews' curves.  Plot the frame that was passed in; the previous
    # code read an unrelated ``df`` from the enclosing scope here.
    andrews_curves(dframe, 'is_correct', ax=ax1)

    # multiline plot with group by: one line per session
    for key, grp in dframe.groupby(['session']):
        ax2.plot(grp['date'], grp['answering_time'], label="")
    plt.legend(loc='best')

    graphTitle = 'Child: ' + str(cid)
    graphTitle += ', Behaviour:' + behav
    plt.title(graphTitle)
    plt.xlabel('date')
    plt.ylabel('answering time')

    # Resize the window to the size reported by ``window.maxsize()``.
    mng = plt.get_current_fig_manager()
    mng.resize(*mng.window.maxsize())
    plt.show()
def Clonal_Evolution_Multidimensional_Data(self):
    """Save parallel-coordinates, Andrews-curves and radviz plots of all snapshots.

    Every frame in the module-level ``DataStructs`` gets a ``'t'`` column
    holding its position in that sequence (as a float); the frames are
    concatenated, the ``t``/``Size``/``PR``/``MR`` columns are divided by
    their maxima, and three EPS files are written, coloured by ``'ID'``.

    Note: the ``'t'`` column is added to the frames held in ``DataStructs``
    themselves.
    """
    Clonal_Evolution_df = pd.DataFrame()
    for step, df in enumerate(DataStructs):
        df['t'] = pd.Series([float(step)] * len(df), index=df.index)
        if step == 0:
            Clonal_Evolution_df = df
        else:
            Clonal_Evolution_df = pd.concat([Clonal_Evolution_df, df],
                                            ignore_index=True)

    C = Clonal_Evolution_df['ID']
    S = Clonal_Evolution_df['Size']
    M = Clonal_Evolution_df['MR']
    P = Clonal_Evolution_df['PR']
    T = Clonal_Evolution_df['t']

    # Materialise the rows: on Python 3 ``zip`` returns an iterator, which
    # older pandas DataFrame constructors reject.
    rows = list(zip(T / max(T), S / max(S), P / max(P), M / max(M), C))
    Normalised_df = pd.DataFrame(rows,
                                 columns=['t', 'Size', 'PR', 'MR', 'ID'])

    plt.figure()
    parallel_coordinates(Normalised_df, 'ID', colormap='jet').set_title("PC Plot")
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.savefig('Clonal_Evolution_Parallel_Coords_Plot.eps', format='eps',
                dpi=1000)

    plt.figure()
    andrews_curves(Normalised_df, 'ID', colormap='jet')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.savefig('Clonal_Evolution_Andrews_Curves_Plot.eps', format='eps',
                dpi=1000)

    plt.figure()
    radviz(Normalised_df, 'ID', colormap='jet')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.savefig('Clonal_Evolution_RadViz_Plot.eps', format='eps', dpi=1000)
def t4(tp='r'):
    """Show one multivariate plot of the CSV data, grouped by ``'Name'``.

    ``tp`` selects the plot: ``'r'`` radviz, ``'a'`` Andrews curves,
    ``'p'`` parallel coordinates.  Any other value leaves the figure empty.
    """
    # Visualization (conda install pandas): multidimensional data plots.
    # http://cloga.info/%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/2016/10/12/multivariate-data-visualization
    import pandas as pd
    import matplotlib.pyplot as plt

    data = pd.read_csv('file:///e:/stock/xx1')

    from pandas.tools.plotting import andrews_curves
    from pandas.tools.plotting import parallel_coordinates
    from pandas.tools.plotting import radviz

    plt.figure()
    if tp == 'r':
        radviz(data, 'Name')
    else:
        if tp == 'a':
            andrews_curves(data, 'Name')
        elif tp == 'p':
            parallel_coordinates(data, 'Name')
    plt.show()
def plot_regression(self):
    """Draw a tmax/tmin regression plot, an Andrews-curves plot, a box plot and a line plot.

    Reads ``self.filtered`` (``tmax`` and ``tmin`` attributes, plus the
    ``[[2,3]]`` selection) and ``self.df``.
    """
    tmax = self.filtered.tmax
    tmin = self.filtered.tmin
    # First-degree polynomial fit: takes in tmax, returns an estimate for tmin.
    estimate_tmin = poly1d(polyfit(tmax, tmin, 1))

    # fig 1: observations as yellow dots, fitted line dashed black
    plt.plot(tmax, tmin, 'yo', tmax, estimate_tmin(tmax), '--k')
    plt.title("Temperature pattern regression plot")
    plt.xlabel("tmax")
    plt.ylabel("tmin")

    # fig 2: Andrews curves of the selected columns, grouped by tmin
    plt.figure()
    selected = self.filtered[[2,3]]
    andrews_curves(selected, 'tmin')
    plt.title("Andrews curve plot for tmin")
    plt.show()

    # fig 3: box plot of the same selection taken from the full frame
    plt.figure()
    self.df[[2,3]].boxplot()

    # fig 4: line plot of the filtered frame
    self.filtered.plot()
    plt.show()
def plot_regression(self):
    """Plot the tmin-vs-tmax linear fit followed by three overview figures.

    Figure 1 shows the points and the fitted line, figure 2 Andrews curves
    of ``self.filtered[[2, 3]]`` grouped by ``'tmin'``, figure 3 a box plot
    of ``self.df[[2, 3]]`` and figure 4 a line plot of ``self.filtered``.
    """
    filtered = self.filtered
    xs, ys = filtered.tmax, filtered.tmin

    coefficients = polyfit(xs, ys, 1)
    line = poly1d(coefficients)  # maps a tmax value to the fitted tmin

    # fig 1
    plt.plot(xs, ys, 'yo', xs, line(xs), '--k')
    plt.title("Temperature pattern regression plot")
    plt.xlabel("tmax")
    plt.ylabel("tmin")

    # fig 2
    plt.figure()
    andrews_curves(self.filtered[[2, 3]], 'tmin')
    plt.title("Andrews curve plot for tmin")
    plt.show()

    # fig 3
    plt.figure()
    box_source = self.df[[2, 3]]
    box_source.boxplot()

    # fig 4
    self.filtered.plot()
    plt.show()
def Evt_Multi_D_Andrew_Plot(self, event):
    """Event handler: draw Andrews curves of the checked variables per customer.

    Clears the figure of the currently selected tab, plots the selected
    columns of ``self.data`` (rows ``self.minimum:self.maximum``) grouped
    by ``"customer_number"``, hides the axes spines and redraws the canvas.
    ``event`` is not used.
    """
    panel = self.New_Tab.GetPage(self.New_Tab.GetSelection())
    self.selected_checkbox()
    panel.canvas.figure.clf()

    # Second element of every checked entry, plus the class column.
    columns = [entry[1] for entry in self.selected_checkboxes]
    columns.append("customer_number")

    subset = self.data[columns][self.minimum:self.maximum]
    axes = andrews_curves(subset, "customer_number")

    for side in ("left", "right", "top", "bottom"):
        axes.spines[side].set_color("none")

    panel.canvas.draw()
def Evt_Multi_D_Andrew_Plot(self, event):
    """Redraw the active tab with Andrews curves grouped by ``"customer_number"``.

    The plotted columns are the second item of each entry in
    ``self.selected_checkboxes``; rows are limited to
    ``self.minimum:self.maximum``.  The ``event`` argument is ignored.
    """
    current_page = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(current_page)

    # Refresh the checkbox selection, then wipe the old drawing.
    self.selected_checkbox()
    panel.canvas.figure.clf()

    wanted = []
    wanted.extend(variable[1] for variable in self.selected_checkboxes)
    wanted += ["customer_number"]

    rows = slice(self.minimum, self.maximum)
    data = self.data[wanted][rows]
    ax = andrews_curves(data, "customer_number")

    # Hide all four spines.
    spines = ax.spines
    for direction in ["left", "right", "top", "bottom"]:
        spines[direction].set_color("none")

    panel.canvas.draw()
    return
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from pandas.tools.plotting import andrews_curves

matplotlib.style.use('ggplot')

seeds_dataset = pd.read_csv('Datasets/wheat.data')

# Two views of the data: without id/area/perimeter, and without id only.
seeds_dataset1 = seeds_dataset.drop(labels=['id', 'area', 'perimeter'],
                                    axis='columns')
seeds_dataset2 = seeds_dataset.drop(labels='id', axis='columns')

# One Andrews-curves figure per view, grouped by wheat type.
for _view in (seeds_dataset1, seeds_dataset2):
    plt.figure()
    andrews_curves(_view, 'wheat_type')
    plt.show()
import pandas as pd import matplotlib.pyplot as plt import matplotlib # Look pretty... matplotlib.style.use('ggplot') # If the above line throws an error, use plt.style.use('ggplot') instead # Load up SKLearn's Iris Dataset into a Pandas Dataframe data = load_iris() df = pd.DataFrame(data.data, columns=data.feature_names) df['target_names'] = [data.target_names[i] for i in data.target] # Andrews Curves Start Here: plt.figure() andrews_curves(df, 'target_names') plt.show() # # imshow # import matplotlib.pyplot as plt import numpy as np import random random.seed(1) df = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) df.corr()
YL = np.asarray(YL)
YT = np.asarray(YT)
# Collapse every label other than '0' into the single label '2'.
YL[YL != '0'] = '2'
YT[YT != '0'] = '2'

treeclf = RandomForestClassifier(n_estimators=40, criterion='entropy',
                                 max_features='auto', bootstrap=True,
                                 oob_score=True, n_jobs=2,
                                 class_weight="balanced", random_state=42)
treeclf.fit(XL, YL)

# Forest predictions, one array per chunk, for each data split.
learnguesses = [treeclf.predict(chunk) for chunk in Xlearn2]
testguesses = [treeclf.predict(chunk) for chunk in Xtest]
devguesses = [treeclf.predict(chunk) for chunk in Xdev]

# p=1
# make 100% of the data noise
# learnguesses, testguesses, devguesses = simulate_data(p)

# Build the CRF inputs from the forest predictions.
CXlearn = get_crf_data(learnguesses, Xlearn2, Nlearn2, Plearn2)
CXtest = get_crf_data(testguesses, Xtest, Ntest, Ptest)
CXdev = get_crf_data(devguesses, Xdev, Ndev, Pdev)

clf.fit(CXlearn, Ylearn2, CXdev, Ydev)
CYhat = clf.predict(CXtest)

# Parenthesised single-argument print: same output on Python 2 and 3.
print("======CRF PERFORMANCE======")
print_performance(lsum(Ytest), lsum(CYhat))
# joblib.dump(clf, "Model/model.pkl")

# NOTE(review): the original indentation was lost; the plotting calls are
# assumed to belong to ``if visualize`` and the model dump to follow it -
# confirm.
if visualize:
    scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='none')
    plt.figure()
    andrews_curves(df, 'Class')
    plt.show()

joblib.dump(treeclf, "Model/treemodel.pkl")
# Draw a random 30% of every user's rows.  The per-user samples are
# collected first and appended to ``sampleData`` with one concat, instead
# of growing the frame with ``DataFrame.append`` inside the loop (quadratic
# copying, and ``append`` no longer exists in current pandas).
_user_samples = []
for userid in pd.unique(workingData.loc[:, "_user_id"]):
    _user_rows = workingData[workingData._user_id == userid]
    sampleCount = int(.3 * _user_rows["_user_id"].count())
    df = _user_rows.sample(sampleCount)
    _user_samples.append(df)
sampleData = pd.concat([sampleData] + _user_samples)
sampleData.info()

# parallel coordinate plot
from pandas.tools.plotting import parallel_coordinates
plt.figure(figsize=(18, 6))
parallel_coordinates(sampleData, '_user_id')

# andrews curves
from pandas.tools.plotting import andrews_curves
plt.figure(figsize=(18, 6))
andrews_curves(sampleData, '_user_id')

# radviz spring constant plot
from pandas.tools.plotting import radviz
plt.figure(figsize=(12, 10))
radviz(sampleData, "_user_id")

# As can be observed from the plots above, the data is very closely spaced
# without any apparent linear or non-linear boundaries.
# Initial inference - a tree based approach might work better than a
# kernel based boundary fitting approach.

# Plot the predictor variables; feat8 and feat16 are not considered here
# because the majority of their values would be replaced missing values.
features = ["feat11", "feat13", "feat5", "feat2", "feat1", "feat4", "feat7",
            "feat14", "feat10", "feat15", "feat21", "feat3", "feat18",
            "feat9", "feat17", "feat20", "feat6", "feat12", "feat19",
            "feat22"]
workingData[features].hist()
import matplotlib

# Look pretty...
matplotlib.style.use('ggplot')
# If the above line throws an error, use plt.style.use('ggplot') instead

# Load up SKLearn's Iris Dataset into a Pandas Dataframe
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target_names'] = [data.target_names[i] for i in data.target]

# Parallel Coordinates Start Here:
plt.figure()
parallel_coordinates(df, 'target_names')
plt.show()

# Andrews Curves Start Here:
plt.figure()
andrews_curves(df, 'target_names')
plt.show()

# correlation matrix
# Correlate the numeric feature columns only and label the ticks with the
# columns of the correlation matrix itself.  Using ``df.columns`` (which
# also contains the string column 'target_names') yields one more tick
# label than the matrix has rows.
corr = df.drop('target_names', axis=1).corr()
plt.imshow(corr, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
tick_marks = list(range(len(corr.columns)))
plt.xticks(tick_marks, corr.columns, rotation='vertical')
plt.yticks(tick_marks, corr.columns)
plt.show()
def plot_andrews(data_frame, class_name):
    """Save an Andrews-curves plot of ``data_frame`` grouped by ``class_name``.

    The current figure is cleared first; the image is written to
    ``Status.TEMP_DIR`` joined with ``Status.ANDREWS_NAME``.
    """
    target_path = join(Status.TEMP_DIR, Status.ANDREWS_NAME)

    plt.clf()
    andrews_curves(data_frame, class_name)
    plt.title('Andrews Curve')
    # plt.show(block=False)
    plt.savefig(target_path)
import matplotlib.pyplot as plt
from pandas import DataFrame
from pandas.tools.plotting import andrews_curves
from pandas.tools.plotting import parallel_coordinates
from pandas.tools.plotting import scatter_matrix

# Load dataset from sklearn
iris_data = load_iris()

# Concatenate features and target into one array, then into a DataFrame.
# ``reshape(-1, 1)`` turns the target into a column whatever its length.
iris_cat = np.concatenate((iris_data.data,
                           iris_data.target.reshape(-1, 1)), axis=1)
iris_df = DataFrame(iris_cat, columns=['PA', 'PB', 'PC', 'PD', 'Name'])

# Plot the data using
# 1 - Parallel Coordinates
plt.figure()
parallel_coordinates(iris_df, 'Name')

# 2 - Andrews Curves
plt.figure()
andrews_curves(iris_df, 'Name')

# 3 - Scatter plots
# ``scatter_matrix`` opens its own figure, so no ``plt.figure()`` is needed
# (it only left an empty window).  The size keyword is ``figsize``; the
# previous ``figure=(6,6)`` was forwarded to the scatter call instead.
scatter_matrix(iris_df, alpha=0.2, figsize=(6, 6), diagonal='kde')

# Show the plots
plt.show()
def display_andrews_graph(self):
    """Show Andrews curves of ``df`` grouped by its ``"output"`` column.

    ``df`` is read from the enclosing scope, not from ``self``.
    """
    class_column = "output"
    pdplt.andrews_curves(df, class_column, ax=None)
    plt.show()
def time_plot_andrews_curves(self):
    """Call ``andrews_curves`` on ``self.df`` with ``"Name"`` as the class column."""
    frame = self.df
    andrews_curves(frame, "Name")
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from pandas.tools.plotting import andrews_curves

# Look pretty...
matplotlib.style.use('ggplot')

#
# Load the Seeds dataset from 'Datasets/wheat.data'; the first CSV column
# becomes the index.
#
df = pd.read_csv('Datasets/wheat.data', index_col=0)

#
# Dropping 'area' and 'perimeter' is currently disabled:
#
#df.drop(['area', 'perimeter'], axis=1, inplace=True)

#
# Plot Andrews curves grouped by the 'wheat_type' feature.
# NOTE(review): the assignment text asks for alpha=0.4, which is not
# passed here - confirm whether that is intentional.
#
plt.figure()
andrews_curves(df, 'wheat_type')
plt.show()
# It's located at 'Datasets/wheat.data'
#
df = pd.read_csv('Datasets/wheat.data')

#
# Drop the 'id' feature ('area' and 'perimeter' are kept in this variant).
#
df = df.drop(labels=['id'], axis=1)

#
# Plot Andrews curves grouped by the 'wheat_type' feature, with the
# optional display parameter alpha set to 0.4.
#
plt.figure()
andrews_curves(df, 'wheat_type', alpha=0.4)
plt.show()

# Questions:
# Are your outlier samples still easily identifiable in the plot?
# No
#
# After adding in the area and perimeter features, does your plot suffer
# from the same feature scaling issue you had with parallel coordinates?
# No
sns.pairplot(iris, hue='Species', kind='reg')

# 12. Heatmap of the feature correlations
sns.heatmap(iris.corr(), linewidth=0.3, vmax=1.0, square=True,
            linecolor='black', annot=True)

# 13. Boxplot of sepal length per species
sns.boxplot(x='Species', y='SepalLength', data=iris)

# 14. Kdeplot of sepal length, one curve per species
_grid = sns.FacetGrid(iris, hue='Species', size=4)
_grid.map(sns.kdeplot, 'SepalLength').add_legend()

# Andrews curves (without the Id column)
from pandas.tools.plotting import andrews_curves
andrews_curves(iris.drop("Id", axis=1), 'Species')

# radviz (without the Id column)
from pandas.tools.plotting import radviz
radviz(iris.drop("Id", axis=1), 'Species')
#
# Drop the 'id', 'area', and 'perimeter' features
#
df0 = df.drop(['id', 'area', 'perimeter'], axis=1)

#
# Parallel coordinates chart grouped by the 'wheat_type' feature, with the
# optional display parameter alpha set to 0.4
#
plt.figure()
parallel_coordinates(df0, 'wheat_type', alpha=0.4)
plt.show()

# Andrews curves of the reduced frame
plt.figure()
andrews_curves(df0, 'wheat_type')
plt.show()

# Andrews curves again, this time keeping 'area' and 'perimeter'
df1 = df.drop(['id'], axis=1)
plt.figure()
andrews_curves(df1, 'wheat_type')
plt.show()
# Random comparison frame: alternating 0/1 class column 'a' followed by
# twelve columns 'b'..'m' of 500 standard-normal draws each.
_columns = {'a': [0, 1] * 250}
for _name in 'bcdefghijklm':
    _columns[_name] = np.random.randn(500)
df1 = pd.DataFrame(_columns)

# 6.3 Plot Andrews curve. Merely a jumble of lines
andrews_curves(df1, "a")

# 6.4 Plot now for titanic data. There appears to be a structure in that
#     there is clear separation between yellow and green lines
andrews_curves(df, "Survived")

# 7.1 Draw parallel coordinates
# 7.2 First with random data
parallel_coordinates(df1, "a")

# 7.3 Next with titanic data
#     Presence of structure is evident
parallel_coordinates(df, "Survived")

# 8. A final multivariate visualization technique pandas
# Parallel coordinates chart grouped by the 'wheat_type' feature, with the
# optional display parameter alpha set to 0.4
parallel_coordinates(w2, "wheat_type", alpha=0.4)
plt.show()

# Andrews curves (assignment 5)
from pandas.tools.plotting import andrews_curves

andrews_curves(w2, "wheat_type", alpha=0.4)

# Same plot with the perimeter and area measurements added back.
# NOTE(review): the first new column is labelled "parameter" although it
# holds ``wheatData.perimeter`` - confirm the label is intended.
w3 = w2.copy()
w3["parameter"] = wheatData.perimeter
w3["area"] = wheatData.area
andrews_curves(w3, "wheat_type", alpha=0.4)
import matplotlib.pyplot as plt
import matplotlib
from pandas.tools.plotting import andrews_curves

# Look pretty...
matplotlib.style.use('ggplot')

# Load the Seeds dataset; the first CSV column becomes the index.
seeds = pd.read_csv("Datasets/wheat.data", index_col=0)
#seeds=seeds.drop(labels=['area','perimeter'],axis=1)

#
# Plot Andrews curves grouped by the 'wheat_type' feature with the
# optional display parameter alpha set to 0.4.  (The previous value, 4, is
# outside the valid 0-1 opacity range.)
#
plt.figure(dpi=267)
andrews_curves(seeds, 'wheat_type', alpha=0.4)
plt.show()
data.columns = ["age", "bmi", "chol"]
# ``DataFrame.sort`` was removed from pandas; ``sort_values`` is its
# replacement for sorting by column.
data.sort_values(["age"], inplace=True)
# Parenthesised single-argument print: same output on Python 2 and 3.
print(data.corr(method="pearson"))

#%%
# Cholesterol against age as a thick magenta line
line, = plt.plot(data.age, data.chol, "m-", linewidth=5.0)
#line.set_antialiased(False)
plt.setp(line)
plt.xlabel("age")
plt.ylabel("chol")
plt.show()

#ax = plt.subplot(111)
#t = np.arange(0.0, 5.0, 0.01)
#s = np.cos(2*np.pi*t)
#line, = plt.plot(t, s, lw=2)
#plt.annotate('local max', xy=(2, 1), xytext=(3, 1.5),
#             arrowprops=dict(facecolor='black', shrink=0.05),
#             )
#plt.ylim(-2,2)
#plt.show()
#print plt.style.available

#%%
#data.plot(y = data.index)
from pandas.tools.plotting import andrews_curves

# Replace ``data`` with the iris sample file from the pandas repository.
data = pd.read_csv("https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv")

#%%
andrews_curves(data, "Name")
def _plot_box_and_kde(column, filename):
    """Save a two-panel figure for ``column`` of the module-level ``df``.

    Top panel: box plot of the column grouped by ``"classe"``.
    Bottom panel: histogram of the column with a KDE curve on top.
    The figure is written to ``filename`` and then closed.
    """
    fig, axes = plt.subplots(2, 1, figsize=(10, 16))
    values = pd.Series(df[column])
    df.boxplot(column=column, by="classe", ax=axes[0])
    # NOTE(review): ``normed`` is kept from the original code; newer
    # matplotlib releases spell this option ``density`` - confirm the
    # targeted version.
    values.hist(color='g', ax=axes[1], normed=True)
    values.plot(kind="kde", ax=axes[1], style='r-', label=" KDE", legend=True)
    plt.savefig(filename)
    plt.close()


def plot_main():
    """Write the overview plots of the module-level ``df`` to PNG files.

    Produces one box-plot/KDE figure per descriptor column, then a figure
    with Andrews curves and parallel coordinates, a scatter matrix and a
    radviz plot, all grouped by the ``"classe"`` column.

    Changes from the previous version, which indexed the subplot arrays
    inconsistently (``axes[0][k]`` on a two-element array, ``axes[1]`` on
    a 2x4 grid): every descriptor now gets its own 2x1 figure with the box
    plot on top and the histogram/KDE below, and the plot kind is spelled
    in lower case.  The 'moyenne' figure, whose save call had been
    commented out, is saved like the others.
    """
    # (column, output file) for every descriptor
    descriptors = (
        ("moyenne", 'kde_boxplot_moyenne.png'),
        ("ecart-type", 'kde_boxplot_ecart-type.png'),
        ("mediane", 'kde_boxplot_mediane.png'),
        ("entropie", 'kde_boxplot_entropie.png'),
        ("uniformit", 'kde_boxplot_uniformit.png'),
        ("surface", 'kde_boxplot_surface.png'),
        ("eccentricity", 'kde_boxplot_eccentricitie.png'),
    )
    for column, filename in descriptors:
        _plot_box_and_kde(column, filename)

    # Andrews curves & parallel coordinates plotting
    fig, axes = plt.subplots(2, 1, figsize=(10, 17))
    andrews_curves(df, 'classe', ax=axes[0])
    parallel_coordinates(df, 'classe', ax=axes[1])
    plt.savefig('Andrew_curves_df2.png')
    plt.close()

    # Correlation (scatter) matrix
    fig, axes = plt.subplots(1, 1, figsize=(10, 10))
    scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde', ax=axes)
    plt.savefig('matrix_corr_kde.png')
    plt.close()

    # RadViz: visualizing multi-variate data
    fig, axes = plt.subplots(1, 1, figsize=(10, 10))
    radviz(df, "classe", ax=axes)
    plt.savefig('RadViz_df.png')
    plt.close()
# -*- coding: utf-8 -*-
"""
Andrews-curves plot of scikit-learn's iris data set.

Created on Tue Nov 08 12:56:53 2016

@author: paul.buxton
"""

from sklearn.datasets import load_iris
from pandas.tools.plotting import andrews_curves

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# Look pretty...
matplotlib.style.use('ggplot')
# If the above line throws an error, use plt.style.use('ggplot') instead

# Load up SKLearn's Iris Dataset into a Pandas Dataframe; the class column
# holds the species name looked up for every numeric target.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target_names'] = [data.target_names[label] for label in data.target]

# Andrews Curves Start Here:
plt.figure()
andrews_curves(df, 'target_names')
plt.show()
# Every plot below uses the data without the "Id" column.
_features = iris.drop("Id", axis=1)

sns.pairplot(_features, hue="Species", size=3)
plt.show()

# The diagonal elements in a pairplot show the histogram by default.
# They can be switched to other things, such as a kde.
sns.pairplot(_features, hue="Species", size=3, diag_kind="kde")
plt.show()

# Back to plots made with pandas itself: a boxplot of each feature split
# out by species.
_features.boxplot(by="Species", figsize=(12, 6))
plt.show()

# Andrews Curves use the attributes of each sample as coefficients of a
# Fourier series and plot the resulting curves.
from pandas.tools.plotting import andrews_curves
andrews_curves(_features, "Species")
plt.show()

# Parallel coordinates plot each feature on a separate column and draw
# lines connecting the features of each data sample.
from pandas.tools.plotting import parallel_coordinates
parallel_coordinates(_features, "Species")
plt.show()

# Radviz puts each feature as a point on a 2D plane and simulates having
# each sample attached to those points through a spring weighted by the
# relative value for that feature.
from pandas.tools.plotting import radviz
radviz(_features, "Species")
plt.show()
# NOTE(review): the original indentation was lost; the nesting below is a
# reconstruction - confirm which statements belong to each ``if``.
# The print calls are written so they emit the same text on Python 2 and 3.
if (Parser().Get_TEM_File() and Surviving_Phylogeny):
    print("Phylogeny ")
    Parser().Read_Tumour_Evolution_File(False, 2000)
    #Parser().Filtered_Clonal_Value(CE_Frequency_Filter)
    Parser().Raw_Clonal_Values()
    Parser().Surviving_Phylogenetic_Tree()

#Phylogenetic tree reconstructions (switched off by the ``and False``)
if (Parser().Get_TEM_File() and False):
    Parser().Read_Tumour_Evolution_File(False, 2000)
    Parser().Filtered_Clonal_Value(0.1)
    #print Tumour_Evolution.keys()
    #print DataStructs[len(DataStructs)-1]
    #P_H = Tumour_Evolution['P-0:0'][3]
    #print P_H[0], P_H[len(P_H)-1]
    print("PC Ploting")

# Plotting final population parallel coordinates (switched off by the
# ``and False``)
if (Parser().Valid_Final_Population_File() and False):
    Final_Population_df = Parser().Read_Final_Population_File_Top_values(1)
    #Final_Population_df = Parser().Read_Final_Population_File()
    Final_Population_df = Parser().Normalise_df_axis()
    #print Final_Population_df
    print("Ploting %d" % len(Final_Population_df))
    plt.figure()
    #parallel_coordinates(Final_Population_df, 'Extinct', color=['blue','black','red']).set_title("PC Plot")
    andrews_curves(Final_Population_df, 'Extinct', colormap='jet')
    plt.show()
    print("Done")
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from pandas.tools.plotting import andrews_curves
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.tools.plotting import parallel_coordinates

# Iris measurements with short column names plus the numeric class label.
iris = load_iris()
_feature_names = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
df_iris = pd.DataFrame(iris['data'], columns=_feature_names)
df_iris['target'] = iris['target']

print(df_iris.head())
print(df_iris['target'].value_counts())

# pandas scatter plot
df_iris.plot(kind='scatter', x='sepallength', y='sepalwidth')
plt.show()

# seaborn: joint plot and scatter coloured by class
sns.jointplot(x='sepallength', y='sepalwidth', data=df_iris, size=5)
_scatter_grid = sns.FacetGrid(df_iris, hue='target', size=5)
_scatter_grid.map(plt.scatter, 'sepallength', 'sepalwidth').add_legend()

# seaborn: sepal length per class as box, strip and violin plots
sns.boxplot(x='target', y='sepallength', data=df_iris)
ax = sns.boxplot(data=df_iris, x='target', y='sepallength')
ax = sns.stripplot(data=df_iris, x='target', y='sepallength', jitter=True,
                   edgecolor='green')
sns.violinplot(x='target', y='sepallength', data=df_iris, size=5)

# seaborn: kernel density of sepal length per class
_kde_grid = sns.FacetGrid(df_iris, hue='target', size=5)
_kde_grid.map(sns.kdeplot, 'sepallength').add_legend()

# seaborn: pair plots, default diagonal and kde diagonal
sns.pairplot(df_iris, hue='target', size=4)
sns.pairplot(df_iris, hue='target', size=4, diag_kind='kde')

# pandas: box plots by class, Andrews curves, parallel coordinates
df_iris.boxplot(by='target', figsize=(20, 10))
andrews_curves(df_iris, 'target')
parallel_coordinates(df_iris, 'target')
# Look pretty...
matplotlib.style.use('ggplot')

#
# Load up the Seeds Dataset ('wheat.data' in the Datasets folder below)
# into a Dataframe
#
path = "C:/Users/jbennett02/Documents/Magic Briefcase/classwork/edx/Microsoft/DAT210x.b/module3/Datasets/"
df = pd.read_csv(path + "wheat.data")

#
# Only the 'id' feature is dropped; removing 'area' and 'perimeter' is
# disabled.
#
df.drop('id', axis=1, inplace=True)
#df.drop('area', axis=1, inplace=True)
#df.drop('perimeter', axis=1, inplace=True)

#
# Plot an Andrews curve grouped by the 'wheat_type' feature.
# NOTE(review): the assignment text asks for alpha=0.4, which is not
# passed here - confirm whether that is intentional.
#
plt.figure()
andrews_curves(df, 'wheat_type')
plt.show()
plt.savefig('databox.eps', format='eps', dpi=600)
plt.show()

# One box plot per iris class, each saved as EPS and shown.
for _frame, _title, _filename in (
        (setosa, 'Iris-setosa boxplot', 'setobox.eps'),
        (versi, 'Iris-versi boxplot', 'versibox.eps'),
        (verginica, 'Iris-verginica boxplot', 'verginicabox.eps')):
    _frame.boxplot()
    plt.title(_title)
    plt.savefig(_filename, format='eps', dpi=600)
    plt.show()

andrews_curves(data, 'irisclass').legend(bbox_to_anchor=(0.4, 1))
plt.savefig('andrews_curve.eps', format='eps', dpi=600)

# Start a fresh figure: without it radviz draws onto the axes that still
# hold the Andrews curves and both end up in 'radviz.eps'.
plt.figure()
radviz(data, 'irisclass').legend(bbox_to_anchor=(1.1, 1))
plt.savefig('radviz.eps', format='eps', dpi=600)
plt.show()

parallel_coordinates(data, 'irisclass').legend(bbox_to_anchor=(1, 1))
plt.savefig('paracoor.eps', format='eps', dpi=600)
plt.show()

# plot scatter, correlation
sns.set(style="ticks")
sns.pairplot(data, hue="irisclass")
plt.savefig('scatermatrix.eps', format='eps', dpi=600)
plt.show()
import pandas as pd

# Load the iris sample data and preview the first rows.
data = pd.read_csv('https://raw.github.com/pydata/pandas/master/pandas/tests/data/iris.csv')
data.head()


# In[ ]:


try:
    from pandas.plotting import andrews_curves
except ImportError:
    # Older pandas releases only expose andrews_curves under the legacy path.
    from pandas.tools.plotting import andrews_curves

# NOTE(review): `plt` is not imported in this excerpt; presumably an earlier
# cell imports matplotlib.pyplot as plt -- confirm.
plt.figure()
andrews_curves(data, 'Name')


# Table-wise function application
# Custom operations can be performed by passing a function and the appropriate
# number of arguments as pipe arguments, so the operation is applied to the
# whole DataFrame. For example, add the value 2 to every element of the
# DataFrame.

# In[99]:


import pandas as pd
import numpy as np


def adder(ele1, ele2):
    """Return ``ele1 + ele2``."""
    return ele1 + ele2


# Fix the NumPy random seed so later random draws are reproducible.
np.random.seed(293423)
import matplotlib

try:
    from pandas.plotting import andrews_curves
except ImportError:
    # Older pandas releases only expose andrews_curves under the legacy path.
    from pandas.tools.plotting import andrews_curves

# Look pretty...
plt.style.use('ggplot')

#
# Load up the Seeds Dataset into a Dataframe
# It's located at 'Datasets/wheat.data'
#
df = pd.read_csv('Datasets/wheat.data')

#
# Drop the 'id' feature so it is not plotted as a measurement.
# Only 'id' is removed here; 'area' and 'perimeter' stay in the frame.
#
df = df.drop(labels=['id'], axis=1)

#
# Plot Andrews curves grouped by the 'wheat_type' feature, with the
# optional display parameter alpha set to 0.4
#
plt.figure()
andrews_curves(df, 'wheat_type', alpha=0.4)
plt.show()
# Density and box plots of the `dataset` frame that exists at this point.
dataset.plot(kind='kde')
dataset.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)

import pandas

try:
    from pandas.plotting import andrews_curves
    from pandas.plotting import parallel_coordinates
    from pandas.plotting import radviz
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only expose these helpers under the legacy path.
    from pandas.tools.plotting import andrews_curves
    from pandas.tools.plotting import parallel_coordinates
    from pandas.tools.plotting import radviz
    from pandas.tools.plotting import scatter_matrix

# Load the UCI iris data once; the file has no header row, so column names
# are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# Multivariate plots coloured by the 'class' column.
# NOTE(review): none of these calls opens a new figure; outside a notebook
# they may share the current axes -- confirm how this is meant to be run.
radviz(dataset, 'class')

andrews_curves(dataset, 'class')

parallel_coordinates(dataset, 'class')

# Pairwise scatter plots with kernel-density estimates on the diagonal; the
# frame loaded above is reused rather than downloaded again.
scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde')

from sklearn.datasets import load_iris
# Select the columns of interest.
# The charts below are created from this narrower dataframe.
df2 = df[['compactness', 'length', 'width', 'asymmetry', 'groove', 'wheat_type']]

# Create the parallel coordinates chart
from pandas.plotting import parallel_coordinates

plt.figure(figsize=(8, 8))
parallel_coordinates(df2, 'wheat_type', alpha=0.8)

# Save your chart in a selected format
plt.savefig("Wheat_Parallel_Coordinates.png", orientation="landscape", dpi=100)
plt.show()

# Create an andrews curves chart
# Imported from pandas.plotting, the same module used for
# parallel_coordinates above, instead of the deprecated pandas.tools.plotting.
from pandas.plotting import andrews_curves

plt.figure(figsize=(8, 8))
andrews_curves(df2, "wheat_type", alpha=0.8)

# Save your chart in a selected format
plt.savefig("Wheat_Andrews_Curve.png", orientation="landscape", dpi=100)
plt.show()
# Location of the ROBIN data
dir_GD = u"C:/Users/Miguel/Documents/1 Nube/GoogleDrive"
dir_ROBIN = u"/2 Proyectos/RoBiN/Datos RoBiN/México/0_Vigente"
dir_clima = u"/GIS/Mapas_base/2004/clima"

# Work from the 2004 base-maps folder and pick the first entry whose name
# ends in "clima-mx.csv".
os.chdir(dir_GD + dir_ROBIN + u"/GIS/Mapas_base/2004")
archivos = os.listdir(os.curdir)
nombre_archivo = [
    nombre
    for nombre in archivos
    if re.search(".+clima-mx\\.csv$", nombre)
][0]

# Read the climate data
datos = pd.read_csv(nombre_archivo)
datos.head()

# Rows 1-4 of the column at position 3
datos.iloc[1:5, 3]

# Overlaid 100-bin histograms of the columns at positions 3 and 4
fig1 = plt.figure(figsize=(4, 3))
datos.iloc[:, 3].plot(kind="hist", alpha=0.5, bins=100)
datos.iloc[:, 4].plot(kind="hist", alpha=0.5, bins=100)

# Reshape to long format: column 38 is the id variable, column 2 the value.
pp_vars = list(datos.columns[2:3])
zvh_var = [datos.columns[38]]
melted = pd.DataFrame(
    pd.melt(datos, id_vars=zvh_var, value_vars=pp_vars)
)
# NOTE(review): the literal 'Zvh_observed' assumes that is the name of
# column 38 -- confirm against the CSV header.
pp_melted = melted[["value", "Zvh_observed"]]

# Andrews curves of the melted values, grouped by 'Zvh_observed'
plt.figure()
andrews_curves(pp_melted, 'Zvh_observed')
melted.head()