def visualization(file_path, fig_save_path):
    """Plot a balanced random sample of a CSV as parallel coordinates.

    Rows are split into two groups on the value in column 27 (``>= 0.5``
    versus ``< 0.5``). The same number of rows is drawn at random from each
    group, the rows are interleaved, and the sample is plotted, saved and
    shown.

    Args:
        file_path: CSV file read with ``pd.read_csv``.
        fig_save_path: Destination handed to ``plt.savefig``.

    Raises:
        ValueError: If either group is empty, so no balanced sample exists.
    """
    SAMPLE_NUMBER = 3000  # maximum number of random rows drawn per group
    LABEL_COLUMN = 27  # position of the column used to split and colour rows

    data = pd.read_csv(file_path)
    labels = data.iloc[:, LABEL_COLUMN]
    upper = data[labels >= 0.5]
    # Strictly below 0.5, so a row can never land in both groups.
    lower = data[labels < 0.5]

    # random.sample raises when asked for more rows than a group holds.
    count = min(SAMPLE_NUMBER, len(upper), len(lower))
    if count == 0:
        raise ValueError("both label groups need at least one row to sample")
    upper_rows = random.sample(range(len(upper)), count)  # row positions
    lower_rows = random.sample(range(len(lower)), count)

    pieces = []
    for upper_pos, lower_pos in zip(upper_rows, lower_rows):
        # A list indexer keeps each selection a one-row DataFrame.
        pieces.append(upper.iloc[[upper_pos]])
        pieces.append(lower.iloc[[lower_pos]])
    sample = pd.concat(pieces)
    print(sample)

    fig = plt.figure(figsize=(20, 10))
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.axis([0, 28, -8, 8])  # scale: [xmin, xmax, ymin, ymax]

    # The file's first row is consumed as the header, so the label column is
    # named after a data value; look the name up by position instead of
    # guessing between the two possible spellings.
    class_column = data.columns[LABEL_COLUMN]
    parallel_coordinates(sample, class_column, alpha=0.3, ax=ax1)
    plt.savefig(fig_save_path, dpi=150)
    plt.show()
def parallelStrokeDir(strokes):
    """Show the given strokes as a parallel-coordinates plot.

    Each stroke is a sequence of integers in ``range(9)`` (presumably
    direction codes -- confirm with the caller). Position ``i`` of every
    stroke becomes the column labelled ``i + 1``; strokes are numbered from 1
    in the ``Name`` column, longest first.

    Args:
        strokes: Non-empty iterable of integer sequences.
    """
    strokes = sorted(strokes, key=len, reverse=True)
    maxStrokeLength = max(len(stroke) for stroke in strokes)
    strokesSep = {i + 1: [] for i in range(maxStrokeLength)}

    # Running offset per value: every repeat of the same value is nudged
    # 0.01 higher so that coinciding lines do not hide one another.
    si = [-0.05] * 9
    for stroke in strokes:
        for i, s in enumerate(stroke):
            strokesSep[i + 1].append(s + si[s])
            si[s] += 0.01

    # dict.items() exists on both Python 2 and 3; iteritems() does not.
    strokesSep = {
        label: Series(np.asarray(values, dtype=np.float64))
        for label, values in strokesSep.items()
    }
    strokesSep['Name'] = np.asarray(range(1, len(strokes) + 1),
                                    dtype=np.float64)
    df2 = DataFrame(strokesSep)
    parallel_coordinates(df2, 'Name')
    plt.show()
def visualize(config):
    """Write exploratory plots for every dataset listed in ``config``.

    For each entry of ``config['datasets']`` three PNG files are saved in the
    working directory, named after the entry's ``'name'``: a scatter matrix,
    a parallel-coordinates plot and a radviz plot. The last two are grouped
    by the ``'quality'`` column of the entry's ``'df'``.

    Returns:
        The module-level ``OK`` value.
    """
    for entry in config['datasets']:
        frame = entry['df']
        prefix = entry['name']

        # Scatter matrix: the helper opens its own figure.
        scatter_matrix(frame, alpha=0.2, figsize=(20, 20), diagonal='kde')
        plt.savefig(prefix + '_scatter_matrix' + '.png')
        plt.close()

        # Parallel coordinates.
        plt.figure(figsize=(20, 20))
        parallel_coordinates(frame, 'quality')
        plt.savefig(prefix + '_parallel_coordinates' + '.png')
        plt.close()

        # Radviz.
        plt.figure(figsize=(20, 20))
        radviz(frame, 'quality')
        plt.savefig(prefix + '_radviz' + '.png')
        plt.close()

    return OK
def test_parallel_coords(pandas=False, outpath=None):
    """
    Runs the parallel coordinates visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function

    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']

    if pandas:
        parallel_coordinates(data[features + ['occupied']], 'occupied')
        if outpath:
            plt.savefig(outpath)
        else:
            plt.show()
        return

    # ``.values`` replaces ``.as_matrix()``, which newer pandas no longer has.
    X = data[features].values
    y = data.occupied.values

    visualizer = ParallelCoordinates(  # Instantiate the visualizer
        classes=classes, features=features)
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof(outpath=outpath)  # Draw/show/poof the data
def plot_parallel_coordinates(data_frame):
    """Save a parallel-coordinates plot of ``data_frame`` grouped by quality.

    The lines are coloured with six entries of seaborn's ``pastel`` palette
    and the figure is written to ``build/parallel_coordinates.png``.

    Args:
        data_frame: DataFrame containing a ``'quality'`` column.
    """
    palette = seaborn.color_palette('pastel', n_colors=6).as_hex()
    # The keyword is ``color``; ``colors`` is the deprecated spelling.
    plotting.parallel_coordinates(data_frame, 'quality', color=palette)
    figure = pyplot.gcf()
    figure.set_size_inches(15, 15)
    pyplot.savefig('build/parallel_coordinates.png', dpi=300)
    pyplot.clf()
def makeParallel(traindata):
    """Plot up to 200 samples around the middle of the training data.

    Args:
        traindata: Pair ``(x, y)``; ``x`` is indexed as a 2-D array of
            features and ``y`` as a 1-D array holding one class per row.
    """
    x, y = traindata
    num = x.shape[0]
    mid = num // 2  # integer division: slice bounds must be ints
    start = max(mid - 100, 0)  # a negative start would wrap around
    stop = mid + 100

    window = np.hstack((x[start:stop], y[start:stop, None]))
    # list(...) so that the concatenation also works on Python 3.
    column_names = list(range(x.shape[1])) + ["class"]
    df = DataFrame(window, columns=column_names)

    parallel_coordinates(df, 'class')
    plt.title('Parallel coordinates F0-f0')
    plt.savefig('paralell_coordinates_F0-f0.png')
def irisVisualization():
    """Show a gallery of pandas and seaborn plots of the iris dataset.

    Builds a DataFrame from ``load_iris()`` with a categorical ``Species``
    column, draws Andrews curves and parallel coordinates with pandas, then
    several seaborn plots, and finally shows every open figure.
    """
    # pandas.tools.plotting was superseded by pandas.plotting.
    try:
        from pandas.plotting import andrews_curves, parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import andrews_curves, parallel_coordinates

    sns.set(style="white", color_codes=True)
    irisdata = load_iris()
    iris = pd.DataFrame(irisdata.data, columns=irisdata.feature_names)
    iris['Species'] = pd.Categorical.from_codes(irisdata.target,
                                                irisdata.target_names)

    # One figure per pandas plot; otherwise both draw on the same axes.
    plt.figure()
    andrews_curves(iris, "Species")
    plt.figure()
    parallel_coordinates(iris, "Species")

    # seaborn plots
    # NOTE(review): newer seaborn spells the ``size=`` argument ``height=``;
    # left untouched because the installed seaborn version is not visible.
    sns.jointplot(x="sepal length (cm)", y="sepal width (cm)", data=iris,
                  size=5)
    sns.FacetGrid(iris, hue="Species", size=5) \
        .map(plt.scatter, "sepal length (cm)", "sepal width (cm)") \
        .add_legend()
    sns.boxplot(x="Species", y="sepal length (cm)", data=iris)
    sns.violinplot(x="Species", y="sepal length (cm)", data=iris, size=6)
    sns.pairplot(iris, hue="Species", size=3)

    # ``sns.plt`` is no longer exposed by seaborn; use pyplot directly.
    plt.show()
def plot_viz():
    """Plot the z-scored spam attributes as parallel coordinates.

    Reads the first sheet of ``spam_data.xls`` (class label in column 0,
    57 attributes in columns 1-57, 4601 data rows), standardises every
    attribute, plots the rows coloured by class and saves the figure as EPS
    and PNG before showing it.
    """
    n_attrs = 57
    tick_positions = list(range(n_attrs))

    sheet = xlrd.open_workbook('spam_data.xls').sheet_by_index(0)
    names = sheet.row_values(0, 1, 59)
    labels = sheet.col_values(0, 1, 4602)

    # Copy the sheet column by column into a 4601 x 57 matrix.
    features = np.mat(np.empty((4601, 57)))
    for target, col_id in enumerate(range(1, 58)):
        features[:, target] = np.mat(sheet.col_values(col_id, 1, 4602)).T
    names = names[:-1]  # drop the last header cell

    standardised = stats.zscore(features, ddof=1)
    df = pd.DataFrame(data=standardised[:, :n_attrs],
                      columns=names[:n_attrs])
    df['Name'] = labels

    plt.figure()
    parallel_coordinates(df, 'Name', color=['blue', 'red'])

    # Shorter tick labels: strip the common prefix and any colons.
    short_names = []
    for name in names[:n_attrs]:
        short_names.append(name.replace("word_freq_", "").replace(':', ''))
    plt.xticks(tick_positions, short_names, rotation='vertical')

    for extension in ('eps', 'png'):
        plt.savefig('C://Users//Bahram//Desktop//E2015//pic//att20.'
                    + extension,
                    format=extension, dpi=1000, bbox_inches='tight')
    plt.show()
def show_data():
    """Rescale the Ford training data and plot it as parallel coordinates.

    Reads up to 400000 rows of ``fordTrain.csv``, shifts and rescales columns
    3-32 column-wise, writes the result to ``normalized_data.csv`` and plots
    the first 4000 rows of columns 2-32 grouped by ``IsAlert``.
    """
    raw = numpy.genfromtxt('fordTrain.csv', skip_header=1, delimiter=',',
                           max_rows=400000)
    features = raw[:, 3:33]
    alert = raw[:, 2]

    # Shift each column to start at zero, centre it on half its range, then
    # divide by the remaining maximum (all-zero columns divide by 1).
    features = features - features.min(0)
    half_range = (features.max(0) - features.min(0)) / 2
    features = features - half_range
    scale = features.max(0)
    scale[scale == 0] = 1
    features = 2 * features / scale

    # Reassemble: id columns unchanged, label, then the rescaled features.
    out = numpy.zeros(raw.shape)
    out[:, :2] = raw[:, :2]
    out[:, 2] = alert
    out[:, 3:33] = features

    head = 'TrialID,ObsNum,IsAlert,P1,P2,P3,P4,P5,P6,P7,P8,E1,E2,E3,E4,E5,E6,E7,E8,E9,E10,E11,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11'
    numpy.savetxt('normalized_data.csv', out, delimiter=',', header=head,
                  fmt='%1.3f')

    wanted = numpy.r_[2:33]
    data = pandas.read_csv('normalized_data.csv', header=0, usecols=wanted,
                           sep=',', nrows=4000)
    parallel_coordinates(data, 'IsAlert', color=['r', 'b'])
    plt.show()
def drawParallelCoordinatesWithScaledValues(data, columnNames):
    """Plot a random sample of ``data`` with scaled attribute values.

    The first 11 columns are scaled with ``preprocessing.scale``; column 11
    is kept unscaled and used as the class column.

    Args:
        data: DataFrame with at least 12 columns.
        columnNames: Names for the 12 resulting columns; must contain
            ``'quality'``, which is used as the class column.
    """
    # Scale every attribute except quality (don't want to scale that).
    attributes = data.iloc[:, 0:11]
    # Keep the original index so that concat lines the rows up correctly
    # even when ``data`` is not indexed 0..n-1.
    scaled = pandas.DataFrame(preprocessing.scale(attributes),
                              index=data.index)
    quality = data.iloc[:, [11]]

    allAttributes = pandas.concat([scaled, quality], axis=1)
    allAttributes.columns = columnNames

    # Sample at most 1500 rows; random.sample raises when asked for more
    # than exist. Positions are used so duplicate index labels are harmless.
    num = min(1500, len(allAttributes))
    positions = random.sample(range(len(allAttributes)), num)
    allAttributes = allAttributes.iloc[positions]

    plt.figure()
    parallel_coordinates(allAttributes, 'quality')
    plt.title('Parallel coordinates, scaled values ' + 'n: '
              + str(len(allAttributes)))
    plt.ylim(-2, 4)
    plt.show()
def multidimensional_plots(df, target_name, maxevents=10000):
    """Draw Andrews curves, parallel coordinates and radviz in one figure.

    Every column except ``target_name`` is standardised (zero mean, unit
    standard deviation), the rows are shuffled and at most ``maxevents`` of
    them are plotted.

    Args:
        df: Input DataFrame; all non-target columns must be numeric.
        target_name: Column holding the class of each row.
        maxevents: Upper bound on the number of rows plotted.

    Returns:
        The matplotlib figure holding the three subplots.
    """
    # Standardise the features only: the target is a label, may be
    # non-numeric, and is plotted unnormalised anyway.
    features = df.drop(target_name, axis=1)
    df_std = (features - features.mean()) / features.std()
    df_std[target_name] = df[target_name]

    # Shuffle the rows, then cap their number so plotting stays fast.
    df_random = df_std.reindex(np.random.permutation(df_std.index))
    df_random = df_random[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
def plot(self):
    """Draw the clinical scores of all loaded datasets on the canvas.

    One DataFrame is built per entry of ``self.mw._vdata`` and tagged with
    the entry's key in an ``Outcome`` column; the frames are outer-merged
    and plotted as parallel coordinates grouped by ``Outcome``. A warning
    dialog is shown when this fails.
    """
    self.prepare()
    try:
        data_frames = []
        for dset_key in self.mw._vdata.keys():
            dset = self.mw._vdata[dset_key]
            # each dataset has a single patient outcome
            curr_df = pd.DataFrame.from_dict(dset.pat_data_pandas_scores)
            curr_df['Outcome'] = dset_key
            data_frames.append(curr_df)

        # Raises IndexError when nothing is loaded; reported below.
        tot_df = data_frames[0]
        for frame in data_frames[1:]:
            tot_df = pd.merge(tot_df, frame, how='outer')

        ax = self.fig.add_subplot(111)
        ax.set_xlabel('patient')
        ax.set_ylabel('clinical score')
        parallel_coordinates(tot_df, 'Outcome', ax=ax, linewidth=3)
        # Axes.hold no longer exists in newer matplotlib. Calling it
        # unconditionally turned a successful plot into the warning below
        # and skipped the canvas redraw.
        if hasattr(ax, 'hold'):
            ax.hold(False)
        self.canvas.draw()
    except Exception:
        # Deliberately broad: any failure is shown to the user. Unlike a
        # bare except, KeyboardInterrupt and SystemExit still propagate.
        QtGui.QMessageBox.warning(
            self, 'Warning',
            'Parallel coordinates plot failed. Check if there is dataset loaded.'
        )
def test_parallel_coordinates(self):
    """Check line counts, colour handling and deprecated keywords."""
    from pandas.tools.plotting import parallel_coordinates
    from matplotlib import cm

    df = self.iris
    leading_names = df['Name'][:10]

    # Baseline plot: remember how many lines and x ticks it produces.
    ax = _check_plot_works(parallel_coordinates,
                           frame=df, class_column='Name')
    nlines = len(ax.get_lines())
    nxticks = len(ax.xaxis.get_ticklabels())

    # Explicit colours, given as hex strings and as colour names.
    rgba = ('#556270', '#4ECDC4', '#C7F464')
    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    for palette in (rgba, cnames):
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', color=palette)
        self._check_colors(ax.get_lines()[:10], linecolors=palette,
                           mapping=leading_names)

    # Colours taken from a colormap, one per distinct class.
    ax = _check_plot_works(parallel_coordinates,
                           frame=df, class_column='Name', colormap=cm.jet)
    cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
    self._check_colors(ax.get_lines()[:10], linecolors=cmaps,
                       mapping=leading_names)

    # Without the vertical axis lines there is one line fewer per tick.
    ax = _check_plot_works(parallel_coordinates,
                           frame=df, class_column='Name', axvlines=False)
    assert len(ax.get_lines()) == (nlines - nxticks)

    # Legend handles carry the colour of their class.
    colors = ['b', 'g', 'r']
    df = DataFrame({"A": [1, 2, 3],
                    "B": [1, 2, 3],
                    "C": [1, 2, 3],
                    "Name": colors})
    ax = parallel_coordinates(df, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, linecolors=colors)

    # The old keyword names still work but warn.
    with tm.assert_produces_warning(FutureWarning):
        parallel_coordinates(data=df, class_column='Name')
    with tm.assert_produces_warning(FutureWarning):
        parallel_coordinates(df, 'Name', colors=colors)
def multidimensional_plots(df, target_name, maxevents=10000):
    """Draw Andrews curves, parallel coordinates and radviz in one figure.

    Every column except ``target_name`` is standardised (zero mean, unit
    standard deviation), the rows are shuffled and at most ``maxevents`` of
    them are plotted.

    Args:
        df: Input DataFrame; all non-target columns must be numeric.
        target_name: Column holding the class of each row.
        maxevents: Upper bound on the number of rows plotted.

    Returns:
        The matplotlib figure holding the three subplots.
    """
    # Standardise the features only: the target is a label, may be
    # non-numeric, and is plotted unnormalised anyway.
    features = df.drop(target_name, axis=1)
    df_std = (features - features.mean()) / features.std()
    df_std[target_name] = df[target_name]

    # Shuffle the rows, then cap their number so plotting stays fast.
    df_random = df_std.reindex(np.random.permutation(df_std.index))
    df_random = df_random[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
def plot_viz():
    """Plot the z-scored spam attributes as parallel coordinates.

    Reads the first sheet of ``spam_data.xls`` (class label in column 0,
    57 attributes in columns 1-57, 4601 data rows), standardises every
    attribute, plots the rows coloured by class and saves the figure as EPS
    and PNG before showing it.
    """
    attribute_count = 57
    ticks = list(range(attribute_count))

    workbook = xlrd.open_workbook('spam_data.xls')
    sheet = workbook.sheet_by_index(0)
    header = sheet.row_values(0, 1, 59)
    class_label = sheet.col_values(0, 1, 4602)

    # Fill a 4601 x 57 matrix one spreadsheet column at a time.
    matrix = np.mat(np.empty((4601, 57)))
    for col_id in range(1, 58):
        column = np.mat(sheet.col_values(col_id, 1, 4602)).T
        matrix[:, col_id - 1] = column
    header = header[:-1]  # the last header cell is not an attribute name

    z_scores = stats.zscore(matrix, ddof=1)
    df = pd.DataFrame(data=z_scores[:, :attribute_count],
                      columns=header[:attribute_count])
    df['Name'] = class_label

    plt.figure()
    parallel_coordinates(df, 'Name', color=['blue', 'red'])

    # Tick labels without the shared prefix and without colons.
    tick_labels = [
        title.replace("word_freq_", "").replace(':', '')
        for title in header[:attribute_count]
    ]
    plt.xticks(ticks, tick_labels, rotation='vertical')

    save_options = dict(dpi=1000, bbox_inches='tight')
    plt.savefig('C://Users//Bahram//Desktop//E2015//pic//att20.eps',
                format='eps', **save_options)
    plt.savefig('C://Users//Bahram//Desktop//E2015//pic//att20.png',
                format='png', **save_options)
    plt.show()
def multidimensional_plots(df, target_name, maxevents=10000,
                           standardize=False):
    """Draw Andrews curves, parallel coordinates and radviz in one figure.

    Args:
        df: Input DataFrame.
        target_name: Column holding the class of each row.
        maxevents: Upper bound on the number of (shuffled) rows plotted.
        standardize: When true, every column except ``target_name`` is
            scaled to zero mean and unit standard deviation first. The
            default plots the values unchanged.

    Returns:
        The matplotlib figure holding the three subplots.
    """
    if standardize:
        # Previously this flag was accepted but ignored. Rebinding ``df``
        # leaves the caller's frame untouched.
        target = df[target_name]
        features = df.drop(target_name, axis=1)
        df = (features - features.mean()) / features.std()
        df[target_name] = target

    # Shuffle the rows and keep at most ``maxevents`` of them.
    df_random = df.reindex(np.random.permutation(df.index))[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random, target_name, ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
def plot_dmr(covs, cluster_df, covariate, chrom, res, png, weights_df=None):
    """Plot a cluster as parallel coordinates grouped by a covariate.

    Args:
        covs: Object with an attribute named ``covariate`` that holds one
            value per sample.
        cluster_df: DataFrame whose transpose has one row per sample and one
            column per position.
        covariate: Name of the covariate used for grouping.
        chrom: Chromosome name used as a prefix in the column labels.
        res: Unused here.
        png: Unused here.
        weights_df: Optional DataFrame shaped like ``cluster_df``. When
            given, every point is also drawn as a marker sized by its weight.
    """
    from matplotlib import pyplot as plt
    # pandas.tools.plotting was superseded by pandas.plotting.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates

    colors = ('#e41a1c', '#377eb8', '#4daf4a')
    cdf = cluster_df.T
    try:
        cdf.columns = ['%s:%s' % (chrom, "{:,}".format(p))
                       for p in cdf.columns]
    except ValueError:
        # Labels that cannot take a thousands separator are kept as they are.
        cdf.columns = list(cdf.columns)

    mmax = cdf.max().max()
    mmin = cdf.min().min()
    cdf['group'] = getattr(covs, covariate)

    ax = plt.gca()
    if cdf.group.dtype == float:
        ax = parallel_coordinates(cdf, 'group', ax=ax)
        ax.get_legend().set_visible(False)
    else:
        # ``color`` is the current keyword; ``colors`` is deprecated.
        ax = parallel_coordinates(cdf, 'group', color=colors, ax=ax)
        for lbl in ax.get_legend().get_texts():
            lbl.set_text(covariate + ' ' + lbl.get_text())

    if weights_df is not None:
        W = weights_df.T
        W.columns = cdf.columns[:-1]
        # Scale the weights down until the largest marker size is <= 300.
        while W.max().max() > 300:
            W = W.copy() / (W.max().max() / 300)

        # First-appearance order is how parallel_coordinates pairs classes
        # with colours; set() order is arbitrary and could swap them.
        groups = list(cdf['group'].drop_duplicates())
        masks = [cdf['group'] == g for g in groups]
        for icol, cname in enumerate(W.columns):
            for j, in_group in enumerate(masks):
                shade = colors[j % len(colors)]  # reuse colours if needed
                ax.scatter([icol] * int(in_group.sum()),
                           cdf.loc[in_group, cname],
                           edgecolors=shade,
                           facecolors=shade,
                           alpha=0.5,
                           s=W.loc[in_group, cname])
        # NOTE(review): kept inside the weights branch on the assumption
        # that the padding exists for the edge markers -- confirm.
        vals = ax.get_xlim()
        ax.set_xlim(vals[0] - 0.05, vals[1] + 0.05)

    if len(cdf.columns) > 6:
        if len(cdf.columns) > 20:
            # Too many positions: keep every second tick only.
            lbls = ax.get_xticklabels()[::2]
            ax.set_xticks(ax.get_xticks()[::2])
            ax.set_xticklabels([x.get_text() for x in lbls], rotation=10)
        else:
            ax.set_xticklabels([x.get_text() for x in ax.get_xticklabels()],
                               rotation=10)

    ax.set_ylabel('methylation')
    if 0 <= mmin <= mmax <= 1:
        # Values within [0, 1]: never show the axis beyond that range.
        vals = ax.get_ylim()
        ax.set_ylim(max(0, vals[0]), min(1, vals[1]))
def parallel_plot(data):
    """Draw ``data`` as parallel coordinates grouped by ``'prediction'``.

    Args:
        data: DataFrame with a ``'prediction'`` column; the y axis is fixed
            to [-2.5, 2.5].
    """
    from itertools import cycle, islice
    import matplotlib.pyplot as plt
    # pandas.tools.plotting was superseded by pandas.plotting.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates

    # Repeat the five base colours until there is one entry per row.
    my_colors = list(islice(cycle(['b', 'r', 'g', 'y', 'k']), None,
                            len(data)))
    plt.figure(figsize=(15, 8)).gca().axes.set_ylim([-2.5, +2.5])
    parallel_coordinates(data, 'prediction', color=my_colors, marker='o')
def parallelVisualise(data, measure, colors, measurename):
    """Min-max scale every column of ``data`` and plot it by ``measure``.

    Args:
        data: DataFrame of numeric columns. It is not modified.
        measure: Class values, stored under ``measurename``.
        colors: Colours handed to ``parallel_coordinates``.
        measurename: Name of the class column added to the scaled copy.
    """
    mms = MinMaxScaler()
    # Work on a copy: scaling used to overwrite the caller's DataFrame.
    scaled = data.copy()
    for header in list(scaled):
        column = scaled[header].values.reshape(-1, 1)
        # fit_transform returns an (n, 1) array; flatten it for the column.
        scaled[header] = mms.fit_transform(column).ravel()
    scaled[measurename] = measure
    print(scaled)
    parallel_coordinates(scaled, measurename, color=colors)
    plt.show()
def plot_dmr(covs, cluster_df, covariate, chrom, res, png, weights_df=None):
    """Plot a cluster as parallel coordinates grouped by a covariate.

    Args:
        covs: Object with an attribute named ``covariate`` that holds one
            value per sample.
        cluster_df: DataFrame whose transpose has one row per sample and one
            column per position.
        covariate: Name of the covariate used for grouping.
        chrom: Chromosome name used as a prefix in the column labels.
        res: Unused here.
        png: Unused here.
        weights_df: Optional DataFrame shaped like ``cluster_df``. When
            given, every point is also drawn as a marker sized by its weight.
    """
    from matplotlib import pyplot as plt
    # pandas.tools.plotting was superseded by pandas.plotting.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates

    colors = ('#e41a1c', '#377eb8', '#4daf4a')
    cdf = cluster_df.T
    try:
        cdf.columns = ['%s:%s' % (chrom, "{:,}".format(p))
                       for p in cdf.columns]
    except ValueError:
        # Labels that cannot take a thousands separator are kept as they are.
        cdf.columns = list(cdf.columns)

    mmax = cdf.max().max()
    mmin = cdf.min().min()
    cdf['group'] = getattr(covs, covariate)

    ax = plt.gca()
    if cdf.group.dtype == float:
        ax = parallel_coordinates(cdf, 'group', ax=ax)
        ax.get_legend().set_visible(False)
    else:
        # ``color`` is the current keyword; ``colors`` is deprecated.
        ax = parallel_coordinates(cdf, 'group', color=colors, ax=ax)
        for lbl in ax.get_legend().get_texts():
            lbl.set_text(covariate + ' ' + lbl.get_text())

    if weights_df is not None:
        W = weights_df.T
        W.columns = cdf.columns[:-1]
        # Scale the weights down until the largest marker size is <= 300.
        while W.max().max() > 300:
            W = W.copy() / (W.max().max() / 300)

        # First-appearance order is how parallel_coordinates pairs classes
        # with colours; set() order is arbitrary and could swap them.
        groups = list(cdf['group'].drop_duplicates())
        masks = [cdf['group'] == g for g in groups]
        for icol, cname in enumerate(W.columns):
            for j, in_group in enumerate(masks):
                shade = colors[j % len(colors)]  # reuse colours if needed
                ax.scatter([icol] * int(in_group.sum()),
                           cdf.loc[in_group, cname],
                           edgecolors=shade,
                           facecolors=shade,
                           alpha=0.5,
                           s=W.loc[in_group, cname])
        # NOTE(review): kept inside the weights branch on the assumption
        # that the padding exists for the edge markers -- confirm.
        vals = ax.get_xlim()
        ax.set_xlim(vals[0] - 0.05, vals[1] + 0.05)

    if len(cdf.columns) > 6:
        if len(cdf.columns) > 20:
            # Too many positions: keep every second tick only.
            lbls = ax.get_xticklabels()[::2]
            ax.set_xticks(ax.get_xticks()[::2])
            ax.set_xticklabels([x.get_text() for x in lbls], rotation=10)
        else:
            ax.set_xticklabels([x.get_text() for x in ax.get_xticklabels()],
                               rotation=10)

    ax.set_ylabel('methylation')
    if 0 <= mmin <= mmax <= 1:
        # Values within [0, 1]: never show the axis beyond that range.
        vals = ax.get_ylim()
        ax.set_ylim(max(0, vals[0]), min(1, vals[1]))
def parallel_plot(data, P, ylim=(-3, +3), figsize=(15, 8), colors=None):
    """Draw ``data`` as parallel coordinates grouped by ``'prediction'``.

    Args:
        data: DataFrame with a ``'prediction'`` column.
        P: Sized object; its length sets how many default colours are made.
        ylim: ``(bottom, top)`` limits of the y axis.
        figsize: Size of the new figure in inches.
        colors: Colours to use; by default seven base colours are cycled to
            ``len(P)`` entries.
    """
    # The ``ylim`` default is a tuple so it cannot be mutated between calls.
    my_colors = colors
    if my_colors is None:
        base_colors = ['b', 'r', 'g', 'y', 'c', 'k', 'm']
        my_colors = list(islice(cycle(base_colors), None, len(P)))
    plt.figure(figsize=figsize).gca().axes.set_ylim(ylim)
    parallel_coordinates(data, 'prediction', color=my_colors, marker='o')
def plot_parallel_coordinates(data_frame):
    """Save a parallel-coordinates plot of ``data_frame`` grouped by quality.

    The lines are coloured with six entries of seaborn's ``pastel`` palette
    and the figure is written to ``build/parallel_coordinates.png``.

    Args:
        data_frame: DataFrame containing a ``'quality'`` column.
    """
    palette = seaborn.color_palette('pastel', n_colors=6).as_hex()
    # The keyword is ``color``; ``colors`` is the deprecated spelling.
    plotting.parallel_coordinates(data_frame, 'quality', color=palette)
    figure = pyplot.gcf()
    figure.set_size_inches(15, 15)
    pyplot.savefig('build/parallel_coordinates.png', dpi=300)
    pyplot.clf()
def visualize_data(X, Y, name):
    """Plot ``X`` as parallel coordinates coloured by survival.

    Args:
        X: Feature DataFrame.
        Y: DataFrame with a ``'Survived'`` column; the value 1 is labelled
            ``'Survived'`` and everything else ``'Dead'``.
        name: Figure title and file name stem (``name + ".png"``).
    """
    def _label(value):
        if value == 1:
            return 'Survived'
        return 'Dead'

    outcome = Y['Survived'].apply(_label)
    labelled = X.assign(Survived=outcome)

    plt.figure()
    parallel_coordinates(labelled, 'Survived', color=["red", "green"])
    plt.suptitle(name)
    plt.savefig(name + ".png")
    plt.show(block=False)
def parallel_plot():
    """Plot five normalised features as parallel coordinates by class.

    Uses the module-level ``X_norm`` and ``y``; the class column of the
    combined frame is expected to be named ``'Class'``.
    """
    # Features shown in the plot. The unused full feature list and the
    # unused frame built from the raw ``X`` were removed.
    plt_feat1 = ['MalicAcid', 'Ash', 'OD280/OD315', 'Magnesium',
                 'TotalPhenols']
    data_norm = pd.concat([X_norm[plt_feat1], y], axis=1)

    # Perform parallel coordinate plot
    parallel_coordinates(data_norm, 'Class')
    plt.show()
def build_parallel_coordinate_dashboard(train_pre):
    """Plot four features as parallel coordinates grouped by hazard range.

    ``Hazard`` is binned into a ``range`` column on a copy of ``train_pre``;
    only rows with ``Hazard`` below 11 are plotted.
    """
    # NOTE(review): the label text ("(3-6]", "(6-10]") does not match the
    # bin edges (3, 5, 10) -- confirm which of the two is intended.
    hazard_edges = [0, 3, 5, 10, 70]
    hazard_labels = ["(1-3]", "(3-6]", "(6-10]", "(10-70]"]
    plotted_columns = ['range', 'T2_V1', 'T1_V2', 'T2_V2', 'T1_V1']

    frame = train_pre.copy()
    frame['range'] = pd.cut(frame['Hazard'], bins=hazard_edges,
                            labels=hazard_labels)
    low_hazard = frame[frame['Hazard'] < 11]
    parallel_coordinates(low_hazard[plotted_columns], 'range')
def ParrarelCorrelationPlotChart():
    """Show the iris features as parallel coordinates coloured by class."""
    iris = LoadDataset()
    feature_names = ['sepal length', 'sepal width',
                     'petal length', 'petal width']
    # The first four data columns are the features.
    frame = pd.DataFrame(iris.data[:, :4], columns=feature_names)
    frame['class'] = iris.target

    # parallel-coordinates plot
    parallel_coordinates(frame, 'class')
    plt.show()
def build_parallel_coordinate_dashboard(train_pre):
    """Plot four features as parallel coordinates grouped by hazard range.

    ``Hazard`` is binned into a ``range`` column on a copy of ``train_pre``;
    only rows with ``Hazard`` below 11 are plotted.
    """
    # NOTE(review): the label text ("(3-6]", "(6-10]") does not match the
    # bin edges (3, 5, 10) -- confirm which of the two is intended.
    bin_edges = [0, 3, 5, 10, 70]
    bin_names = ["(1-3]", "(3-6]", "(6-10]", "(10-70]"]

    working = train_pre.copy()
    working['range'] = pd.cut(working['Hazard'], bins=bin_edges,
                              labels=bin_names)
    keep = working['Hazard'] < 11
    selection = working[keep][['range', 'T2_V1', 'T1_V2', 'T2_V2', 'T1_V1']]
    parallel_coordinates(selection, 'range')
def pc(df, ref):
    """Plot ``df`` scaled to the range of ``ref`` as parallel coordinates.

    Each column is min-max scaled with the minimum and maximum of ``ref``.
    Axis labels carry the reference minimum and the reference maxima are
    written above the axes. Relies on the module-level ``columns``, ``unit``
    and ``title``.
    """
    labels = ['IVTT', 'WT', 'TP', 'UP', 'FS', 'RL', 'DO']
    lowest = ref.min()
    span = ref.max() - ref.min()
    df_norm = (df - lowest) / span

    # Relabel each axis as "<column> (<reference minimum>)".
    for position, column in enumerate(columns):
        floor = round(ref[column].min(), 2)
        labels[position] = column + ' (' + str(floor) + ')'
    df_norm.columns = labels

    # Every row is its own class, so the legend is removed.
    df_norm['ID'] = df_norm.index
    axes = parallel_coordinates(df_norm, 'ID', colormap='prism')
    axes.legend_.remove()

    # Reference maxima, one per axis.
    for position, column in enumerate(columns, 1):
        ceiling = round(ref[column].max(), 2)
        plt.figtext(position * unit, 0.92, '(' + str(ceiling) + ')',
                    ha='center')

    plt.suptitle(title)
    plt.show()
def module3():
    """Walk through the module 3 plots.

    Shows a histogram, a 2-D and a 3-D scatter plot of the wheat data,
    parallel coordinates and Andrews curves of the iris data, and finally
    draws the correlation matrix of random data as an image.
    """
    # The Seven Basic Tools of Quality:
    # https://en.wikipedia.org/wiki/Seven_Basic_Tools_of_Quality

    # Histogram
    path = "C:/Users/jbennett02/Documents/Magic Briefcase/classwork/edx/Microsoft/DAT210x.b/module3/Datasets/"
    wheat = pd.read_csv(path + "wheat.data")
    matplotlib.style.use('ggplot')  # Look Pretty
    wheat.asymmetry.plot.hist(title='Asymmetry', bins=10)
    plt.show()

    # 2D scatterplot
    wheat.plot.scatter(x='area', y='perimeter')
    plt.show()

    # 3D scatterplot
    axes3d = plt.figure().add_subplot(111, projection='3d')
    axes3d.set_xlabel('area')
    axes3d.set_ylabel('perimeter')
    axes3d.set_zlabel('asymmetry')
    axes3d.scatter(wheat.area, wheat.perimeter, wheat.asymmetry,
                   c='r', marker='.')
    plt.show()

    # Parallel coordinates -- higher dimensionality visualizations
    iris = load_iris()
    flowers = pd.DataFrame(iris.data, columns=iris.feature_names)
    flowers['target_names'] = [iris.target_names[code]
                               for code in iris.target]
    plt.figure()
    parallel_coordinates(flowers, 'target_names')
    plt.show()

    # Andrews curves
    plt.figure()
    andrews_curves(flowers, 'target_names')
    plt.show()

    # Correlation plot
    noise = pd.DataFrame(np.random.randn(1000, 5),
                         columns=['a', 'b', 'c', 'd', 'e'])
    correlations = noise.corr()
    print(correlations)
    plt.imshow(correlations, cmap=plt.cm.Blues, interpolation='nearest')
    plt.colorbar()
    tick_marks = list(range(len(noise.columns)))
    plt.xticks(tick_marks, noise.columns, rotation='vertical')
    plt.yticks(tick_marks, noise.columns)
def test_parallel_coordinates(self):
    """Check line counts, colour handling and deprecated keywords."""
    from pandas.tools.plotting import parallel_coordinates
    from matplotlib import cm

    iris = self.iris
    first_names = iris['Name'][:10]

    def draw(**extra):
        # Plot the iris frame by 'Name' with additional keyword arguments.
        return _check_plot_works(parallel_coordinates, frame=iris,
                                 class_column='Name', **extra)

    # Baseline: number of lines and x ticks of the default plot.
    ax = draw()
    nlines = len(ax.get_lines())
    nxticks = len(ax.xaxis.get_ticklabels())

    # Colours given as hex strings.
    rgba = ('#556270', '#4ECDC4', '#C7F464')
    ax = draw(color=rgba)
    self._check_colors(ax.get_lines()[:10], linecolors=rgba,
                       mapping=first_names)

    # Colours given by name.
    cnames = ['dodgerblue', 'aquamarine', 'seagreen']
    ax = draw(color=cnames)
    self._check_colors(ax.get_lines()[:10], linecolors=cnames,
                       mapping=first_names)

    # Colours sampled from a colormap, one per distinct class.
    ax = draw(colormap=cm.jet)
    cmaps = lmap(cm.jet, np.linspace(0, 1, iris['Name'].nunique()))
    self._check_colors(ax.get_lines()[:10], linecolors=cmaps,
                       mapping=first_names)

    # Dropping the vertical axis lines removes one line per x tick.
    ax = draw(axvlines=False)
    assert len(ax.get_lines()) == (nlines - nxticks)

    # Legend handles use the colour of their class.
    colors = ['b', 'g', 'r']
    small = DataFrame({"A": [1, 2, 3],
                       "B": [1, 2, 3],
                       "C": [1, 2, 3],
                       "Name": colors})
    ax = parallel_coordinates(small, 'Name', color=colors)
    handles, labels = ax.get_legend_handles_labels()
    self._check_colors(handles, linecolors=colors)

    # Deprecated keyword spellings still work but warn.
    with tm.assert_produces_warning(FutureWarning):
        parallel_coordinates(data=small, class_column='Name')
    with tm.assert_produces_warning(FutureWarning):
        parallel_coordinates(small, 'Name', colors=colors)
def plot_parallel(topic_group, topic_group_name):
    """Plot the frequency of a topic group per country across three epochs.

    Args:
        topic_group: Passed to ``compare_cluster_epochs_nationality`` and
            shown in the title.
        topic_group_name: Name shown in the title.

    Returns:
        The matplotlib figure that was drawn.
    """
    epoch_columns = ['country', '1789-1847', '1848-1874', '1875-1914']
    rows = compare_cluster_epochs_nationality(topic_group)
    df_plot = pd.DataFrame(rows, columns=epoch_columns)

    fig = plt.figure(figsize=(11, 4))
    plt.ylabel('relative frequency')
    plt.xlabel('time range')
    heading = 'Topic: {0} {1}: Frequency over Time and by Nationalities'
    plt.title(heading.format(topic_group_name, topic_group))

    parallel_coordinates(df_plot, 'country', colormap='jet', linewidth=5)
    plt.show()
    return fig
def Clonal_Evolution_Multidimensional_Data(self):
    """Save multivariate plots of the clonal evolution snapshots.

    Every frame in the module-level ``DataStructs`` is tagged with its
    position as time ``t``. ``t``, ``Size``, ``PR`` and ``MR`` are each
    divided by their maximum, and the result is saved as parallel
    coordinates, Andrews curves and radviz plots (EPS), grouped by ``ID``.
    Does nothing when ``DataStructs`` is empty.
    """
    def _scaled(series):
        # Divide by the maximum; an all-zero column (for example ``t`` with
        # a single snapshot) is returned unchanged instead of becoming NaN.
        peak = series.max()
        return series / peak if peak else series

    def _save(file_name):
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
        plt.savefig(file_name, format='eps', dpi=1000)

    # assign() returns copies, so the shared frames are left unmodified.
    snapshots = [df.assign(t=float(step))
                 for step, df in enumerate(DataStructs)]
    if not snapshots:
        return
    combined = pd.concat(snapshots, ignore_index=True)

    Normalised_df = pd.DataFrame(
        {
            't': _scaled(combined['t']),
            'Size': _scaled(combined['Size']),
            'PR': _scaled(combined['PR']),
            'MR': _scaled(combined['MR']),
            'ID': combined['ID'],
        },
        columns=['t', 'Size', 'PR', 'MR', 'ID'])

    plt.figure()
    parallel_coordinates(Normalised_df, 'ID',
                         colormap='jet').set_title("PC Plot")
    _save('Clonal_Evolution_Parallel_Coords_Plot.eps')

    plt.figure()
    andrews_curves(Normalised_df, 'ID', colormap='jet')
    _save('Clonal_Evolution_Andrews_Curves_Plot.eps')

    plt.figure()
    radviz(Normalised_df, 'ID', colormap='jet')
    _save('Clonal_Evolution_RadViz_Plot.eps')
def t4(tp='r'):
    """Show one multivariate plot of the stock data grouped by ``'Name'``.

    Multivariate data visualisation with pandas, see
    http://cloga.info/%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/2016/10/12/multivariate-data-visualization

    Args:
        tp: ``'r'`` for radviz, ``'a'`` for Andrews curves, ``'p'`` for
            parallel coordinates. Any other value shows an empty figure.
    """
    import pandas as pd
    import matplotlib.pyplot as plt
    # pandas.tools.plotting was superseded by pandas.plotting.
    try:
        from pandas.plotting import (andrews_curves, parallel_coordinates,
                                     radviz)
    except ImportError:
        from pandas.tools.plotting import (andrews_curves,
                                           parallel_coordinates, radviz)

    data = pd.read_csv('file:///e:/stock/xx1')

    plotters = {
        'r': radviz,
        'a': andrews_curves,
        'p': parallel_coordinates,
    }
    plt.figure()
    plotter = plotters.get(tp)
    if plotter is not None:
        plotter(data, 'Name')
    plt.show()
def coordenatesParallels(base, classe):
    """Show ``base`` as parallel coordinates grouped by column ``classe``.

    The legend is drawn in two small-font columns, anchored at the top-left
    corner of the axes.
    """
    plt.figure(figsize=(10, 8))
    axes = parallel_coordinates(base, classe)
    legend_options = {
        'loc': 'center left',
        'bbox_to_anchor': (0, 1),
        'fancybox': True,
        'ncol': 2,
        'fontsize': 'x-small',
    }
    axes.legend(**legend_options)
    plt.show()
def get_parallel_cordinate(data_to_bo_plotted, label, my_data,
                           class_column_title="class"):
    """Show the data and its labels as a parallel-coordinates plot.

    :param data_to_bo_plotted: numpy array of the data to be plotted
    :param label: label array for ``data_to_bo_plotted``; its transpose is
        appended to the data as the last column
    :param my_data: the data extracted from the .csv file before any
        processing; its first row supplies the column names
    :param class_column_title: name of the column holding the class
    :return: None
    """
    combined = np.concatenate((data_to_bo_plotted, label.T), axis=1)
    # Rows are labelled "1", "2", ... in order.
    row_labels = [str(number)
                  for number in range(1, len(data_to_bo_plotted) + 1)]
    column_names = my_data[0, 0:]  # 1st row as the column names
    frame = pd.DataFrame(data=combined, index=row_labels,
                         columns=column_names)
    parallel_coordinates(frame, class_column_title)
    plt.show()
def plot_top5_batsman(df):
    """Plot the five highest-scoring batsmen of every team.

    Args:
        df: DataFrame with ``'Striker'``, ``'Run_Scored'``,
            ``'Batting_Team'`` and ``'team'`` columns.
    """
    # Explicit copy: assigning into a column selection of ``df`` is a
    # chained assignment that may not take effect.
    df_sub = df[['Striker', 'Run_Scored', 'Batting_Team']].copy()
    df_sub['Run_Scored'] = df_sub['Run_Scored'].astype(int)
    totals = df_sub.pivot_table(index='Striker', columns='Batting_Team',
                                aggfunc='sum')

    # NOTE(review): the teams come from df['team'] although the pivot is
    # keyed by 'Batting_Team' -- confirm both columns hold the same names.
    all_teams = df['team'].unique().tolist()
    top5players = {}
    for team in all_teams:
        runs = totals['Run_Scored'][team].dropna()
        top5players[team] = dict(runs.sort_values(ascending=False)[:5])

    df_plot = pd.DataFrame(top5players).stack().reset_index()
    df_plot.columns = ['Player Name', 'Country', 'Total Score']
    parallel_coordinates(df_plot, class_column='Country')
    plt.show()
def createParallelCoordinates(self, data, base_dir, fileName):
    """Write one parallel-coordinates page per low-cardinality column.

    Every column of ``data`` with between 2 and 20 distinct non-null values
    is used in turn as the class column for a plot of the numeric columns.

    Args:
        data: Input DataFrame.
        base_dir: Directory prefix, concatenated with ``fileName`` as is.
        fileName: Name of the PDF file to create.
    """
    # pandas.tools.plotting was superseded by pandas.plotting.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates

    pdf = PdfPages(''.join([base_dir, fileName]))
    try:
        for cols in data.columns.values:
            n_levels = data[cols].nunique()  # distinct non-null values
            if not 1 < n_levels <= 20:
                continue
            # Copy so that adding the class column never touches ``data``.
            req_data = data._get_numeric_data().copy()
            req_data[cols] = data[cols]
            plt.figure()
            ax = parallel_coordinates(req_data, cols)
            # The title used to say "radviz", which this plot is not.
            ax.set_title(''.join(["plot of parallel coordinates vis ",
                                  cols]))
            page = ax.get_figure()
            pdf.savefig(page)
            plt.close(page)  # do not accumulate open figures
    finally:
        # Finalise the PDF even when a plot fails.
        pdf.close()
def visualize(config):
    """Write exploratory plots for every dataset listed in ``config``.

    For each entry of ``config['datasets']`` three PNG files are saved in the
    working directory, named after the entry's ``'name'``: a scatter matrix,
    a parallel-coordinates plot and a radviz plot. The last two are grouped
    by the ``'quality'`` column of the entry's ``'df'``.

    Returns:
        The module-level ``OK`` value.
    """
    def _save_current(file_name):
        # Write the current figure to disk and release it.
        plt.savefig(file_name)
        plt.close()

    for dataset in config['datasets']:
        values = dataset['df']
        stem = dataset['name']

        # scatter_matrix opens its own figure.
        scatter_matrix(values, alpha=0.2, figsize=(20, 20), diagonal='kde')
        _save_current(stem + '_scatter_matrix' + '.png')

        plt.figure(figsize=(20, 20))
        parallel_coordinates(values, 'quality')
        _save_current(stem + '_parallel_coordinates' + '.png')

        plt.figure(figsize=(20, 20))
        radviz(values, 'quality')
        _save_current(stem + '_radviz' + '.png')

    return OK
def parallel_coordinate():
    """Plot Age, Profit, Loss and Hours/Week grouped by Education.

    Reads ``dataset.txt`` (comma separated, no header row) and shows the
    first 50 rows as a parallel-coordinates plot, to see how the columns
    relate to each other.
    """
    column_names = ['Age', 'JobType', 'Other', 'Education',
                    'Education_Score', 'Relationship', 'Position', 'Status',
                    'Race', 'Sex', 'Profit', 'Loss', 'Hours/Week', 'Country',
                    'Income']
    data = pandas.read_csv('dataset.txt', sep=',', header=None,
                           names=column_names)

    # Keep only the plotted columns. This replaces ten separate
    # ``data.drop(name, 1)`` calls, whose positional ``axis`` argument is
    # rejected by current pandas.
    plotted = ['Age', 'Education', 'Profit', 'Loss', 'Hours/Week']
    data = data[plotted]

    parallel_coordinates(data[:50], 'Education')
    plt.show()
def Evt_Multi_D_Parellel_Plot(self, event):
    """Draw the selected variables on the current tab as parallel coordinates.

    The columns named by ``self.selected_checkboxes`` (second item of each
    entry) are plotted for rows ``self.minimum:self.maximum`` and grouped
    by ``customer_number``.
    """
    page = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(page)
    self.selected_checkbox()

    figure = panel.canvas.figure
    figure.clf()

    data_list = [variable[1] for variable in self.selected_checkboxes]
    data_list.append("customer_number")
    data = self.data[data_list][self.minimum:self.maximum]

    # Draw on the panel's own figure. Without ``ax`` pandas uses pyplot's
    # current axes, which need not belong to this panel.
    ax = parallel_coordinates(data, "customer_number", ax=figure.gca())
    for direction in ["left", "right", "top", "bottom"]:
        ax.spines[direction].set_color("none")
    panel.canvas.draw()
def Evt_Multi_D_Parellel_Plot(self, event):
    """Draw the selected variables on the current tab as parallel coordinates.

    The columns named by ``self.selected_checkboxes`` (second item of each
    entry) are plotted for rows ``self.minimum:self.maximum`` and grouped
    by ``customer_number``.
    """
    page = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(page)
    self.selected_checkbox()

    figure = panel.canvas.figure
    figure.clf()

    data_list = [variable[1] for variable in self.selected_checkboxes]
    data_list.append("customer_number")
    data = self.data[data_list][self.minimum:self.maximum]

    # Draw on the panel's own figure. Without ``ax`` pandas uses pyplot's
    # current axes, which need not belong to this panel.
    ax = parallel_coordinates(data, "customer_number", ax=figure.gca())
    for direction in ["left", "right", "top", "bottom"]:
        ax.spines[direction].set_color("none")
    panel.canvas.draw()
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import os

# pandas.tools.plotting was superseded by pandas.plotting.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

os.chdir("Datasets")

# Look pretty...
plt.style.use('ggplot')

# Load the Seeds dataset into a DataFrame.
df = pd.read_csv("wheat.data", sep=',', header=0)
print(df)

# Drop the first three columns (the 'id', 'area' and 'perimeter' features).
df.drop(df.columns[[0, 1, 2]], axis=1, inplace=True)
print(df)

# Plot a parallel coordinates chart grouped by the 'wheat_type' feature.
parallel_coordinates(df, 'wheat_type', alpha=.4)
plt.show()
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 15 15:54:49 2016

@author: ntelford
"""
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# pandas.tools.plotting was superseded by pandas.plotting.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

matplotlib.style.use('ggplot')

# Iris features plus a column with the species name of every row.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target_names'] = [data.target_names[i] for i in data.target]

plt.figure()
parallel_coordinates(df, 'target_names')
plt.show()
# Per-innings rates.
bowlindata['Wickt/Inning'] = bowlindata['Wickets'] / bowlindata['Innings']
bowlindata['ball/inning'] = bowlindata['Balls'] / bowlindata['Innings']

# Leave out the "best bowling" columns.
_best_bowling_columns = ['Best B Inning1', 'Best B Inning2',
                         'Best B Match1', 'Best B Match2']
cols = []
for col in bowlindata.columns:
    if col not in _best_bowling_columns:
        cols.append(col)
bowling3 = bowlindata[cols]

# Visualising statistics for no of matches played
plt.title("Histogram of No of Matches Played")
bowling3['Matches'].hist(bins=50, figsize=(12, 6))

# Data pruning
India = bowling3[bowling3['Country'] == 'India']
World = bowling3[bowling3['Country'] != 'India']
# NOTE(review): IndiaFinal is not defined in this part of the script and
# ``India`` is not used below -- confirm where IndiaFinal comes from.
Indiadata = IndiaFinal[IndiaFinal['Wickets'] > 50]
Worldfinal = World[World['Innings'] > 30]
worldnew = Worldfinal[Worldfinal['Wickets'] > 50]
data = worldnew.iloc[:, 3:]

# Scale the data for better visualization: divide each column by its range
# and restore the unscaled 'world' column used for grouping.
_column_range = data.max() - data.min()
df_norm = data / _column_range
df_norm['world'] = data['world']

plt.figure(figsize=(16, 6))
plt.title("Rest of the world Bowling Data Visualization using Parallel Coordinates")
parallel_coordinates(df_norm, 'world')

# Writing the output files
worldnew.to_csv("worlddatabowling.csv")
Indiadata.to_csv("indiadatabowling.csv")
plt.hist(Master_df.Num_Adv_Event, bins=100, color='Orange',)
plt.title('Num_Adv_Event')

# Even though the data has really fat tails, do a correlation matrix anyway.
Corr_matrix = Master_df.corr()
Corr_matrix.to_html()

# Standardize the columns and do parallel coordinates.
# pandas.tools.plotting was superseded by pandas.plotting.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates
from sklearn.preprocessing import StandardScaler

# Drop the columns that are not scaled ('Unnamed: 0.1' was listed twice).
Numerical_Add_Adv = Add_Adv_df
Numerical_Add_Adv = Numerical_Add_Adv.drop(
    ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
     'Approval_Year', 'Trade_Name',
     'Active_Ing', 'Lan_Drug_Class',
     'FDA_Drug_Class', 'Innovation_Cat',
     'Top_25', 'Norm_Adv_Event'], axis=1)

scaler = StandardScaler()
scaler.fit(Numerical_Add_Adv)
X_scaled = scaler.transform(Numerical_Add_Adv)
X_scaled = pd.DataFrame(X_scaled,
                        columns=['Num_Adv_Event', 'Num_Serious',
                                 'Num_Other', 'Num_Life_Threat',
                                 'Num_Hosp', 'Num_Congen_Anom',
                                 'Num_Disable', 'Num_Deaths',
                                 'Num_Male', 'Num_Female',
                                 'AE_Per_Year', 'Adj_Num_AE',
                                 'Adj_Per_Year'])

# The scaled frame is indexed 0..n-1, so the class column is re-indexed the
# same way before it is attached.
reindexed_Ino_Cat = Add_Adv_df.Innovation_Cat.reset_index(drop=True)
X_scaled['Innovation_Cat'] = reindexed_Ino_Cat
parallel_coordinates(X_scaled, 'Innovation_Cat')
import pandas
import matplotlib.pyplot as plt

# The plotting helpers live in `pandas.plotting` in current pandas; fall back
# to the legacy `pandas.tools.plotting` location for old installations.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

# Read the iris CSV from a fixed, machine-specific path and plot one line per
# row, grouped by the 'Name' column.
data = pandas.read_csv(
    r'C:\Python27\Lib\site-packages\pandas\tests\data\iris.csv', sep=',')
parallel_coordinates(data, 'Name')
plt.show()
print('\nTarget Description')
print('--------------------')
print(df['class'].describe())
print(df['class'].value_counts())

import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')  # look pretty

# some visualizations
print('\nData Visualization: Parallel Coordinates')
# The plotting helpers live in `pandas.plotting` in current pandas; fall back
# to the legacy `pandas.tools.plotting` location for old installations.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

plt.figure()
parallel_coordinates(df, 'class')
plt.show()

# target: first 100 rows of column 4, encoded as -1 for 'Iris-setosa' and
# +1 for everything else.
print('we will use only two classes: iris-setosa and iris-versicolor')
y = df.iloc[0:100, 4].values
# The bare expressions below are interactive echoes; they have no effect when
# this runs as a plain script.
y
y.shape
y = np.where(y == 'Iris-setosa', -1, 1)
y

# features: columns 0 and 2 of the same 100 rows.
print(
    'we will use only two features for classification: sepal length and petal length'
)
X = df.iloc[0:100, [0, 2]].values
X
def plot_parcoord(df, reporter, patient):
    """Save a parallel-coordinates plot of the known cancer genes.

    Rows of ``df`` selected by the query ``known_cancer_gene`` are plotted
    over three ``alt_freq_*`` columns, grouped by the ``genes`` column. The
    image is written to ``reporter.out_folder`` as
    ``<patient>_parallel_coordinates.png``.

    Note that the current figure is cleared and the global ``figure.figsize``
    rc setting is changed as side effects.
    """
    freq_columns = ['alt_freq_ID_fix', 'alt_freq_REL_fix', 'alt_freq_CR']

    plt.clf()
    plt.rc("figure", figsize=(13, 6.5))

    known_genes = df.query('known_cancer_gene')
    parallel_coordinates(known_genes, 'genes', freq_columns, alpha=0.8)

    file_name = '{}_parallel_coordinates.png'.format(patient)
    target = os.path.join(reporter.out_folder, file_name)
    plt.savefig(target)
def spaghetti_plot(cluster, cov, ax=None, ilogit=False, palette='Set1'):
    """Create a spaghetti plot of a modeled cluster.

    This is best when the number of samples is less than about 20.
    Otherwise, use :func:`~plot_cluster`

    .. plot::
        :include-source: true

        >>> import crystal
        >>> import crystal.utils as cu
        >>> covs, cluster = cu.real_count_cluster()
        >>> covs.head()
        >>> formula = "methylation ~ ko"
        >>> c = crystal.wrapper(crystal.zscore_cluster, formula, cluster, covs, "ko")
        >>> crystal.plot.spaghetti_plot(c, covs)

    .. note:: in the case of CountFeature as from sequence data, the points
              are sized by the sequencing depth.

    Parameters
    ----------
    cluster : mapping
        Must provide ``'cluster'`` (a sequence of features exposing
        ``values`` and ``spos``, plus ``counts`` for ``CountFeature``) and
        ``'var'`` (the name of the grouping covariate).
    cov : mapping or DataFrame
        ``cov[cluster['var']]`` supplies one group value per sample.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure is created when omitted.
    ilogit : bool
        When true, apply the inverse-logit transform to the values first.
    palette : str
        Palette name handed to ``sns.color_palette``.

    Returns
    -------
    The axes the plot was drawn on.
    """
    # The plotting helpers live in `pandas.plotting` in current pandas; fall
    # back to the legacy `pandas.tools.plotting` location for old installs.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates
    from crystal import CountFeature

    features = cluster['cluster']
    # Rows are samples, columns are features after the transpose.
    methylation = np.array([f.values for f in features]).T
    if ilogit:
        methylation = 1 / (1 + np.exp(-methylation))

    df = pd.DataFrame(methylation, columns=[f.spos for f in features])
    var = cluster['var']
    df[var] = [str(x) for x in cov[var]]

    if ax is None:
        _, ax = plt.subplots(1)
    colors = sns.color_palette(palette)
    ax = parallel_coordinates(df, var, color=colors, ax=ax,
                              use_columns=False)

    # pandas adds a dark axvline per column (drawn last); remove them.
    lines = ax.get_lines()
    for _ in range(len(features)):
        lines.pop().remove()

    ax.legend(loc='best')
    legend = ax.get_legend()
    legend.set_frame_on(True)
    legend.get_frame().set_facecolor('white')
    legend.get_frame().set_alpha(0.5)

    if isinstance(features[0], CountFeature):
        counts = np.array([f.counts for f in features]).T
        # parallel_coordinates assigns colours to classes in order of first
        # appearance (sort_labels is not set), so the markers must use the
        # same order to match their lines. Cycle through the palette when
        # there are more groups than colours.
        groups = list(df[var].drop_duplicates())
        for icol in range(len(features)):
            for j, group in enumerate(groups):
                sel = np.array(df[var] == group)
                group_color = colors[j % len(colors)]
                ax.scatter([icol] * sel.sum(), methylation[sel, icol],
                           edgecolors=group_color, facecolors=group_color,
                           alpha=0.5, s=counts[sel, icol])

    plt.draw()
    # Pad the x-range slightly so the outermost columns are not clipped.
    xmin, xmax = ax.get_xlim()
    ax.set_xlim(int(xmin) - 0.05, int(xmax) + 0.05)
    ax.get_legend().set_title(var)
    ax.set_ylabel('methylation')

    # NOTE(review): the return value of axes_style is not used here, so this
    # call looks like a no-op -- confirm before removing.
    sns.axes_style({'axes.linewidth': 0, 'axes.grid': False})
    sns.despine()
    sns.set_style("ticks")
    return ax
def compute_initial_figure(self):
    """Read the iris CSV from its fixed relative path and draw it as a
    parallel-coordinates plot grouped by the ``Name`` column."""
    iris_path = '../examplesAndSandboxCode/irisdata.txt'
    frame = pd.read_csv(iris_path)
    parallel_coordinates(frame, 'Name')
# Write the dataset as headerless, index-free TSV.
df.to_csv(OUTPATH, sep='\t', index=False, header=False)
# Single-argument print(...) calls behave identically on Python 2 and 3.
print("Wrote dataset of %i instances and %i attributes to %s" % (df.shape + (OUTPATH,)))

# Store feature and target names alongside the data.
with open('meta.json', 'w') as f:
    meta = {'feature_names': FEATURES, 'target_names': LABEL_MAP}
    json.dump(meta, f, indent=4)

# Describe the dataset
print(df.describe())

# Determine the shape of the data
print("{} instances with {} features\n".format(*df.shape))

# Determine the frequency of each class
print(df.groupby('label')['label'].count())

# Create a scatter matrix of the dataframe features
scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
plt.show()

# Parallel Coordinates
plt.figure(figsize=(12, 12))
parallel_coordinates(df, 'label')
plt.show()

# Radviz
plt.figure(figsize=(12, 12))
radviz(df, 'label')
plt.show()
def plotParallelCoordinates():
    """Show a parallel-coordinates plot of the semicolon-separated CSV at the
    module-level ``csv_path``, grouped by its ``quality`` column."""
    frame = pandas.read_csv(csv_path, sep=';')

    plt.figure()
    parallel_coordinates(frame, 'quality')
    plt.show()
#
# TODO: Load up the Seeds Dataset into a Dataframe
# It's located at 'Datasets/wheat.data'
#
# The first column of the file becomes the index (index_col=0).
df = pd.read_csv("Datasets/wheat.data", index_col=0, header=0)

#
# TODO: Drop the 'id', 'area', and 'perimeter' feature
#
# NOTE(review): presumably 'id' is the first column and is therefore already
# consumed as the index above -- confirm against the data file.
df = df.drop(['area', 'perimeter'], axis=1)

#
# TODO: Plot a parallel coordinates chart grouped by
# the 'wheat_type' feature. Be sure to set the optional
# display parameter alpha to 0.4
#
plt.figure()
parallel_coordinates(df, 'wheat_type', alpha=0.4)
plt.show()
# Missing positions default to 'F'. Assign the result back explicitly: an
# in-place fillna on the Series returned by attribute access is not guaranteed
# to write through to the parent frame.
NBA_players['pos'] = NBA_players['pos'].fillna('F')
#NBA_players.pos.value_counts()

# Decided to change to Gs and Fs because of All NBA teams. The replacement is
# applied to every column of the frame, as in the four separate calls it
# replaces.
NBA_players.replace({'SG': 'G', 'PG': 'G', 'SF': 'F', 'PF': 'F'},
                    inplace=True)

# First just preprocess by taking average
# Exploratory analysis
# Find most important features: parallel coordinates; correlation matrix
NBA_players.to_csv('NBA_players.csv')
corrMat = NBA_players.corr()
NBA_players.columns

# The plotting helpers live in `pandas.plotting` in current pandas; fall back
# to the legacy `pandas.tools.plotting` location for old installations.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

# A flat list of column labels. The original wrapped this in a second list,
# which pandas does not treat as plain column labels when passed as
# `columns=`.
features = ["g", "gs", "mp", "fg", "fga", "fg_", "x3p", "x3pa", "x3p_",
            "x2p", "x2pa", "x2p_", "ft", "fta", "ft_", "orb", "drb", "trb",
            "ast", "stl", "blk", "tov", "pf", "pts"]
#features = ['g','gs','mp','pts','AST%','PER','STL%','USG%','FTr','3PAr','TS%','VORP','BPM','OBPM','DBPM','WS','WS/48','DWS','OWS',]
NBA_df = pd.DataFrame(NBA_players, columns=features)
NBA_df['team'] = NBA_players.team

# Frame and class column are passed positionally: the `data=` keyword alias
# is not accepted by current pandas.
parallel_coordinates(NBA_df, 'team')
# Histogram of the class variable.
n, bins, patches = plt.hist(lymph['class'], facecolor='green')
plt.xlabel('class')
plt.grid(True)
#plt.show()

# Remove classes: keep only the rows whose class is 1 or 2.
lymph = lymph[lymph['class'].isin([1, 2])]
print(len(lymph))

# Parallel coordinates of the remaining rows, grouped by class.
plt.figure()
parallel_coordinates(lymph, 'class', colormap='gist_rainbow')
plt.xticks(rotation=30)
#plt.show()

seeds = [31, 67, 321, 5, 76, 43, 12, 11]
class_names = ["metastases", "malign lymph"]

# Three tree variants: default CART, extra tree, and a size-limited CART.
clf = tree.DecisionTreeClassifier()
clf_extra = tree.ExtraTreeClassifier()
clf_pruned_cart = tree.DecisionTreeClassifier(min_samples_leaf=3,
                                              max_depth=5)

# One (initially empty) list of test errors per classifier.
test_errors_CART, test_errors_Extra, test_errors_cart_pruned = [], [], []
# http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html

# ---------------------------------------
# EXERCISE: Create a Parallel Coordinates
# visualization with the classes
# ---------------------------------------

#================================
# Option 3: Parallel Coordinates
# The plotting helpers live in `pandas.plotting` in current pandas; fall back
# to the legacy `pandas.tools.plotting` location for old installations.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

# I'm going to convert to a pandas dataframe
# Using a snippet of code we learned from one of Kevin's lectures!
iris_df['Name'] = iris.target_names[iris.target]
# Frame and class column are passed positionally and the palette via `color`:
# the old `data=` / `colors=` keyword aliases are not accepted by current
# pandas.
parallel_coordinates(iris_df, 'Name',
                     color=['#FF0054', '#FBD039', '#23C2BC'])

# DETERMINING THE NUMBER OF CLUSTERS
# How do you choose k? There isn't a bright line, but we can evaluate
# performance metrics such as the silhouette coefficient and within sum
# of squared errors across values of k.
#
# scikit-learn Clustering metrics documentation:
# http://scikit-learn.org/stable/modules/classes.html#clustering-metrics

# Create a bunch of different models
k_rng = range(1, 15)
est = [KMeans(n_clusters=k).fit(d) for k in k_rng]
# NOTE(review): the four statements below sat before a closing triple-quote
# in the original, i.e. they appear to be disabled one-time conversion code
# that produced 'lymphoma.csv'. They are kept as comments so this section is
# self-contained.
# lines = list(list(int(i) for i in line if i) for line in lines)
# data = np.array(lines)
# pd_data = pd.DataFrame(data)
# pd_data.to_csv('lymphoma.csv')

pd_data = pd.read_csv("lymphoma.csv", header=0, index_col=0)
data = pd_data.values
# A single formatted argument prints the same text on Python 2 and 3.
print("%s %s" % (type(data), data.shape))

# replace missing values (coded as 999), just as in the paper
generator = np.random.RandomState(0)
idx = np.where(data == 999)
data[idx] = generator.randint(-800, 801, len(idx[0]))

# cluster with same parameters as original paper
model = ChengChurch(n_clusters=100, max_msr=1200,
                    deletion_threshold=1.2, inverse_rows=True,
                    random_state=0)
model.fit(data)


def msr(a):
    """Return the mean squared residue of the 2-D array ``a``."""
    residue = a - a.mean(axis=1, keepdims=True) - a.mean(axis=0) + a.mean()
    return np.power(residue, 2).mean()


# find bicluster with smallest msr and plot it
msrs = [msr(model.get_submatrix(i, data)) for i in range(100)]
arr = model.get_submatrix(np.argmin(msrs), data)
print("%s %s" % (type(arr), arr.shape))

df = DataFrame(arr)
# Build a real list: on Python 3 `map` returns an iterator, which cannot be
# assigned as a column.
df["row"] = [str(i) for i in range(arr.shape[0])]
parallel_coordinates(df, "row", linewidth=1.5)
plt.xlabel("column")
plt.ylabel("expression level")
# Drop the legend: every row is its own class.
plt.gca().legend_ = None
plt.show()
project_names = [
    'keystone',
    'horizon',
    'glance',
    'swift',
    'nova',
    'cinder',
    'neutron',
    'heat',
    'ceilometer',
    'ironic',
    'trove',
    'designate',
    'sahara',
    'zaqar',
    'barbican',
]
tempest_tests = [141, 192, 238, 374, 1142, 2649, 3076, 1731, 1689]
openstack_projects = [3, 5, 7, 7, 9, 10, 11, None, None]

# NOTE(review): `tests_per_project` is not defined in the visible code, and
# this frame is replaced by the CSV read two statements below -- confirm
# whether this line is still needed.
df = pandas.DataFrame(tests_per_project)

plt.figure()
df = pandas.read_csv('i_hate_pandas.csv')
plt.ylabel('# of tests')

# One line per row of the CSV, grouped by its 'name' column.
ax = parallel_coordinates(df, 'name')
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, 1.05),
    ncol=3,
    fancybox=True,
    shadow=True,
)
plt.savefig('tests_per_proj.png', dpi=900)
# ---------------------------------------
# EXERCISE: Create a Parallel Coordinates
# visualization with the classes
# ---------------------------------------

#================================
# Option 3: Parallel Coordinates
# The plotting helpers live in `pandas.plotting` in current pandas; fall back
# to the legacy `pandas.tools.plotting` location for old installations.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates

# I'm going to convert to a pandas dataframe
# Using a snippet of code we learned from one of Kevin's lectures!
# Each feature name loses its last five characters, is title-cased, and has
# its spaces removed.
features = [name[:-5].title().replace(' ', '') for name in iris.feature_names]
iris_df = pd.DataFrame(iris.data, columns=features)
iris_df['Name'] = iris.target_names[iris.target]
# Frame and class column are passed positionally and the palette via `color`:
# the old `data=` / `colors=` keyword aliases are not accepted by current
# pandas.
parallel_coordinates(iris_df, 'Name',
                     color=['#FF0054', '#FBD039', '#23C2BC'])

# DETERMINING THE NUMBER OF CLUSTERS
# How do you choose k? There isn't a bright line, but we can evaluate
# performance metrics such as the silhouette coefficient and within sum
# of squared errors across values of k.
#
# scikit-learn Clustering metrics documentation:
# http://scikit-learn.org/stable/modules/classes.html#clustering-metrics

# Create a bunch of different models
k_rng = range(1, 15)
est = [KMeans(n_clusters=k).fit(d) for k in k_rng]