コード例 #1
0
def visualization(file_path,fig_save_path):
    """Plot a class-balanced random sample of a CSV file as parallel coordinates.

    The column at position 27 is used as the label: rows with a value >= 0.5
    form one group and rows with a value <= 0.5 form the other.  The same
    number of rows is drawn at random from each group, the rows are
    interleaved, printed, plotted, saved and finally shown.

    Parameters
    ----------
    file_path : str
        CSV file to read with ``pandas.read_csv``.
    fig_save_path : str
        Where the figure is written (150 dpi).
    """
    SAMPLE_NUMBER = 3000  # upper bound on the rows drawn from each group
    data = pd.read_csv(file_path)

    # ``DataFrame.icol`` no longer exists in pandas; ``iloc`` is the
    # positional equivalent.
    label = data.iloc[:, 27]
    # NOTE: a value of exactly 0.5 puts the row in both groups (unchanged).
    set1 = data[label >= 0.5]
    set2 = data[label <= 0.5]

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more rows than the smaller group holds.
    count = min(SAMPLE_NUMBER, set1.shape[0], set2.shape[0])
    random1 = random.sample(range(set1.shape[0]), count)  # positional row indices
    random2 = random.sample(range(set2.shape[0]), count)

    # Interleave one-row slices of both groups so neither is drawn entirely
    # on top of the other.
    rows = []
    for pos1, pos2 in zip(random1, random2):
        rows.append(set1.iloc[pos1:pos1 + 1])
        rows.append(set2.iloc[pos2:pos2 + 1])
    sample = pd.concat(rows)
    print(sample)

    fig = plt.figure(figsize=(20, 10))
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.axis([0,28,-8,8])  # scale: [xmin, xmax, ymin, ymax]

    # The first data row ended up as the header, so the label column is named
    # after the value it held in that row: try "1.0" first, then "0.0".
    class_column = "1.000000000000000000e+00"
    if class_column not in sample.columns:
        class_column = "0.000000000000000000e+00"
    parallel_coordinates(sample, class_column, alpha=0.3, ax=ax1)

    plt.savefig(fig_save_path,dpi=150)
    plt.show()
コード例 #2
0
def parallelStrokeDir(strokes):
    """Show stroke sequences as a parallel-coordinates chart, one line per stroke.

    Position ``i`` of every stroke becomes the value on axis ``i + 1``.  A
    small offset is added to each value so that strokes sharing a code do not
    draw exactly on top of each other.

    Parameters
    ----------
    strokes : sequence of sequences of int
        Every element is also used as an index into a 9-entry offset table,
        so it must be a valid index for a list of length 9.
    """
    # Longest first: row k of every column then belongs to stroke k, and
    # shorter strokes simply stop contributing to the later columns.
    strokes = sorted(strokes, key=len, reverse=True)

    maxStrokeLength = max([len(stroke) for stroke in strokes])

    strokesSep = {i + 1: [] for i in range(maxStrokeLength)}

    # Per-code offset, shared across all axes: starts at -0.05 and grows by
    # 0.01 every time that code is plotted.
    si = [-0.05 for s in range(9)]

    for stroke in strokes:
        for i, s in enumerate(stroke):
            strokesSep[i + 1].append(s + si[s])
            si[s] += 0.01

    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` only on 2.
    strokesSep = {label: Series(np.asarray(data, dtype=np.float64))
                  for label, data in strokesSep.items()}

    # 1-based stroke number, used as the class column of the plot.
    strokesSep['Name'] = np.asarray(range(1, len(strokes) + 1), dtype=np.float64)

    df2 = DataFrame(strokesSep)

    parallel_coordinates(df2, 'Name')
    plt.show()
コード例 #3
0
def visualize(config):
    """Write three exploratory plots to disk for every dataset in ``config``.

    For each entry of ``config['datasets']`` (a mapping with a ``'df'`` frame
    and a ``'name'`` prefix) a scatter matrix, a parallel-coordinates plot and
    a RadViz plot are saved as ``<name>_<plot>.png`` in the working directory.
    The last two use the ``'quality'`` column as the class.

    Returns ``OK``, which is not defined in this block (presumably a
    module-level status constant).
    """
    # Plots that need an explicitly created figure, with their file suffix.
    class_plots = (
        ('_parallel_coordinates.png', parallel_coordinates),
        ('_radviz.png', radviz),
    )

    for entry in config['datasets']:
        frame = entry['df']

        # No figure is created beforehand here; the size goes to the call.
        scatter_matrix(frame, alpha=0.2, figsize=(20, 20), diagonal='kde')
        plt.savefig(entry['name'] + '_scatter_matrix.png')
        plt.close()

        for suffix, draw in class_plots:
            plt.figure(figsize=(20, 20))
            draw(frame, 'quality')
            plt.savefig(entry['name'] + suffix)
            plt.close()

    return OK
コード例 #4
0
def test_parallel_coords(pandas=False, outpath=None):
    """
    Runs the parallel coordinates visualizer on the dataset.

    Parameters
    ----------
    pandas : bool
        Run the pandas version of the function
    outpath : path or None
        Save the figure to disk rather than show (if None)
    """
    data = load_data('occupancy')  # Load the data
    features = ['temp', 'humid', 'light', 'co2', 'hratio']
    classes = ['unoccupied', 'occupied']

    if pandas:
        parallel_coordinates(data[features + ['occupied']], 'occupied')
        if outpath:
            plt.savefig(outpath)
        else:
            plt.show()
        return

    # ``as_matrix()`` is gone from current pandas; ``.values`` yields the same
    # arrays.  They are only needed by the visualizer, so build them here.
    X = data[features].values
    y = data.occupied.values

    visualizer = ParallelCoordinates(  # Instantiate the visualizer
        classes=classes, features=features)
    visualizer.fit(X, y)  # Fit the data to the visualizer
    visualizer.transform(X)  # Transform the data
    visualizer.poof(outpath=outpath)  # Draw/show/poof the data
コード例 #5
0
def plot_parallel_coordinates(data_frame):
    """Save a parallel-coordinates plot of ``data_frame`` classed by ``'quality'``.

    The plot uses six colours from seaborn's ``'pastel'`` palette, is resized
    to 15x15 inches and written to ``build/parallel_coordinates.png`` at
    300 dpi; the current figure is cleared afterwards.
    """
    palette = seaborn.color_palette('pastel', n_colors=6).as_hex()
    # ``color`` is the supported keyword; the older ``colors`` spelling is
    # deprecated in pandas.
    plotting.parallel_coordinates(data_frame, 'quality', color=palette)

    figure = pyplot.gcf()
    figure.set_size_inches(15, 15)
    pyplot.savefig('build/parallel_coordinates.png', dpi=300)
    pyplot.clf()
コード例 #6
0
def makeParallel(traindata):
    """Save a parallel-coordinates plot of the middle rows of a training set.

    Parameters
    ----------
    traindata : tuple
        ``(x, y)`` where ``x`` is a 2-D array of features and ``y`` a 1-D
        array of labels with one entry per row of ``x``.

    Up to 200 rows centred on the middle of ``x`` are plotted, with the
    feature columns named ``0 .. n-1`` and the label column named ``"class"``.
    The figure is written to ``paralell_coordinates_F0-f0.png``.
    """
    x, y = traindata
    num = x.shape[0]

    # Floor division keeps the slice bounds integral on Python 3; the lower
    # bound is clamped so short inputs do not wrap around to the array's end.
    mid = num // 2
    window = slice(max(mid - 100, 0), mid + 100)

    # ``range`` is not a list on Python 3, so convert before concatenating.
    column_names = list(range(x.shape[1])) + ["class"]
    df = DataFrame(np.hstack((x[window], y[window, None])), columns=column_names)

    parallel_coordinates(df, 'class')
    plt.title('Parallel coordinates F0-f0')
    plt.savefig('paralell_coordinates_F0-f0.png')
コード例 #7
0
def irisVisualization():
    """Draw a set of pandas and seaborn plots of the iris data and show them.

    The data comes from ``load_iris()``; a categorical ``'Species'`` column is
    added from the target codes and used as the class/hue in every plot.
    """
    # These helpers moved to ``pandas.plotting``; fall back to the old
    # location for older pandas installations.
    try:
        from pandas.plotting import andrews_curves, parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import andrews_curves, parallel_coordinates

    sns.set(style="white", color_codes=True)
    irisdata = load_iris()
    iris = pd.DataFrame(irisdata.data, columns=irisdata.feature_names)
    iris['Species'] = pd.Categorical.from_codes(irisdata.target,
                                                irisdata.target_names)
    # Columns: sepal length (cm), sepal width (cm), petal length (cm),
    # petal width (cm)

    # pandas plots -- these draw on the current axes, so give each one its
    # own figure instead of overlaying them.
    plt.figure()
    andrews_curves(iris, "Species")
    plt.figure()
    parallel_coordinates(iris, "Species")

    # seaborn plots
    # NOTE(review): newer seaborn releases spell ``size`` as ``height`` for
    # jointplot/FacetGrid/pairplot -- confirm against the installed version.
    sns.jointplot(x="sepal length (cm)",
                  y="sepal width (cm)",
                  data=iris,
                  size=5)
    sns.FacetGrid(iris, hue="Species", size=5) \
        .map(plt.scatter, "sepal length (cm)", "sepal width (cm)").add_legend()
    # boxplot/violinplot also draw on the current axes; without a fresh
    # figure they would land on top of the FacetGrid scatter.
    plt.figure()
    sns.boxplot(x="Species", y="sepal length (cm)", data=iris)
    plt.figure()
    sns.violinplot(x="Species", y="sepal length (cm)", data=iris, size=6)
    sns.pairplot(iris, hue="Species", size=3)

    # ``sns.plt`` is no longer exposed by seaborn; use pyplot directly.
    plt.show()
コード例 #8
0
def plot_viz():
    """Plot the z-scored attributes of ``spam_data.xls`` as parallel coordinates.

    Reads the first sheet of the workbook: column 0 (rows 1-4601) is the class
    label and columns 1-57 are the attributes.  The attributes are
    standardised with ``stats.zscore(ddof=1)``, plotted with the class in blue
    and red, and the figure is saved as EPS and PNG before being shown.
    """
    n_attrs = 57
    sheet = xlrd.open_workbook('spam_data.xls').sheet_by_index(0)

    # Header cells of columns 1..58, minus the last one.
    header = sheet.row_values(0, 1, 59)[:-1]
    class_labels = sheet.col_values(0, 1, 4602)

    # Fill a 4601 x 57 matrix one spreadsheet column at a time.
    features = np.mat(np.empty((4601, 57)))
    for col in range(57):
        features[:, col] = np.mat(sheet.col_values(col + 1, 1, 4602)).T

    standardized = stats.zscore(features, ddof=1)
    df = pd.DataFrame(data=standardized[:, :n_attrs], columns=header[:n_attrs])
    df['Name'] = class_labels

    plt.figure()
    parallel_coordinates(df, 'Name', color=['blue', 'red'])

    # Shorten the tick labels by dropping the common prefix and any colon.
    tick_labels = []
    for name in header[:n_attrs]:
        tick_labels.append(name.replace("word_freq_", "").replace(':', ''))
    plt.xticks(list(range(n_attrs)), tick_labels, rotation='vertical')

    save_options = dict(dpi=1000, bbox_inches='tight')
    plt.savefig('C://Users//Bahram//Desktop//E2015//pic//att20.eps', format='eps', **save_options)
    plt.savefig('C://Users//Bahram//Desktop//E2015//pic//att20.png', format='png', **save_options)
    plt.show()
コード例 #9
0
ファイル: ford.py プロジェクト: mkurtz19/fordproj
def show_data():
    """Normalise ``fordTrain.csv``, save it, and plot the first 4000 rows.

    Columns 3-32 are shifted and scaled per column, columns 0-2 are copied
    unchanged, and the result is written to ``normalized_data.csv``.  The
    first 4000 rows of columns 2-32 are then read back and drawn as parallel
    coordinates classed by ``'IsAlert'``.
    """
    raw = numpy.genfromtxt('fordTrain.csv', skip_header=1, delimiter=',', max_rows=400000)

    labels = raw[:, 2]
    feats = raw[:, 3:33]

    # Shift each column to start at zero, centre it on half of its range,
    # then scale so that every non-constant column peaks at 2.
    feats = feats - feats.min(0)
    feats = feats - (feats.max(0) - feats.min(0)) / 2
    peak = feats.max(0)
    peak[peak == 0] = 1  # avoid dividing by zero
    feats = 2 * feats / peak

    out = numpy.zeros(raw.shape)
    out[:, [0, 1]] = raw[:, [0, 1]]
    out[:, 2] = labels
    out[:, 3:33] = feats

    head = 'TrialID,ObsNum,IsAlert,P1,P2,P3,P4,P5,P6,P7,P8,E1,E2,E3,E4,E5,E6,E7,E8,E9,E10,E11,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11'
    numpy.savetxt('normalized_data.csv', out, delimiter=',', header=head, fmt='%1.3f')

    data = pandas.read_csv('normalized_data.csv',
                           header=0,
                           usecols=numpy.r_[2:33],
                           sep=',',
                           nrows=4000)

    parallel_coordinates(data, 'IsAlert', color=['r', 'b'])
    plt.show()
コード例 #10
0
ファイル: task2.py プロジェクト: tatusl/DKDExerciseWorks
def drawParallelCoordinatesWithScaledValues(data, columnNames):
    """Show a parallel-coordinates plot of scaled attributes for a random sample.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns 0-10 are scaled with ``preprocessing.scale``; column 11 is
        left unscaled and used as the class.
    columnNames : sequence
        Names for the twelve resulting columns.  The class column must be
        named ``'quality'``.

    At most 1500 randomly chosen rows are plotted.
    """
    # Scale everything except the class attribute.  Keep the original index:
    # with a fresh 0..n-1 index the concat below would misalign rows whenever
    # ``data`` is not default-indexed.
    features = data.iloc[:, 0:11]
    scaled = pandas.DataFrame(preprocessing.scale(features), index=data.index)

    # Class attribute as a one-column frame.
    quality = data.iloc[:, [11]]

    allAttributes = pandas.concat([scaled, quality], axis=1)
    allAttributes.columns = columnNames

    # Sample row positions; cap the sample size because random.sample raises
    # ValueError when asked for more rows than exist.
    num = min(1500, len(allAttributes))
    chosen = random.sample(range(len(allAttributes)), num)
    allAttributes = allAttributes.iloc[chosen]

    plt.figure()
    parallel_coordinates(allAttributes, 'quality')
    plt.title('Parallel coordinates, scaled values ' + 'n: ' + str(len(allAttributes)))
    plt.ylim(-2, 4)
    plt.show()
コード例 #11
0
ファイル: histograms.py プロジェクト: CrBillman/andrew
def multidimensional_plots(df, target_name, maxevents=10000):
    """Return a figure with three multivariate views of ``df``.

    The frame is z-scored column-wise (the ``target_name`` column is restored
    to its original values), shuffled, and truncated to ``maxevents`` rows.
    Andrews curves, parallel coordinates and RadViz are drawn into the first
    three cells of a 2x2 grid.
    """
    standardized = (df - df.mean()) / df.std()
    standardized[target_name] = df[target_name]  # the class must stay as-is

    shuffled = standardized.reindex(np.random.permutation(standardized.index))
    # Cap the row count so plotting stays fast.
    if shuffled.shape[0] > maxevents:
        shuffled = shuffled[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # (title, plotting function, extra keyword arguments) per grid cell
    panels = (
        ('Andrews Curves', andrews_curves, {}),
        ('Parallel Coordinates', parallel_coordinates, {'colormap': 'gist_rainbow'}),
        ('Radviz Spring Tension', radviz, {'colormap': 'jet'}),
    )
    for position, (title, draw, options) in enumerate(panels, start=1):
        axis = fig.add_subplot(2, 2, position)
        axis.set_title(title)
        draw(shuffled, target_name, ax=axis, **options)

    return fig
コード例 #12
0
ファイル: dbsPlotCanvas.py プロジェクト: behollis/DBSViewer
 def plot(self):
     """Draw a parallel-coordinates plot of the scores of every loaded dataset.

     Each entry of ``self.mw._vdata`` is turned into a frame tagged with its
     key in an ``'Outcome'`` column; the frames are outer-merged and plotted
     on a new subplot of ``self.fig``.  Any failure is reported through a
     warning dialog instead of being raised.
     """
     self.prepare()
     try:
         data_frames = list()

         # each dataset has a single patient outcome
         for dset_key, dset in self.mw._vdata.items():
             curr_df = pd.DataFrame.from_dict(dset.pat_data_pandas_scores)
             curr_df['Outcome'] = dset_key
             data_frames.append(curr_df)

         # With nothing loaded this raises IndexError, which is reported below.
         tot_df = data_frames[0]
         for frame in data_frames[1:]:
             # http://pandas.pydata.org/pandas-docs/version/0.16.2/generated/pandas.merge.html#pandas.merge
             tot_df = pd.merge(tot_df, frame, how='outer')

         ax = self.fig.add_subplot(111)
         ax.set_xlabel('patient')
         ax.set_ylabel('clinical score')

         parallel_coordinates(tot_df, 'Outcome', ax = ax, linewidth = 3)
         # ``Axes.hold`` no longer exists in matplotlib 3; calling it blindly
         # would send every successful plot into the warning path.
         if hasattr(ax, 'hold'):
             ax.hold(False)

         self.canvas.draw()
     except Exception:
         # Catch Exception rather than everything, so KeyboardInterrupt and
         # SystemExit still propagate.
         QtGui.QMessageBox.warning( self, 'Warning', \
                                   'Parallel coordinates plot failed. Check if there is dataset loaded.' )
コード例 #13
0
    def test_parallel_coordinates(self):
        """Exercise ``parallel_coordinates`` with its colour and axis options.

        Uses ``self.iris`` (class column ``'Name'``) and checks line colours
        for explicit hex colours, named colours and a colormap, the effect of
        ``axvlines=False`` on the number of lines, legend handle colours, and
        that the ``data=`` and ``colors=`` keywords emit a FutureWarning.
        """
        from pandas.tools.plotting import parallel_coordinates
        from matplotlib import cm

        df = self.iris

        # Baseline plot: remember the line and tick counts for the
        # ``axvlines=False`` comparison further down.
        ax = _check_plot_works(parallel_coordinates,
                               frame=df,
                               class_column='Name')
        nlines = len(ax.get_lines())
        nxticks = len(ax.xaxis.get_ticklabels())

        # Colours given as hex strings.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(parallel_coordinates,
                               frame=df,
                               class_column='Name',
                               color=rgba)
        self._check_colors(ax.get_lines()[:10],
                           linecolors=rgba,
                           mapping=df['Name'][:10])

        # Colours given by name.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(parallel_coordinates,
                               frame=df,
                               class_column='Name',
                               color=cnames)
        self._check_colors(ax.get_lines()[:10],
                           linecolors=cnames,
                           mapping=df['Name'][:10])

        # Colours taken from a colormap, sampled evenly once per class.
        ax = _check_plot_works(parallel_coordinates,
                               frame=df,
                               class_column='Name',
                               colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(ax.get_lines()[:10],
                           linecolors=cmaps,
                           mapping=df['Name'][:10])

        # Without the vertical axis lines there is one line fewer per x tick.
        ax = _check_plot_works(parallel_coordinates,
                               frame=df,
                               class_column='Name',
                               axvlines=False)
        assert len(ax.get_lines()) == (nlines - nxticks)

        # Legend handles must carry the colour assigned to each class.
        colors = ['b', 'g', 'r']
        df = DataFrame({
            "A": [1, 2, 3],
            "B": [1, 2, 3],
            "C": [1, 2, 3],
            "Name": colors
        })
        ax = parallel_coordinates(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, linecolors=colors)

        # The ``data=`` and ``colors=`` spellings are expected to warn.
        with tm.assert_produces_warning(FutureWarning):
            parallel_coordinates(data=df, class_column='Name')
        with tm.assert_produces_warning(FutureWarning):
            parallel_coordinates(df, 'Name', colors=colors)
コード例 #14
0
ファイル: histograms.py プロジェクト: CrBillman/andrew
def multidimensional_plots(df, target_name, maxevents=10000):
    """Build a 2x2 figure holding three multivariate plots of ``df``.

    Every column is standardised to zero mean and unit standard deviation,
    except ``target_name`` which keeps its original values.  The rows are then
    shuffled and at most ``maxevents`` of them are plotted as Andrews curves,
    parallel coordinates and RadViz.  The figure is returned, not shown.
    """
    zscores = (df - df.mean())/df.std()
    zscores[target_name] = df[target_name]

    order = np.random.permutation(zscores.index)
    sample = zscores.reindex(order)
    if sample.shape[0] > maxevents:
        # keep the plotting time bounded
        sample = sample[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    def _titled_axis(slot, title):
        # Create the subplot in grid cell ``slot`` and label it.
        axis = fig.add_subplot(2, 2, slot)
        axis.set_title(title)
        return axis

    andrews_curves(sample, target_name,
                   ax=_titled_axis(1, 'Andrews Curves'))
    parallel_coordinates(sample, target_name,
                         ax=_titled_axis(2, 'Parallel Coordinates'),
                         colormap='gist_rainbow')
    radviz(sample, target_name,
           ax=_titled_axis(3, 'Radviz Spring Tension'),
           colormap='jet')

    return fig
コード例 #15
0
def plot_viz():
    """Draw and save a parallel-coordinates plot of the spam data set.

    The first sheet of ``spam_data.xls`` supplies the class label (column 0)
    and 57 attribute columns (columns 1-57) for rows 1-4601.  Attributes are
    z-scored with ``ddof=1`` before plotting; the figure is written as
    ``att20.eps`` and ``att20.png`` and then shown.
    """
    Attr_nr = 57
    sheet = xlrd.open_workbook('spam_data.xls').sheet_by_index(0)

    raw_header = sheet.row_values(0, 1, 59)
    classLabel = sheet.col_values(0, 1, 4602)

    # One spreadsheet column per matrix column.
    X = np.mat(np.empty((4601, 57)))
    for i in range(57):
        column_values = sheet.col_values(i + 1, 1, 4602)
        X[:, i] = np.mat(column_values).T

    header = raw_header[:-1]  # drop the last header cell
    x_std = stats.zscore(X, ddof=1)

    df = pd.DataFrame(data=x_std[:, :Attr_nr], columns=header[:Attr_nr])
    df['Name'] = classLabel

    plt.figure()
    parallel_coordinates(df, 'Name', color=['blue', 'red'])

    # Tick labels: attribute names without the shared prefix or colons.
    def _short(name):
        return name.replace("word_freq_", "", -1).replace(':', '', -1)

    plt.xticks(list(range(Attr_nr)),
               [_short(h) for h in header[:Attr_nr]],
               rotation='vertical')

    for extension in ('eps', 'png'):
        plt.savefig('C://Users//Bahram//Desktop//E2015//pic//att20.' + extension,
                    format=extension,
                    dpi=1000,
                    bbox_inches='tight')
    plt.show()
コード例 #16
0
def multidimensional_plots(df,
                           target_name,
                           maxevents=10000,
                           standardize=False):
    """Return a figure with Andrews curves, parallel coordinates and RadViz.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain the ``target_name`` column.
    target_name : str
        Column used as the class in all three plots.
    maxevents : int
        Maximum number of (randomly ordered) rows that are plotted.
    standardize : bool
        When true, every column except ``target_name`` is z-scored first.
        The caller's frame is not modified.
    """
    # ``standardize`` used to be accepted but ignored; honour it now.
    if standardize:
        target = df[target_name]
        df = (df - df.mean()) / df.std()
        df[target_name] = target  # the class column keeps its raw values

    # randomize the data frame order and cap the number of rows
    df_random = df.reindex(np.random.permutation(df.index))[:maxevents]

    # Make a figure and declare the size
    fig = plt.figure(figsize=(9, 9))

    current_axis = fig.add_subplot(2, 2, 1)
    current_axis.set_title('Andrews Curves')
    andrews_curves(df_random, target_name, ax=current_axis)

    current_axis = fig.add_subplot(2, 2, 2)
    current_axis.set_title('Parallel Coordinates')
    parallel_coordinates(df_random,
                         target_name,
                         ax=current_axis,
                         colormap='gist_rainbow')

    current_axis = fig.add_subplot(2, 2, 3)
    current_axis.set_title('Radviz Spring Tension')
    radviz(df_random, target_name, ax=current_axis, colormap='jet')

    return fig
コード例 #17
0
def plot_dmr(covs, cluster_df, covariate, chrom, res, png, weights_df=None):
    """Draw a parallel-coordinates plot of a cluster, grouped by a covariate.

    Parameters
    ----------
    covs : object
        Provides the grouping values as attribute ``covariate``.
    cluster_df : pandas.DataFrame
        Transposed before plotting, so its rows become the plot's axes.
    covariate : str
        Name of the attribute of ``covs`` used to group the lines.
    chrom : str
        Prefix for the axis labels (``"<chrom>:<position>"``).
    res, png
        Accepted but not used by this function.
    weights_df : pandas.DataFrame, optional
        When given, a scatter point sized by the (rescaled) weight is drawn
        on every line at every axis.

    The plot is drawn on the current axes; nothing is saved or returned.
    """
    from matplotlib import pyplot as plt
    # The plotting helpers moved to ``pandas.plotting``; keep the old
    # location as a fallback for older pandas.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates
    colors = ('#e41a1c', '#377eb8', '#4daf4a')

    cdf = cluster_df.T
    try:
        # Thousands separators only work for numeric positions ...
        cdf.columns = [
            '%s:%s' % (chrom, "{:,}".format(p)) for p in cdf.columns
        ]
    except ValueError:
        # ... otherwise keep the labels as they are.
        cdf.columns = list(cdf.columns)

    # Value range before the group column is added (used for the y limits).
    mmax = cdf.max().max()
    mmin = cdf.min().min()
    cdf['group'] = getattr(covs, covariate)

    ax = plt.gca()

    if cdf.group.dtype == float:
        ax = parallel_coordinates(cdf, 'group', ax=ax)
        ax.get_legend().set_visible(False)
    else:
        # ``color`` is the supported keyword; ``colors`` is deprecated.
        ax = parallel_coordinates(cdf, 'group', color=colors, ax=ax)
        for lbl in ax.get_legend().get_texts():
            lbl.set_text(covariate + ' ' + lbl.get_text())

    if weights_df is not None:
        W = weights_df.T
        W.columns = cdf.columns[:-1]
        # Rescale so the largest marker size is about 300.
        while W.max().max() > 300:
            W = W.copy() / (W.max().max() / 300)

        for icol, cname in enumerate(W.columns):
            for j, g in enumerate(set(cdf['group'])):
                in_group = cdf['group'] == g
                # ``.ix`` is gone from pandas: select the rows by boolean
                # mask, then the column by position.
                ax.scatter([icol] * int(in_group.sum()),
                           cdf.loc[in_group].iloc[:, icol],
                           edgecolors=colors[j],
                           facecolors=colors[j],
                           alpha=0.5,
                           s=W.loc[in_group].iloc[:, icol])
        vals = ax.get_xlim()
        ax.set_xlim(vals[0] - 0.05, vals[1] + 0.05)

    if len(cdf.columns) > 6:
        if len(cdf.columns) > 20:
            # Too many axes: label only every second one.
            lbls = ax.get_xticklabels()[::2]
            ax.set_xticks(ax.get_xticks()[::2])
            ax.set_xticklabels([x.get_text() for x in lbls], rotation=10)

        else:
            ax.set_xticklabels([x.get_text() for x in ax.get_xticklabels()],
                               rotation=10)

    ax.set_ylabel('methylation')
    # Data confined to [0, 1]: do not let the axis extend beyond that range.
    if 0 <= mmin <= mmax <= 1:
        vals = ax.get_ylim()
        ax.set_ylim(max(0, vals[0]), min(1, vals[1]))
コード例 #18
0
def parallel_plot(data):
    """Draw ``data`` as parallel coordinates classed by its ``'prediction'`` column.

    A new 15x8 inch figure with the y axis fixed to [-2.5, 2.5] is created.
    Colours cycle through blue, red, green, yellow and black; one colour is
    prepared per row of ``data``.
    """
    from itertools import cycle, islice
    # ``pandas.tools.plotting`` was replaced by ``pandas.plotting``; fall
    # back to the old location for older pandas.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates
    import matplotlib.pyplot as plt

    my_colors = list(islice(cycle(['b', 'r', 'g', 'y', 'k']), None, len(data)))
    plt.figure(figsize=(15, 8)).gca().axes.set_ylim([-2.5, +2.5])
    parallel_coordinates(data, 'prediction', color=my_colors, marker='o')
コード例 #19
0
ファイル: visualise.py プロジェクト: dschuan/data-mining-proj
def parallelVisualise(data, measure, colors, measurename):
    """Min-max scale every column of ``data`` and plot it grouped by ``measure``.

    ``data`` is modified in place: each existing column is replaced by its
    ``MinMaxScaler`` transform and ``measure`` is stored under
    ``measurename``, which then serves as the class column.  The resulting
    frame is printed before the plot is shown.
    """
    scaler = MinMaxScaler()
    for column in data.columns.tolist():
        # the scaler needs a 2-D input, hence the single-column reshape
        as_column = data[column].values.reshape(-1, 1)
        data[column] = scaler.fit_transform(as_column)

    data[measurename] = measure
    print(data)

    parallel_coordinates(data, measurename, color=colors)
    plt.show()
コード例 #20
0
ファイル: plotting.py プロジェクト: brentp/clustermodel
def plot_dmr(covs, cluster_df, covariate, chrom, res, png, weights_df=None):
    """Draw a parallel-coordinates plot of a cluster, grouped by a covariate.

    Parameters
    ----------
    covs : object
        Provides the grouping values as attribute ``covariate``.
    cluster_df : pandas.DataFrame
        Transposed before plotting, so its rows become the plot's axes.
    covariate : str
        Name of the attribute of ``covs`` used to group the lines.
    chrom : str
        Prefix for the axis labels (``"<chrom>:<position>"``).
    res, png
        Accepted but not used by this function.
    weights_df : pandas.DataFrame, optional
        When given, a scatter point sized by the (rescaled) weight is drawn
        on every line at every axis.

    The plot is drawn on the current axes; nothing is saved or returned.
    """
    from matplotlib import pyplot as plt
    # The plotting helpers moved to ``pandas.plotting``; keep the old
    # location as a fallback for older pandas.
    try:
        from pandas.plotting import parallel_coordinates
    except ImportError:
        from pandas.tools.plotting import parallel_coordinates
    colors = ('#e41a1c', '#377eb8', '#4daf4a')

    cdf = cluster_df.T
    try:
        # Thousands separators only work for numeric positions ...
        cdf.columns = ['%s:%s' % (chrom, "{:,}".format(p)) for p in cdf.columns]
    except ValueError:
        # ... otherwise keep the labels as they are.
        cdf.columns = list(cdf.columns)

    # Value range before the group column is added (used for the y limits).
    mmax = cdf.max().max()
    mmin = cdf.min().min()
    cdf['group'] = getattr(covs, covariate)

    ax = plt.gca()

    if cdf.group.dtype == float:
        ax = parallel_coordinates(cdf, 'group', ax=ax)
        ax.get_legend().set_visible(False)
    else:
        # ``color`` is the supported keyword; ``colors`` is deprecated.
        ax = parallel_coordinates(cdf, 'group', color=colors, ax=ax)
        for lbl in ax.get_legend().get_texts():
            lbl.set_text(covariate + ' ' + lbl.get_text())

    if weights_df is not None:
        W = weights_df.T
        W.columns = cdf.columns[:-1]
        # Rescale so the largest marker size is about 300.
        while W.max().max() > 300:
            W = W.copy() / (W.max().max() / 300)

        for icol, cname in enumerate(W.columns):
            for j, g in enumerate(set(cdf['group'])):
                in_group = cdf['group'] == g
                # ``.ix`` is gone from pandas: select the rows by boolean
                # mask, then the column by position.
                ax.scatter([icol] * int(in_group.sum()),
                           cdf.loc[in_group].iloc[:, icol],
                           edgecolors=colors[j],
                           facecolors=colors[j],
                           alpha=0.5,
                           s=W.loc[in_group].iloc[:, icol])
        vals = ax.get_xlim()
        ax.set_xlim(vals[0] - 0.05, vals[1] + 0.05)

    if len(cdf.columns) > 6:
        if len(cdf.columns) > 20:
            # Too many axes: label only every second one.
            lbls = ax.get_xticklabels()[::2]
            ax.set_xticks(ax.get_xticks()[::2])
            ax.set_xticklabels([x.get_text() for x in lbls], rotation=10)

        else:
            ax.set_xticklabels([x.get_text() for x in ax.get_xticklabels()],
                               rotation=10)

    ax.set_ylabel('methylation')
    # Data confined to [0, 1]: do not let the axis extend beyond that range.
    if 0 <= mmin <= mmax <= 1:
        vals = ax.get_ylim()
        ax.set_ylim(max(0, vals[0]), min(1, vals[1]))
コード例 #21
0
def parallel_plot(data, P, ylim=[-3, +3], figsize=(15, 8), colors=None):
    """Draw ``data`` as parallel coordinates classed by ``'prediction'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to plot; must contain a ``'prediction'`` column.
    P : sized
        Only its length is used: the number of default colours generated.
    ylim : sequence
        Limits applied to the y axis of the new figure.
    figsize : tuple
        Size of the new figure.
    colors : sequence, optional
        Line colours; by default b, r, g, y, c, k, m are cycled.
    """
    if colors is None:
        palette = list(islice(cycle('brgyckm'), len(P)))
    else:
        palette = colors

    figure = plt.figure(figsize=figsize)
    figure.gca().axes.set_ylim(ylim)
    parallel_coordinates(data, 'prediction', color=palette, marker='o')
コード例 #22
0
def plot_parallel_coordinates(data_frame):
    """Save a parallel-coordinates plot of ``data_frame`` classed by ``'quality'``.

    Six colours from seaborn's ``'pastel'`` palette are used.  The figure is
    resized to 15x15 inches, written to ``build/parallel_coordinates.png`` at
    300 dpi and then cleared.
    """
    palette = seaborn.color_palette('pastel', n_colors=6).as_hex()
    # pandas deprecated the ``colors`` spelling in favour of ``color``.
    plotting.parallel_coordinates(data_frame, 'quality', color=palette)

    figure = pyplot.gcf()
    figure.set_size_inches(15, 15)
    pyplot.savefig('build/parallel_coordinates.png', dpi=300)
    pyplot.clf()
コード例 #23
0
ファイル: review.py プロジェクト: hanhha/kg_rose
def visualize_data(X, Y, name):
    """Plot the features in ``X`` as parallel coordinates coloured by survival.

    ``Y['Survived']`` is mapped to the labels ``'Survived'`` (value 1) and
    ``'Dead'`` (anything else) and attached to a copy of ``X`` as the class
    column.  The figure is titled ``name``, saved as ``<name>.png`` and shown
    without blocking.
    """
    def _outcome(flag):
        if flag == 1:
            return 'Survived'
        return 'Dead'

    outcome = Y['Survived'].apply(_outcome)
    labelled = X.assign(Survived=outcome)

    plt.figure()
    parallel_coordinates(labelled, 'Survived', color=["red", "green"])
    plt.suptitle(name)
    plt.savefig(name + ".png")
    plt.show(block=False)
コード例 #24
0
def parallel_plot():
    """Show a parallel-coordinates plot of five normalised features by class.

    Reads the module-level ``X_norm`` and ``y``; the combined frame must
    contain a ``'Class'`` column (presumably supplied by ``y`` -- confirm
    against where ``y`` is built).
    """
    # The original also built a 13-name feature list and an un-normalised
    # frame that were never used; both are dropped.
    selected = ['MalicAcid', 'Ash', 'OD280/OD315', 'Magnesium', 'TotalPhenols']
    data_norm = pd.concat([X_norm[selected], y], axis=1)

    parallel_coordinates(data_norm, 'Class')
    plt.show()
def build_parallel_coordinate_dashboard(train_pre):
    """Plot four features of ``train_pre`` as parallel coordinates by hazard band.

    Works on a copy of ``train_pre``.  ``'Hazard'`` is binned into a
    ``'range'`` column, rows with ``Hazard >= 11`` are dropped, and the
    remaining rows are plotted with ``'range'`` as the class.
    """
    frame = train_pre.copy()

    # NOTE(review): the bin edges are 0/3/5/10/70 while the labels read
    # "(3-6]" and "(6-10]" -- confirm which of the two is intended.
    frame['range'] = pd.cut(frame['Hazard'],
                            bins=[0,3,5,10,70],
                            labels=["(1-3]","(3-6]","(6-10]","(10-70]"])
    frame = frame[frame['Hazard'] < 11]

    plot_columns = ['range', 'T2_V1', 'T1_V2', 'T2_V2', 'T1_V1']
    parallel_coordinates(frame[plot_columns],'range')
コード例 #26
0
def ParrarelCorrelationPlotChart():
    """Show a parallel-coordinates plot of the data returned by ``LoadDataset()``.

    The first four feature columns are used and the target values become the
    ``'class'`` column that colours the lines.
    """
    iris = LoadDataset()

    feature_names = ['sepal length', 'sepal width', 'petal  length', 'petal width']
    X = pd.DataFrame(iris.data[:, :4], columns=feature_names)
    X = X.assign(**{'class': iris.target})

    # parallel-coordinates plot
    parallel_coordinates(X, 'class')
    plt.show()
def build_parallel_coordinate_dashboard(train_pre):
    """Draw a parallel-coordinates plot of four features grouped by hazard band.

    A copy of ``train_pre`` gets a ``'range'`` column by binning ``'Hazard'``;
    only rows with ``Hazard < 11`` are plotted, with ``'range'`` as the class.
    """
    features = ['range', 'T2_V1', 'T1_V2', 'T2_V2', 'T1_V1']
    hazard_bins = [0, 3, 5, 10, 70]
    # NOTE(review): the labels mention 6 where the bin edge is 5 -- confirm.
    band_labels = ["(1-3]", "(3-6]", "(6-10]", "(10-70]"]

    df = train_pre.copy()
    df['range'] = pd.cut(df['Hazard'], bins=hazard_bins, labels=band_labels)

    low_hazard = df['Hazard'] < 11
    parallel_coordinates(df.loc[low_hazard, features], 'range')
コード例 #28
0
def pc(df, ref):
    """Show ``df`` as parallel coordinates, normalised against the range of ``ref``.

    Every column of ``df`` is rescaled with the per-column minimum and maximum
    of ``ref``.  Axis labels carry the reference minimum in parentheses and
    the reference maximum is written above each axis.  One line is drawn per
    row (the index is used as the class) and the legend is removed.

    Relies on the module-level names ``columns``, ``unit`` and ``title``.
    """
    df_norm = (df - ref.min()) / (ref.max() - ref.min())

    # Start from the default labels and overwrite one entry per known column.
    new_columns = ['IVTT', 'WT', 'TP', 'UP', 'FS', 'RL', 'DO']
    for position, name in enumerate(columns):
        lowest = str(round(ref[name].min(), 2))
        new_columns[position] = name + ' (' + lowest + ')'
    df_norm.columns = new_columns

    df_norm['ID'] = df_norm.index
    axes = parallel_coordinates(df_norm, 'ID', colormap='prism')
    axes.legend_.remove()

    # Reference maxima, spaced ``unit`` apart in figure coordinates.
    for position, name in enumerate(columns, start=1):
        highest = str(round(ref[name].max(), 2))
        plt.figtext(position * unit, 0.92, '(' + highest + ')', ha='center')

    plt.suptitle(title)
    plt.show()
コード例 #29
0
ファイル: classnotes.py プロジェクト: jonolsu/classwork
def module3():
    """
    Notes on module 3: a tour of basic plot types.

    Shows a histogram, a 2-D and a 3-D scatter plot of the wheat data set,
    parallel coordinates and Andrews curves of the iris data set, and finally
    draws the correlation matrix of random data as an image.
    """

    # The Seven Basic Tools of Quality: https://en.wikipedia.org/wiki/Seven_Basic_Tools_of_Quality

    # Histogram
    path = "C:/Users/jbennett02/Documents/Magic Briefcase/classwork/edx/Microsoft/DAT210x.b/module3/Datasets/"
    wheat = pd.read_csv(path + "wheat.data")
    matplotlib.style.use('ggplot')  # nicer default styling
    wheat.asymmetry.plot.hist(title='Asymmetry', bins=10)
    plt.show()

    # 2-D scatter plot
    wheat.plot.scatter(x='area', y='perimeter')
    plt.show()

    # 3-D scatter plot
    ax = plt.figure().add_subplot(111, projection='3d')
    ax.set_xlabel('area')
    ax.set_ylabel('perimeter')
    ax.set_zlabel('asymmetry')
    ax.scatter(wheat.area, wheat.perimeter, wheat.asymmetry, c='r', marker='.')
    plt.show()

    # Higher-dimensional views of iris: parallel coordinates first, then
    # Andrews curves, each on its own figure.
    bunch = load_iris()
    iris = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    iris['target_names'] = [bunch.target_names[i] for i in bunch.target]
    for draw in (parallel_coordinates, andrews_curves):
        plt.figure()
        draw(iris, 'target_names')
        plt.show()

    # Correlation plot of five random columns
    noise = pd.DataFrame(np.random.randn(1000, 5), columns=list('abcde'))
    correlations = noise.corr()
    print(correlations)
    plt.imshow(correlations, cmap=plt.cm.Blues, interpolation='nearest')
    plt.colorbar()
    tick_marks = list(range(len(noise.columns)))
    plt.xticks(tick_marks, noise.columns, rotation='vertical')
    plt.yticks(tick_marks, noise.columns)
コード例 #30
0
    def test_parallel_coordinates(self):
        """Exercise ``parallel_coordinates`` with its colour and axis options.

        Uses ``self.iris`` (class column ``'Name'``) and checks line colours
        for explicit hex colours, named colours and a colormap, the effect of
        ``axvlines=False`` on the number of lines, legend handle colours, and
        that the ``data=`` and ``colors=`` keywords emit a FutureWarning.
        """
        from pandas.tools.plotting import parallel_coordinates
        from matplotlib import cm

        df = self.iris

        # Baseline plot: remember the line and tick counts for the
        # ``axvlines=False`` comparison further down.
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name')
        nlines = len(ax.get_lines())
        nxticks = len(ax.xaxis.get_ticklabels())

        # Colours given as hex strings.
        rgba = ('#556270', '#4ECDC4', '#C7F464')
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', color=rgba)
        self._check_colors(
            ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10])

        # Colours given by name.
        cnames = ['dodgerblue', 'aquamarine', 'seagreen']
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', color=cnames)
        self._check_colors(
            ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10])

        # Colours taken from a colormap, sampled evenly once per class.
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', colormap=cm.jet)
        cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique()))
        self._check_colors(
            ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10])

        # Without the vertical axis lines there is one line fewer per x tick.
        ax = _check_plot_works(parallel_coordinates,
                               frame=df, class_column='Name', axvlines=False)
        assert len(ax.get_lines()) == (nlines - nxticks)

        # Legend handles must carry the colour assigned to each class.
        colors = ['b', 'g', 'r']
        df = DataFrame({"A": [1, 2, 3],
                        "B": [1, 2, 3],
                        "C": [1, 2, 3],
                        "Name": colors})
        ax = parallel_coordinates(df, 'Name', color=colors)
        handles, labels = ax.get_legend_handles_labels()
        self._check_colors(handles, linecolors=colors)

        # The ``data=`` and ``colors=`` spellings are expected to warn.
        with tm.assert_produces_warning(FutureWarning):
            parallel_coordinates(data=df, class_column='Name')
        with tm.assert_produces_warning(FutureWarning):
            parallel_coordinates(df, 'Name', colors=colors)
コード例 #31
0
def plot_parallel(topic_group, topic_group_name):
    """Plot a topic's relative frequency per epoch, one line per country.

    The rows come from ``compare_cluster_epochs_nationality(topic_group)`` and
    are expected to hold a country followed by one value per epoch.  The
    figure is shown and then returned.
    """
    epochs = ['1789-1847', '1848-1874', '1875-1914']
    df_plot = pd.DataFrame(compare_cluster_epochs_nationality(topic_group),
                           columns=['country'] + epochs)

    fig = plt.figure(figsize=(11, 4))
    plt.ylabel('relative frequency')
    plt.xlabel('time range')
    heading = 'Topic: {0} {1}: Frequency over Time and by Nationalities'.format(
        topic_group_name, topic_group)
    plt.title(heading)
    parallel_coordinates(df_plot, 'country', colormap='jet', linewidth=5)

    plt.show()
    return fig
コード例 #32
0
    def Clonal_Evolution_Multidimensional_Data(self):
        """Save three multivariate plots of the frames in ``DataStructs``.

        Every frame gets a ``'t'`` column holding its position in
        ``DataStructs`` (0.0, 1.0, ...) and the frames are concatenated.  The
        ``t``, ``Size``, ``PR`` and ``MR`` columns are each divided by their
        maximum and plotted, classed by ``'ID'``, as parallel coordinates,
        Andrews curves and RadViz.  Each plot is written to an EPS file.

        Note that the frames in ``DataStructs`` are modified in place (the
        ``'t'`` column is added to them).
        """
        Clonal_Evolution_df = pd.DataFrame()
        for step, frame in enumerate(DataStructs):
            stamp = float(step)
            frame['t'] = pd.Series([stamp] * len(frame), index=frame.index)
            if stamp == 0:
                # the first frame seeds the combined table
                Clonal_Evolution_df = frame
            else:
                Clonal_Evolution_df = pd.concat([Clonal_Evolution_df, frame],
                                                ignore_index=True)

        ids = Clonal_Evolution_df['ID']
        sizes = Clonal_Evolution_df['Size']
        mr = Clonal_Evolution_df['MR']
        pr = Clonal_Evolution_df['PR']
        times = Clonal_Evolution_df['t']

        # One row per record, each numeric column scaled by its maximum.
        scaled_rows = zip(times / max(times), sizes / max(sizes),
                          pr / max(pr), mr / max(mr), ids)
        Normalised_df = pd.DataFrame(scaled_rows,
                                     columns=['t', 'Size', 'PR', 'MR', 'ID'])

        legend_options = dict(loc='center left', bbox_to_anchor=(1.0, 0.5))

        plt.figure()
        axes = parallel_coordinates(Normalised_df, 'ID', colormap='jet')
        axes.set_title("PC Plot")
        plt.legend(**legend_options)
        plt.savefig('Clonal_Evolution_Parallel_Coords_Plot.eps',
                    format='eps',
                    dpi=1000)

        remaining = (
            (andrews_curves, 'Clonal_Evolution_Andrews_Curves_Plot.eps'),
            (radviz, 'Clonal_Evolution_RadViz_Plot.eps'),
        )
        for draw, filename in remaining:
            plt.figure()
            draw(Normalised_df, 'ID', colormap='jet')
            plt.legend(**legend_options)
            plt.savefig(filename, format='eps', dpi=1000)
コード例 #33
0
ファイル: kmeans_np.py プロジェクト: lulugyf/pycode
def t4(tp='r'):
    """Show a multivariate plot of the stock CSV, grouped by its ``Name`` column.

    ``tp`` selects the chart: ``'r'`` for RadViz, ``'a'`` for Andrews curves
    and ``'p'`` for parallel coordinates.  Any other value leaves the figure
    empty.
    """
    # Visualisation of multi-dimensional data (needs: conda install pandas)
    # http://cloga.info/%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/2016/10/12/multivariate-data-visualization
    import pandas as pd
    import matplotlib.pyplot as plt
    data = pd.read_csv('file:///e:/stock/xx1')
    try:
        # Current location of the plotting helpers (pandas >= 0.20).
        from pandas.plotting import andrews_curves
        from pandas.plotting import parallel_coordinates
        from pandas.plotting import radviz
    except ImportError:
        # Older pandas releases only provide the legacy module path.
        from pandas.tools.plotting import andrews_curves
        from pandas.tools.plotting import parallel_coordinates
        from pandas.tools.plotting import radviz

    plt.figure()
    if tp == 'r':
        radviz(data, 'Name')
    elif tp == 'a':
        andrews_curves(data, 'Name')
    elif tp == 'p':
        parallel_coordinates(data, 'Name')
    plt.show()
コード例 #34
0
def coordenatesParallels(base, classe):
    """Show a parallel-coordinates plot of ``base`` grouped by column ``classe``.

    The plot is drawn on a new 10x8 inch figure and its legend is restyled
    (two columns, extra-small font) before the figure is shown.
    """
    plt.figure(figsize=(10, 8))
    axes = parallel_coordinates(base, classe)
    legend_options = {
        'loc': 'center left',
        'bbox_to_anchor': (0, 1),
        'fancybox': True,
        'ncol': 2,
        'fontsize': 'x-small',
    }
    axes.legend(**legend_options)
    plt.show()
コード例 #35
0
ファイル: util.py プロジェクト: igbe/BioMLTools
def get_parallel_cordinate(data_to_bo_plotted,label,my_data,class_column_title = "class"):
    """Show a parallel-coordinates plot of a data array and its labels.

    :param data_to_bo_plotted: the NumPy array of the data to be plotted
    :param label: the label array for ``data_to_bo_plotted``; its transpose is
                  appended as the last column (so presumably it is shaped
                  ``(1, n_rows)`` -- TODO confirm against callers)
    :param my_data: the initial data extracted from the .csv file prior to any
                    processing; its first row supplies the column names of the
                    pandas frame
    :param class_column_title: name of the column the lines are grouped by
    :return: None
    """
    # Labels become the right-most column of the plotted table.
    table = np.concatenate((data_to_bo_plotted, label.T), axis=1)

    row_names = [str(n) for n in range(1, len(data_to_bo_plotted) + 1)]
    header = my_data[0, 0:]  # 1st row as the column names
    df = pd.DataFrame(data=table, index=row_names, columns=header)

    parallel_coordinates(df, class_column_title)
    plt.show()
コード例 #36
0
def plot_top5_batsman(df):
    """Plot the five highest-scoring batsmen of each team.

    Runs are summed per striker and batting team, the five largest totals are
    kept for every team listed in ``df['team']``, and the result is shown as
    a parallel-coordinates plot grouped by team (column ``Country``).
    """
    # Top 5 Batsman of T20 cricket in each team
    # Work on an explicit copy: assigning into a column selection of ``df``
    # is chained assignment and triggers pandas' SettingWithCopyWarning.
    df_sub = df[['Striker', 'Run_Scored', 'Batting_Team']].copy()
    df_sub['Run_Scored'] = df_sub['Run_Scored'].astype(int)
    # The string name selects pandas' own sum; passing the builtin ``sum``
    # is deprecated in recent pandas releases.
    x = df_sub.pivot_table(index='Striker',
                           columns='Batting_Team',
                           aggfunc='sum')

    all_teams = df['team'].unique().tolist()

    top5players = {}
    for team in all_teams:
        team_runs = x['Run_Scored'][team].dropna()
        top5players[team] = dict(team_runs.sort_values(ascending=False)[:5])

    # Long format: one row per (player, team) pair with that player's total.
    df_plot = pd.DataFrame(top5players).stack().reset_index()
    df_plot.columns = ['Player Name', 'Country', 'Total Score']
    parallel_coordinates(df_plot, class_column='Country')
    plt.show()
コード例 #37
0
 def createParallelCoordinates(self,data,base_dir,fileName):
     """Write one parallel-coordinates plot per low-cardinality column to a PDF.

     Every column of ``data`` that has between 2 and 20 distinct values is
     used in turn as the class column: the numeric columns of ``data`` are
     plotted grouped by it, and the plot becomes one page of the PDF at
     ``base_dir + fileName`` (plain string concatenation, no separator added).
     """
     try:
         # Current location of the helper (pandas >= 0.20).
         from pandas.plotting import parallel_coordinates
     except ImportError:
         # Older pandas releases only provide the legacy module path.
         from pandas.tools.plotting import parallel_coordinates
     # The context manager closes the PDF even when a plot raises part-way.
     with PdfPages(''.join([base_dir,fileName])) as pdf:
         for cols in data.columns.values:
             n_classes = len(data[cols].value_counts())
             if not 1 < n_classes <= 20:
                 continue
             # Copy so that adding the class column never touches ``data``.
             req_data = data._get_numeric_data().copy()
             req_data[cols] = data[cols]
             plt.figure()
             ax = parallel_coordinates(req_data, cols)
             # NOTE(review): the title says "radviz" although this is a
             # parallel-coordinates plot; left unchanged, confirm intent.
             ax.set_title(''.join(["plot of radviz vis ", cols]))
             figure = ax.get_figure()
             pdf.savefig(figure)
             # Release the figure; otherwise one stays open per column.
             plt.close(figure)
コード例 #38
0
def visualize(config):
    """Save scatter-matrix, parallel-coordinates and RadViz images per dataset.

    Each entry of ``config['datasets']`` is read through its ``'name'`` and
    ``'df'`` keys.  Three PNG files are written per entry:
    ``<name>_scatter_matrix.png``, ``<name>_parallel_coordinates.png`` and
    ``<name>_radviz.png``; the last two are grouped by the ``'quality'``
    column.  These views are meant to help when designing a feature vector.

    Returns ``OK``.
    """
    def _save_current(entry, kind):
        # Write the active figure to "<name><kind>.png", then release it.
        plt.savefig(entry['name'] + kind + '.png')
        plt.close()

    grouped_plots = (
        (parallel_coordinates, '_parallel_coordinates'),
        (radviz, '_radviz'),
    )

    for entry in config['datasets']:
        scatter_matrix(entry['df'], alpha=0.2, figsize=(20, 20), diagonal='kde')
        _save_current(entry, '_scatter_matrix')

        for draw, kind in grouped_plots:
            plt.figure(figsize=(20,20))
            draw(entry['df'], 'quality')
            _save_current(entry, kind)

    return OK
コード例 #39
0
def multidimensional_plots(df, target_name, maxevents=10000, standardize=False):
    """Draw Andrews-curves, parallel-coordinates and RadViz panels for ``df``.

    The rows are put in random order and at most ``maxevents`` of them are
    plotted, grouped by column ``target_name``.  The panels occupy the first
    three cells of a 2x2 grid on a 9x9 inch figure.  ``standardize`` is
    accepted but not used by this function.

    Returns the figure.
    """
    # Shuffle the rows, then cap how many are drawn.
    shuffled_index = np.random.permutation(df.index)
    df_random = df.reindex(shuffled_index)[:maxevents]

    fig = plt.figure(figsize=(9, 9))

    # (panel title, plotting function, extra keyword arguments)
    panels = (
        ('Andrews Curves', andrews_curves, {}),
        ('Parallel Coordinates', parallel_coordinates,
         {'colormap': 'gist_rainbow'}),
        ('Radviz Spring Tension', radviz, {'colormap': 'jet'}),
    )
    for position, (title, draw, options) in enumerate(panels, start=1):
        panel_axis = fig.add_subplot(2, 2, position)
        panel_axis.set_title(title)
        draw(df_random, target_name, ax=panel_axis, **options)

    #fig.tight_layout()
    return fig
コード例 #40
0
def parallel_coordinate():
    """Plot how Age, Education, Profit, Loss and Hours/Week are related.

    Reads ``dataset.txt`` (comma-separated, no header row), keeps only those
    five columns and shows a parallel-coordinates plot of the first 50 rows,
    grouped by ``Education``.
    """
    column_names = ['Age', 'JobType', 'Other', 'Education', 'Education_Score',
                    'Relationship', 'Position', 'Status', 'Race', 'Sex',
                    'Profit', 'Loss', 'Hours/Week', 'Country', 'Income']
    # Everything except Age, Education, Profit, Loss and Hours/Week.
    unused_columns = ['JobType', 'Other', 'Education_Score', 'Relationship',
                      'Position', 'Status', 'Race', 'Sex', 'Country', 'Income']

    data = pandas.read_csv('dataset.txt', sep=',', header=None,
                           names=column_names)
    # One drop with ``axis`` as a keyword: the positional form
    # ``drop(label, 1)`` is rejected by pandas >= 2.0.
    data = data.drop(unused_columns, axis=1)

    parallel_coordinates(data[:50], 'Education')
    plt.show()
コード例 #41
0
def Evt_Multi_D_Parellel_Plot(self, event):
    """Redraw the selected tab with a parallel-coordinates plot.

    The figure of the currently selected ``New_Tab`` page is cleared, the
    columns named by ``self.selected_checkboxes`` plus ``customer_number``
    are taken from ``self.data`` for rows ``self.minimum`` to
    ``self.maximum``, and they are plotted grouped by ``customer_number``
    with all four axis spines hidden.  ``event`` is not used.
    """
    tab_index = self.New_Tab.GetSelection()
    panel = self.New_Tab.GetPage(tab_index)
    # Presumably refreshes self.selected_checkboxes -- confirm in its definition.
    self.selected_checkbox()
    panel.canvas.figure.clf()

    # The second item of every checkbox entry is used as the column name.
    columns = [variable[1] for variable in self.selected_checkboxes]
    columns.append("customer_number")
    data = self.data[columns][self.minimum: self.maximum]

    ax = parallel_coordinates(data, "customer_number")
    for side in ("left", "right", "top", "bottom"):
        ax.spines[side].set_color("none")
    panel.canvas.draw()
    return
コード例 #42
0
def Evt_Multi_D_Parellel_Plot(self, event):
    """Draw a parallel-coordinates plot on the currently selected tab.

    Clears the figure of the selected ``New_Tab`` page, selects from
    ``self.data`` the columns listed in ``self.selected_checkboxes`` together
    with ``customer_number`` (rows ``self.minimum`` up to ``self.maximum``),
    plots them grouped by ``customer_number``, hides the axis spines and
    redraws the canvas.  ``event`` is not used.
    """
    notebook = self.New_Tab
    panel = notebook.GetPage(notebook.GetSelection())
    # Presumably updates self.selected_checkboxes -- confirm in its definition.
    self.selected_checkbox()
    panel.canvas.figure.clf()

    # Index 1 of each checkbox entry supplies the column name.
    chosen = [entry[1] for entry in self.selected_checkboxes] + ["customer_number"]
    row_window = slice(self.minimum, self.maximum)
    data = self.data[chosen][row_window]

    ax = parallel_coordinates(data, "customer_number")
    for edge in ["left", "right", "top", "bottom"]:
        spine = ax.spines[edge]
        spine.set_color("none")
    panel.canvas.draw()
    return
コード例 #43
0
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import os
try:
    # Current location of the helper (pandas >= 0.20).
    from pandas.plotting import parallel_coordinates
except ImportError:
    # Older pandas releases only provide the legacy module path.
    from pandas.tools.plotting import parallel_coordinates
# All file names below are relative to the Datasets directory.
os.chdir("Datasets")
# Look pretty...
# matplotlib.style.use('ggplot')
plt.style.use('ggplot')

#
# Load the Seeds dataset into a DataFrame
df = pd.read_csv("wheat.data", sep=',', header=0)
print(df)

#
# Drop the first three columns (the 'id', 'area' and 'perimeter' features)
df.drop(df.columns[[0, 1, 2]], axis=1, inplace=True)
print(df)

#
# Plot a parallel coordinates chart grouped by
# the 'wheat_type' feature.
parallel_coordinates(df, 'wheat_type', alpha=.4)
plt.show()
コード例 #44
0
ファイル: parallel_coords.py プロジェクト: griblik/scratch
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 15 15:54:49 2016

@author: ntelford
"""

from sklearn.datasets import load_iris
try:
    # Current location of the helper (pandas >= 0.20).
    from pandas.plotting import parallel_coordinates
except ImportError:
    # Older pandas releases only provide the legacy module path.
    from pandas.tools.plotting import parallel_coordinates

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

matplotlib.style.use('ggplot')

# Build a frame of the iris measurements plus a readable class-name column.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target_names'] = [data.target_names[i] for i in data.target]

# One line per sample, grouped by class name.
plt.figure()
parallel_coordinates(df, 'target_names')
plt.show()
コード例 #45
0
bowlindata['Wickt/Inning'] = bowlindata['Wickets']/ bowlindata['Innings']
bowlindata['ball/inning'] = bowlindata['Balls']/ bowlindata['Innings']
cols = [col for col in bowlindata.columns if col not in ['Best B Inning1', 'Best B Inning2', 'Best B Match1','Best B Match2']]
bowling3 = bowlindata[cols]

# Visualising statistics for no of matches played
plt.title("Histogram of No of Matches Played")
bowling3['Matches'].hist(bins= 50, figsize=(12,6))


## Data Pruning
India = bowling3[bowling3['Country']== 'India']
World = bowling3[bowling3['Country']!= 'India']

Indiadata = IndiaFinal[IndiaFinal['Wickets'] > 50]
Worldfinal = World[World['Innings'] > 30]
worldnew = Worldfinal[Worldfinal['Wickets'] > 50]
data = worldnew.iloc[:,3:]

### Scaling the data for better visualization
df_norm = data  / (data.max() - data.min())
df_norm['world'] = data['world']
plt.figure(figsize=(16,6))
plt.title("Rest of the world Bowling Data Visualization using Parallel Coordinates")
parallel_coordinates(df_norm, 'world')


## writing the output files
worldnew.to_csv("worlddatabowling.csv")
Indiadata.to_csv("indiadatabowling.csv")
コード例 #46
0
plt.hist(Master_df.Num_Adv_Event, bins=100, color='Orange',)
plt.title('Num_Adv_Event')

#Even though data has really fat tails I do a correlation matrix anyway
Corr_matrix = Master_df.corr()
Corr_matrix.to_html()

#Standardize columns and do parallel coordinates
from pandas.tools.plotting import parallel_coordinates
from sklearn.preprocessing import StandardScaler
Numerical_Add_Adv = Add_Adv_df
Numerical_Add_Adv = Numerical_Add_Adv.drop(['Unnamed: 0', 'Unnamed: 0.1', 
                        'Unnamed: 0.1','Unnamed: 0.1.1',
                        'Approval_Year','Trade_Name', 
                        'Active_Ing','Lan_Drug_Class',
                        'FDA_Drug_Class','Innovation_Cat',
                        'Top_25', 'Norm_Adv_Event'], axis=1)
scaler = StandardScaler()
scaler.fit(Numerical_Add_Adv)
X_scaled = scaler.transform(Numerical_Add_Adv)
X_scaled = pd.DataFrame(X_scaled, columns=['Num_Adv_Event','Num_Serious',
                                            'Num_Other','Num_Life_Threat',
                                            'Num_Hosp','Num_Congen_Anom',
                                            'Num_Disable','Num_Deaths',
                                            'Num_Male','Num_Female',
                                            'AE_Per_Year','Adj_Num_AE',
                                            'Adj_Per_Year'])
reindexed_Ino_Cat = Add_Adv_df.Innovation_Cat.reset_index(drop=True)                                         
X_scaled['Innovation_Cat'] = reindexed_Ino_Cat
parallel_coordinates(X_scaled, 'Innovation_Cat')
コード例 #47
0
import pandas
import matplotlib.pyplot as plt
try:
    # Current location of the helper (pandas >= 0.20).
    from pandas.plotting import parallel_coordinates
except ImportError:
    # Older pandas releases only provide the legacy module path.
    from pandas.tools.plotting import parallel_coordinates

# Plot the iris CSV from the local pandas installation, grouped by the
# 'Name' column.
data = pandas.read_csv(
    r'C:\Python27\Lib\site-packages\pandas\tests\data\iris.csv', sep=',')
parallel_coordinates(data, 'Name')
plt.show()
コード例 #48
0
print('\nTarget Description')
print('--------------------')
print(df['class'].describe())
print(df['class'].value_counts())

import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')  # look pretty

# some visualizations
print('\nData Visualization: Parallel Coordinates')
from pandas.tools.plotting import parallel_coordinates

plt.figure()
parallel_coordinates(df, 'class')
plt.show()

# target
print('we will use only two classes: iris-setosa and iris-versicolor')
y = df.iloc[0:100, 4].values
y
y.shape
y = np.where(y == 'Iris-setosa', -1, 1)
y
# features
print(
    'we will use only two features for classification: sepal length and petal length'
)
X = df.iloc[0:100, [0, 2]].values
X
コード例 #49
0
ファイル: plots.py プロジェクト: ranijames/Big-data_analysis
def plot_parcoord(df, reporter, patient):
    """Save a parallel-coordinates plot of one patient's known cancer genes.

    The rows of ``df`` selected by the ``known_cancer_gene`` query are
    plotted over the three ``alt_freq_*`` columns, grouped by ``genes``.  The
    image is written to ``<patient>_parallel_coordinates.png`` inside
    ``reporter.out_folder``.
    """
    plt.clf()
    plt.rc("figure", figsize=(13, 6.5))

    frequency_columns = ['alt_freq_ID_fix', 'alt_freq_REL_fix', 'alt_freq_CR']
    cancer_genes = df.query('known_cancer_gene')
    parallel_coordinates(cancer_genes, 'genes', frequency_columns, alpha=0.8)

    image_name = '{}_parallel_coordinates.png'.format(patient)
    plt.savefig(os.path.join(reporter.out_folder, image_name))
コード例 #50
0
ファイル: plot.py プロジェクト: LeonardJ09/crystal
def spaghetti_plot(cluster, cov, ax=None, ilogit=False, palette='Set1'):
    """Create a spaghetti plot of a modeled cluster. This is best when the
       number of samples is less than about 20. Otherwise, use
       :func:`~plot_cluster`

    .. plot::
        :include-source: true

        >>> import crystal
        >>> import crystal.utils as cu
        >>> covs, cluster = cu.real_count_cluster()
        >>> covs.head()
        >>> formula = "methylation ~ ko"
        >>> c = crystal.wrapper(crystal.zscore_cluster, formula, cluster, covs, "ko")
        >>> crystal.plot.spaghetti_plot(c, covs)

    .. note:: 
        in the case of CountFeature as from sequence data, the points
        are sized by the sequencing depth.

    :param cluster: mapping with a ``'cluster'`` entry (the features to plot)
                    and a ``'var'`` entry (name of the grouping covariate).
    :param cov: covariates table; ``cov[var]`` gives the group of each sample.
    :param ax: axes to draw on; a new figure is created when ``None``.
    :param ilogit: when true, apply ``1 / (1 + exp(-x))`` to the values first.
    :param palette: palette passed to ``sns.color_palette`` for the groups.
    :return: the axes that were drawn on.
    """
    try:
        # Current location of the helper (pandas >= 0.20).
        from pandas.plotting import parallel_coordinates
    except ImportError:
        # Older pandas releases only provide the legacy module path.
        from pandas.tools.plotting import parallel_coordinates
    from crystal import CountFeature
    features = cluster['cluster']
    # One row per sample, one column per feature.
    methylation = np.array([f.values for f in features]).T
    if ilogit:
        methylation = 1 / (1 + np.exp(-methylation))
    df = pd.DataFrame(methylation, columns=[f.spos for f in features])
    var = cluster['var']
    df[var] = [str(x) for x in cov[var]]

    if ax is None:
        _, ax = plt.subplots(1)

    colors = sns.color_palette(palette)
    ax = parallel_coordinates(df, var, color=colors, ax=ax, use_columns=False)
    # pandas adds dark axvline, this is to remove that: one line per feature
    # is taken off the end of the axes' line list.
    lines = ax.get_lines()
    for _ in range(len(features)):
        lines.pop().remove()
    ax.legend(loc='best')
    legend = ax.get_legend()
    legend.set_frame_on(True)
    legend.get_frame().set_facecolor('white')
    legend.get_frame().set_alpha(0.5)
    if isinstance(features[0], CountFeature):
        # Overlay points whose size is each feature's count.
        counts = np.array([f.counts for f in features]).T
        for icol, f in enumerate(features):
            # NOTE(review): groups are indexed in sorted order here; confirm
            # this matches the colour order parallel_coordinates gave the lines.
            for j, group in enumerate(sorted(df[var].unique())):
                sel = np.array(df[var] == group)
                ax.scatter([icol] * sel.sum(),
                       methylation[sel, icol],
                       edgecolors=colors[j],
                       facecolors=colors[j],
                       alpha=0.5,
                       s=counts[sel, icol])
    plt.draw()
    # Pad the x-range slightly so the outermost columns are not clipped.
    xmin, xmax = ax.get_xlim()
    ax.set_xlim(int(xmin) - 0.05, int(xmax) + 0.05)
    ax.get_legend().set_title(var)
    ax.set_ylabel('methylation')
    # NOTE(review): the return value is discarded, so this call appears to
    # have no effect -- confirm before removing.
    sns.axes_style({'axes.linewidth': 0, 'axes.grid': False})
    sns.despine()
    sns.set_style("ticks")
    return ax
コード例 #51
0
ファイル: dbsPlotCanvas.py プロジェクト: behollis/DBSViewer
 def compute_initial_figure(self):
     """Draw a parallel-coordinates plot of ``irisdata.txt`` grouped by ``Name``."""
     source_path = '../examplesAndSandboxCode/irisdata.txt'
     frame = pd.read_csv(source_path)
     parallel_coordinates(frame, 'Name')
コード例 #52
0
ファイル: wrangle.py プロジェクト: mekindig/machine-learning
    df.to_csv(OUTPATH, sep='\t', index=False, header=False)

    print "Wrote dataset of %i instances and %i attributes to %s" % (df.shape + (OUTPATH,))

    with open('meta.json', 'w') as f:
        meta = {'feature_names': FEATURES, 'target_names': LABEL_MAP}
        json.dump(meta, f, indent=4)

    # Describe the dataset
    print df.describe()

    # Determine the shape of the data
    print "{} instances with {} features\n".format(*df.shape)

    # Determine the frequency of each class
    print df.groupby('label')['label'].count()

    # Create a scatter matrix of the dataframe features
    scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='kde')
    plt.show()

    # Parallel Coordinates
    plt.figure(figsize=(12,12))
    parallel_coordinates(df,'label')
    plt.show()

    # Radviz
    plt.figure(figsize=(12,12))
    radviz(df, 'label')
    plt.show()
コード例 #53
0
ファイル: Exercise1.py プロジェクト: jopesy/DAKD_Exercises
def plotParallelCoordinates():
    """Show a parallel-coordinates plot of the CSV at ``csv_path``.

    The file is read with ``;`` as separator and the lines are grouped by the
    ``quality`` column.
    """
    frame = pandas.read_csv(csv_path, sep=';')
    plt.figure()
    parallel_coordinates(frame, class_column='quality')
    plt.show()
#
# TODO: Load up the Seeds Dataset into a Dataframe
# It's located at 'Datasets/wheat.data'
# 
# .. your code here ..
df = pd.read_csv("Datasets/wheat.data", index_col=0, header=0)


#
# TODO: Drop the 'id', 'area', and 'perimeter' feature
# 
# .. your code here ..

df = df.drop('area', axis=1)
df = df.drop('perimeter', axis=1)

#
# TODO: Plot a parallel coordinates chart grouped by
# the 'wheat_type' feature. Be sure to set the optional
# display parameter alpha to 0.4
# 
# .. your code here ..

plt.figure()
parallel_coordinates(df, 'wheat_type', alpha=0.4)

plt.show()


コード例 #55
0
ファイル: Preprocess.py プロジェクト: AntHar/DAT4-students
NBA_players.pos.fillna(value = 'F', inplace = True) 

#NBA_players.pos.value_counts()

NBA_players.replace('SG','G', inplace = True) #decided to change to Gs and Fs because All NBA teams 
NBA_players.replace('PG','G', inplace = True) 
NBA_players.replace('SF','F',inplace = True) 
NBA_players.replace('PF','F',inplace = True) 
#First just preprocess by taking average

#Exploratory Analysis
#Find most important features: parallel coordinates; correlation matrix

NBA_players.to_csv('NBA_players.csv')


corrMat =NBA_players.corr()
NBA_players.columns

from pandas.tools.plotting import parallel_coordinates
features = [["g","gs","mp","fg","fga","fg_","x3p","x3pa","x3p_","x2p","x2pa","x2p_","ft","fta","ft_","orb","drb","trb","ast","stl","blk","tov","pf","pts"]]
#features = [['g','gs','mp','pts','AST%','PER','STL%','USG%','FTr','3PAr','TS%','VORP','BPM','OBPM','DBPM','WS','WS/48','DWS','OWS',]]
NBA_df = pd.DataFrame(NBA_players, columns = features)
NBA_df['team']= NBA_players.team
parallel_coordinates(data=NBA_df,class_column = 'team')


                     


コード例 #56
0
ファイル: dtree.py プロジェクト: akash13singh/dm-assignment2


# histogram of class variable
n, bins, patches = plt.hist(lymph['class'], facecolor='green')
plt.xlabel('class')
plt.grid(True)
#plt.show()

#remove classes
lymph = lymph[lymph['class'].isin([1,2])]
print(len(lymph))


plt.figure()
parallel_coordinates(lymph, 'class',colormap='gist_rainbow')
plt.xticks(rotation=30)
#plt.show()


seeds = [31,67,321,5,76,43,12,11]
class_names = ["metastases","malign lymph"]


clf = tree.DecisionTreeClassifier()
clf_extra = tree.ExtraTreeClassifier()
clf_pruned_cart = tree.DecisionTreeClassifier(min_samples_leaf=3, max_depth=5)
test_errors_CART = []
test_errors_Extra = []
test_errors_cart_pruned = []
コード例 #57
0
# http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html

# ---------------------------------------
# EXERCISE: Create a Parallel Coordinates
#           visualization with the classes
# ---------------------------------------

#================================
# Option 3: Parallel Coordinates

from pandas.tools.plotting import parallel_coordinates
# I'm going to convert to a pandas dataframe
# Using a snippet of code we learned from one of Kevin's lectures!
iris_df['Name'] = iris.target_names[iris.target]
parallel_coordinates(data=iris_df,
                     class_column='Name',
                     colors=('#FF0054', '#FBD039', '#23C2BC'))
'''
DETERMINING THE NUMBER OF CLUSTERS
How do you choose k? There isn't a bright line, but we can evaluate 
performance metrics such as the silhouette coefficient and within sum of 
squared errors across values of k.

scikit-learn Clustering metrics documentation:
http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
'''

# Create a bunch of different models
k_rng = range(1, 15)
est = [KMeans(n_clusters=k).fit(d) for k in k_rng]
コード例 #58
0
lines = list(list(int(i) for i in line if i) for line in lines)
data = np.array(lines)
pd_data = pd.DataFrame(data)
pd_data.to_csv('lymphoma.csv')
"""

pd_data = pd.read_csv("lymphoma.csv", header=0, index_col=0)
data = pd_data.values
print type(data), data.shape
# replace missing values, just as in the paper
generator = np.random.RandomState(0)
idx = np.where(data == 999)
data[idx] = generator.randint(-800, 801, len(idx[0]))

# cluster with same parameters as original paper
model = ChengChurch(n_clusters=100, max_msr=1200, deletion_threshold=1.2, inverse_rows=True, random_state=0)
model.fit(data)

# find bicluster with smallest msr and plot it
msr = lambda a: (np.power(a - a.mean(axis=1, keepdims=True) - a.mean(axis=0) + a.mean(), 2).mean())
msrs = list(msr(model.get_submatrix(i, data)) for i in range(100))
arr = model.get_submatrix(np.argmin(msrs), data)
print type(arr), arr.shape
df = DataFrame(arr)
df["row"] = map(str, range(arr.shape[0]))
parallel_coordinates(df, "row", linewidth=1.5)
plt.xlabel("column")
plt.ylabel("expression level")
plt.gca().legend_ = None
plt.show()
コード例 #59
0
project_names =  [
    'keystone',
    'horizon',
    'glance',
    'swift',
    'nova',
    'cinder',
    'neutron',
    'heat',
    'ceilometer',
    'ironic',
    'trove',
    'designate',
    'sahara',
    'zaqar',
    'barbican']
    
tempest_tests = [141, 192, 238, 374, 1142, 2649, 3076, 1731, 1689]
openstack_projects = [3, 5, 7, 7, 9, 10, 11, None, None]

df = pandas.DataFrame(tests_per_project)
plt.figure()

df = pandas.read_csv('i_hate_pandas.csv')

plt.ylabel('# of tests')
ax = parallel_coordinates(df, 'name')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
           ncol=3, fancybox=True, shadow=True)
plt.savefig('tests_per_proj.png', dpi=900)
コード例 #60
0
# ---------------------------------------
# EXERCISE: Create a Parallel Coordinates 
#           visualization with the classes
# ---------------------------------------


#================================
# Option 3: Parallel Coordinates

from pandas.tools.plotting import parallel_coordinates
# I'm going to convert to a pandas dataframe
# Using a snippet of code we learned from one of Kevin's lectures!
features = [name[:-5].title().replace(' ', '') for name in iris.feature_names]
iris_df = pd.DataFrame(iris.data, columns = features)
iris_df['Name'] = iris.target_names[iris.target]
parallel_coordinates(data=iris_df, class_column='Name', 
                     colors=('#FF0054', '#FBD039', '#23C2BC'))
                     
'''
DETERMINING THE NUMBER OF CLUSTERS
How do you choose k? There isn't a bright line, but we can evaluate 
performance metrics such as the silhouette coefficient and within sum of 
squared errors across values of k.

scikit-learn Clustering metrics documentation:
http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
'''

# Create a bunch of different models
k_rng = range(1,15)
est = [KMeans(n_clusters = k).fit(d) for k in k_rng]