Beispiel #1
0
def plot_tests(genomes, pairs, out, plot, cats, y_lab, normalize = False):
    """
    Tabulate and plot the 'test' results of the requested samples.

    Each (genome, sample) pair in `pairs` is looked up in
    genomes[genome]['samples'][sample]; its 'test' entry is unpacked into
    three sequences which are concatenated over all pairs into the columns
    `cats`, 'n50' and `y_lab` of a DataFrame.  A fourth column, 'sample',
    holds the sample name with its last dot-suffix removed and underscores
    replaced by spaces.

    The whole table is written to `out` as tab-separated text.  Rows whose
    `y_lab` value compares equal to False are then dropped, and a box plot
    with an overlaid jittered strip plot (both coloured by sample) is saved
    to `plot`.

    When `normalize` is 'log2', the `y_lab` values go through log_trans()
    before the table is built.
    """
    cat_values, y_values, sample_labels, n50_values = [], [], [], []
    for genome_id, sample_id in pairs:
        record = genomes[genome_id]['samples'][sample_id]
        # human-readable label: drop the extension, underscores -> spaces
        label = sample_id.rsplit('.', 1)[0].replace('_', ' ')
        per_cat, per_n50, per_y = record['test']
        cat_values.extend(per_cat)
        y_values.extend(per_y)
        # one label per y value so the columns stay aligned
        sample_labels.extend([label for _ in per_y])
        n50_values.extend(per_n50)

    if normalize == 'log2':
        y_values = log_trans(y_values)

    table = pd.DataFrame({cats:cat_values, y_lab:y_values,
                          'sample':sample_labels, 'n50':n50_values})
    table.to_csv(out, sep = '\t')

    # the unfiltered table is what gets saved; only the plot skips False rows
    table = table[table[y_lab] != False]

    sns.set_style('whitegrid')
    sns.set_context('poster')
    common = dict(x = cats, y = y_lab, data = table,
                  hue = 'sample', palette = 'deep')
    box_axes = sns.boxplot(**common)
    sns.stripplot(jitter = True, size = 5, edgecolor = 'gray', **common)
    plt.legend(loc = 'upper right', bbox_to_anchor=(1.05, 1))
    box_axes.figure.savefig('%s' % (plot), bbox_inches = 'tight')
def main(argv):
    """
    Plot write bandwidth against the number of writers and exit.

    Reads the CSV file named by the first command-line argument, prints the
    loaded table, draws a jittered strip plot of 'Write Bandwidth (MiB/s)'
    per 'Writers' value (one dodged strip per 'Scheme'), saves it as
    'dist_bandwidth_stats.png' and terminates the process with status 0.
    """
    # NOTE(review): `argv` is accepted but the path is taken from sys.argv;
    # kept as-is because the callers are not visible here.
    infile = sys.argv[1]

    resframe = pd.read_csv(infile)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Summary of all results found:")
    print(resframe)

    fig, ax = plt.subplots()

    # `dodge` is the current name of the former `split` parameter.
    sns.stripplot(x='Writers', y='Write Bandwidth (MiB/s)',
      data=resframe, hue='Scheme', jitter=True, dodge=True)
    ax.set_ylim(ymin=0)

    plt.ylabel('Write Bandwidth / MiB/s')
    plt.xlabel('Writers')
    plt.legend()
    plt.savefig('dist_bandwidth_stats.png')
    plt.clf()

    sys.exit(0)
Beispiel #3
0
    def plot_errors_for_elements(self, ax=None, **kwargs):
        """
        Box plot, with the individual observations overlaid, of the relative
        errors grouped by chemical element.

        The relative error of a row is 100 * (this - ae) / ae; it is
        attributed to every distinct symbol that species_from_formula()
        returns for the row's formula.  Elements are ordered with
        sort_symbols_by_Z().  Returns the figure object obtained from
        get_ax_fig_plt().
        """
        records = []
        for _, entry in self.iterrows():
            rel_err = 100 * (entry["this"] - entry["ae"]) / entry["ae"]
            records.extend(
                dict(element=symbol,
                     rerr=rel_err,
                     formula=entry.formula,
                     struct_type=entry.struct_type)
                for symbol in set(species_from_formula(entry.formula))
            )

        frame = DataFrame(records)
        order = sort_symbols_by_Z(set(frame["element"]))

        import seaborn as sns
        ax, fig, plt = get_ax_fig_plt(ax=ax)

        # Arguments shared by the box layer and the point layer
        shared = dict(x="element", y="rerr", data=frame, ax=ax, order=order)

        # Boxes first (whiskers span the full range), then every observation
        ax = sns.boxplot(whis=np.inf, color="c", **shared)
        sns.stripplot(jitter=True, size=5, color=".3", linewidth=0, **shared)

        sns.despine(left=True)
        ax.set_ylabel("Relative error %")
        ax.grid(True)
        return fig
Beispiel #4
0
def make_plots(groups):
    """
    Write four summary figures for `groups` to the working directory.

    * points.png  - jittered strip plot of 'moa' per 'ammo'
    * boxplot.png - box plot of 'moa' per 'ammo'
    * avg_moa.png - bar plot of 'mean' per 'ammo' (no error bars)
    * qqplot.png  - distribution and probability plot of the non-null
                    'standard' values, side by side

    postprocess() is called on each of the first three figures before it is
    saved.
    """
    # x/y are passed by keyword: current seaborn releases no longer accept
    # them positionally, while keywords work on every release.
    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")

    plt.clf()
    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")

    plt.clf()
    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")

    plt.clf()
    std = groups["standard"]
    std = std[std.notnull()]

    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
Beispiel #5
0
def view_distribution(df,x="type",y="rate", plt=plt):
    """
    Show a violin plot of `y` per `x` with the raw observations overlaid.

    The title is built from `y` and the first value of df.symbol.  Drawing
    goes to figure 1 of the supplied pyplot-like module `plt`, which is
    shown at the end.
    """
    symbol = df.symbol.values[0]
    heading = y + ' distribution (' + symbol + ')'

    plt.figure(1, figsize=(15, 15))

    columns = dict(x=x, y=y, data=df)
    # empty violins first, then the individual points on top
    sns.violinplot(inner=None, **columns)
    sns.stripplot(jitter=True, color="white", edgecolor="gray", **columns)

    plt.title(heading)
    plt.show()
Beispiel #6
0
def Create_WildPlot(X, y1, y2, y3, y4):
    """
    Strip plot of feature `X` against the wilderness-area code (1-4).

    `y1`..`y4` are parallel indicator sequences; position i gets the code
    of the first indicator that equals 1 there, and keeps its `y1` value
    when none does.  The figure is titled "Wilderness Area vs <X name>" and
    saved under Plots/ as a PDF named after the title without spaces.

    The codes are written into a copy, so the caller's `y1` is left
    untouched (the previous version overwrote it in place).
    """
    import copy

    xlab = X.name.replace("_", " ")
    ylab = 'Wilderness Area'
    figlab = ylab + " vs " + xlab
    filelab = "Plots/" + figlab.replace(" ", "") + ".pdf"
    f, ax = plt.subplots(figsize=(5, 5))

    y = copy.copy(y1)
    for i in range(len(y)):
        # the first indicator set at this position decides the code
        for code, indicator in enumerate((y1, y2, y3, y4), start=1):
            if indicator[i] == 1:
                y[i] = code
                break

    sns.stripplot(x = X, y = y, jitter = True, size = 5, linewidth = 0.1, ax = ax)
    # Label through the Axes: the `sns.plt` alias is gone from seaborn.
    ax.set_title(figlab)
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    savefig(filelab)
Beispiel #7
0
def _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys, y_keys, ax,
                                     cmap, n_cat=5, plottype="box"):
    """
    Draw one categorical and one continuous column of `df` against each
    other as a box, strip, swarm or violin plot.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    x_keys, y_keys : object
        Exactly how these are produced is up to the caller.  When `x_keys`
        is the very same object as `xlabel`, `y_keys` is used as the
        category order; otherwise, when `y_keys` is the very same object as
        `ylabel`, `x_keys` is used.  Any other combination is an error.

    ax : matplotlib.Axes object
        The matplotlib.Axes object to draw into

    cmap : matplotlib.cm.colormap
        Passed to seaborn.color_palette to build the palette

    n_cat : int
        The number of colours requested from the palette

    plottype : {"box" | "violin" | "strip" | "swarm"}
        The type of plot to produce; default is a box plot

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    """
    if x_keys is not xlabel and y_keys is not ylabel:
        raise Exception("Something went terribly, horribly wrong!")
    category_order = y_keys if x_keys is xlabel else x_keys

    colours = sns.color_palette(cmap, n_cat)

    # plot type -> name of the seaborn function that draws it
    plotter_names = {
        "box": "boxplot",
        "strip": "stripplot",
        "swarm": "swarmplot",
        "violin": "violinplot",
    }
    try:
        plotter_name = plotter_names[plottype]
    except (KeyError, TypeError):
        plotter_name = None
    if plotter_name is None:
        raise Exception("plottype not recognized!")

    draw = getattr(sns, plotter_name)
    draw(x=xlabel, y=ylabel, data=df, order=category_order,
         palette=colours, ax=ax)

    return ax
Beispiel #8
0
def p7(data):
    """
    Show age against passenger class, coloured by survival, with one panel
    for male passengers (top) and one for female passengers (bottom).
    """
    # Distribution of survivors
    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

    # x / y / hue are given by keyword: recent seaborn releases reject them
    # as positional arguments.
    # NOTE(review): only the male panel fixes hue_order and only the female
    # panel fixes order, so colours/positions may differ between the two
    # panels - confirm whether that is intended.
    sns.stripplot(
        x="Pclass",
        y="Age",
        hue="Survived",
        data=data[data["Sex"] == "male"],
        palette="Set2",
        size=20,
        hue_order=(1, 0),
        marker="D",
        alpha=0.25,
        jitter=True,
        ax=ax1,
    )
    ax1.set_title("MALE")

    sns.stripplot(
        x="Pclass",
        y="Age",
        hue="Survived",
        data=data[data["Sex"] == "female"],
        palette="Set2",
        size=20,
        marker="D",
        alpha=0.25,
        order=(1, 2, 3),
        jitter=True,
        ax=ax2,
    )
    ax2.set_title("FEMALE")

    plt.show()
Beispiel #9
0
def stripplot_to_pdf(data, save_path, x=None, y=None, hue=None,
                     style='whitegrid', fontsize=2, rows=1, cols=1,
                     figsize=(4, 4), **kwargs):
    """ Data plotted as stripplot using seaborn and saved in a pdf
    given in save_path

    Parameters
    ----------
    data : pd.DataFrame or path to csv file
        single or list of data to plot into pdf.  A list/tuple gets one
        column of axes per element (overriding `cols`).

    save_path : str
        Path to save the pdf plot.

    x, y, hue : str, optional
        Column names forwarded to seaborn.stripplot.

    style : str
        seaborn style name.

    fontsize : float
        seaborn font scale.

    rows, cols : int
        Shape of the axes grid.

    figsize : tuple
        Figure size handed to matplotlib.

    **kwargs
        Forwarded to seaborn.stripplot.
    """
    # `basestring` only exists on Python 2; fall back to `str` on Python 3.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(data, string_types):
        data = pd.read_csv(data)

    if isinstance(data, (list, tuple)):
        cols = len(data)
    else:
        data = [data, ]

    # One call for both settings: sns.set() on its own re-applies its
    # default style and would discard a preceding set_style(style).
    sns.set(style=style, font_scale=fontsize)

    with PdfPages(save_path) as pdf:
        # squeeze=False always yields a 2-D array of axes, so flattening
        # also works for the default 1x1 grid.
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=figsize,
                                 squeeze=False, sharey=True)
        for ax, d in zip(axes.reshape(-1), data):
            sns.stripplot(x=x, y=y, hue=hue, data=d, ax=ax, **kwargs)
        pdf.savefig(fig)
        plt.close(fig)
def timePlotScatter(data):
    """
    Interactively draw one strip plot per gene, with time on the x-axis.

    `data` is pivoted so that rows are samples and columns are
    (Gene, Time) pairs of 'Values'.  The user is asked whether to add a
    boxplot per time point, whether to normalise each gene to its maximum,
    and for the y-axis label.  Every gene gets its own numbered figure;
    all figures are shown at the end.
    """
    # distinct gene names, in order of first appearance
    geneList = list(dict.fromkeys(data['Gene']))

    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Time'])

    box = input("Do you want a boxplot for each timepoint? (y/n): ")
    normalize = input("Would you like to normalize the y-axis? (y/n): ")
    ylabel = input("What should the y-axis label be?: ")

    for counter, key in enumerate(geneList, start=1):
        title = key
        plt.figure(counter)
        tempTable = data[key]
        if normalize == 'y':
            # Missing cells are NaN after the pivot; treat them as 0 when
            # looking for the maximum so one NaN cannot turn the whole
            # table into NaN (same approach as the other plot helpers).
            tempTable = tempTable / np.amax(tempTable.fillna(0).values)
            title = key + " Normalized"
        if box == "y":
            makeBoxplot(tempTable)

        sns.stripplot(data=tempTable, size = 7, jitter = True, palette = sns.color_palette("Set1", n_colors=8, desat=.9))
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Time(min)')
        # anchor the y-axis at zero on every figure, not only the last one
        plt.gca().set_ylim(bottom = 0)
    plt.show()
def conditionPlot(data):
    """
    Interactively draw a strip plot of 'Values' per (Gene, Condition).

    `data` is pivoted to samples x (Gene, Condition).  The user is asked
    whether to add a boxplot (and which flavour), whether to normalise by
    the overall maximum (NaN counted as 0), and for the y-axis label.  The
    pivoted table is passed to exportExcel() before the plot is shown.
    """
    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Condition'])

    wants_box = input("Do you want a boxplot to go with your data? (y/n): ")
    wants_normalized = input("Would you like to normalize the y-axis? (y/n): ")

    if wants_normalized == 'y':
        peak = np.amax(data.fillna(0).values)
        data = data / peak

    if wants_box == "y":
        for line in ("What kind of boxplot do you want?",
                     '"1" for a quartile boxplot.',
                     '"2" for a standard deviation boxplot.'):
            print(line)
        box_style = input("Your answer: ")

        draw_box = makeBoxplotQuartile if box_style == "1" else makeBoxplotDeviation
        draw_box(data)

        # NOTE(review): a generic boxplot is drawn in addition to the one
        # chosen above - confirm this is wanted.
        makeBoxplot(data)

    set1 = sns.color_palette("Set1", n_colors=8, desat=.9)
    sns.stripplot(data=data, size = 7, jitter = True, palette = set1)

    y_axis_label = input("What should the y-axis label be?: ")
    plt.ylabel(y_axis_label)
    plt.xlabel('Gene/Condition')

    exportExcel(data)
    plt.show()
def plot_compare_median_consensus(output_dir, df_order, metric, type = 'ts',DISPLAY = 0):
    """
    Plot `metric` per algorithm and save the figure as a PNG in `output_dir`.

    type == 'ts': horizontal box plot of `metric` per 'algorithm' on a log
                  x-axis with every observation overlaid; saved as
                  ts_compare_median_with_consensus_<metric>.png
    type == 'lm': regression plot of `metric` against 'order', one hue per
                  algorithm; saved as
                  lm_compare_median_with_consensus_<metric>.lm.png

    The figure is shown first when DISPLAY is truthy, and closed afterwards.
    """
    plt.figure()

    if type == 'ts':
        layer_args = dict(x=metric, y="algorithm", data=df_order)

        box_axes = sb.boxplot(whis=np.inf, color="c", **layer_args)
        # every single observation on top of the boxes
        sb.stripplot(jitter=True, size=3, color=".3", linewidth=0,
                     **layer_args)
        box_axes.set_xscale("log")
        sb.despine(trim=True)

        ts_path = output_dir + '/ts_compare_median_with_consensus_' + metric + '.png'
        plt.savefig(ts_path, format='png')

    if type == 'lm':
        sb.lmplot(x="order", y=metric, hue="algorithm", data=df_order)
        plt.xlabel('images sorted by the average neuron distance of the median reconstruction')
        lm_path = output_dir + '/lm_compare_median_with_consensus_' + metric + '.lm.png'
        plt.savefig(lm_path, format='png')

    if DISPLAY:
        plt.show()
    plt.close()
def stripplot_mean_score(df, save_path, atlas=None, suffix=None, x=None,
                         y=None, hue=None, style='whitegrid', fontsize=14,
                         jitter=.2, figsize=(9, 3), leg_pos=2, axx=None):
    """
    Draw grey boxes with per-dataset points of column `x` for each value of
    column `y`, on the axes `axx`.

    The values of column `y` ('no' / 'yes') are replaced by display labels,
    dataset names are upper-cased, a vertical reference line is drawn at
    x=0 and the x tick labels are rewritten as signed percentages.  When
    `atlas` is given, its display alias (or `atlas` itself if it has none)
    is written centred above the axes.

    `axx` defaults to the current matplotlib axes.  `save_path`, `suffix`,
    `style`, `figsize` and `leg_pos` are accepted but not used here.
    Returns None.
    """
    aliases = {'kmeans': 'K-Means',
               'ica': 'GroupICA',
               'dictlearn': 'Dictionary Learning',
               'basc': 'BASC'}
    # The same display labels apply to every atlas.
    new_names = {'no': 'Without\n regions extracted',
                 'yes': 'With\n regions extracted'}

    def change_label_name(row, label):
        row[label] = new_names[row[label]]
        return row

    df = df.apply(lambda row: change_label_name(row, y), axis=1)

    # change the name of the dataset to upper
    df['dataset'] = df['dataset'].str.upper()

    rc('xtick', labelsize=12)
    rc('ytick', labelsize=16)
    rc('axes', labelweight='bold')
    rc('legend', fontsize=fontsize)

    # The signature allows axx=None; fall back to the current axes.
    if axx is None:
        axx = plt.gca()

    # draw a default vline at x=0 that spans the yrange
    axx.axvline(x=0, linewidth=4, zorder=0, color='0.6')

    sns.boxplot(data=df, x=x, y=y, fliersize=0, linewidth=2,
                boxprops={'facecolor': '0.5', 'edgecolor': '.0'},
                width=0.5, ax=axx)

    # `dodge` is the current name of the former `split` parameter.
    # NOTE(review): `datasets_palette` is not defined in this function; it
    # is expected to exist at module level - confirm.
    sns.stripplot(data=df, x=x, y=y, hue=hue, edgecolor='gray',
                  size=5, dodge=True, palette=datasets_palette, jitter=jitter,
                  ax=axx)

    axx.set_xlabel('')
    axx.set_ylabel('')
    # Title from this function's own arguments; the previous code read the
    # undefined names `key` and `ax`.
    if atlas is not None:
        axx.text(.5, 1.02, aliases.get(atlas, atlas),
                 transform=axx.transAxes, size=15, ha='center')

    # make the positive labels with "+"
    tick_labels = []
    for tick in axx.get_xticks():
        sign = '+' if tick > 0 else ''
        tick_labels.append(sign + str(tick) + r'$\%$')
    axx.set_xticklabels(tick_labels)
    def CheckShannonIndex(self, labels=None, condition_dict=None, fig_title=None):
        """
        Compute the Shannon entropy of every sample and optionally plot it.

        The entropy (natural log) of each row of self.abun_df is stored in
        an 'SDI' column, joined column-wise with self.meta_df and kept on
        self.SDI.

        labels : iterable of str, optional
            Metadata columns; each is converted to a categorical column
            and, when `fig_title` is given, plotted as violin + strip and
            box + strip figures of SDI per category.
        condition_dict : unused.
        fig_title : str, optional
            Output file name such as 'run.png'.  Figures are written to
            '<base>_all.violinplot.<ext>', '<base>_all.boxplot.<ext>' and
            '<base>_<label>.{violinplot,boxplot}.<ext>'.  A name without an
            extension is saved as 'png'.  Without `fig_title` nothing is
            drawn.
        """
        def ShannonIndex(numList):
            """Shannon entropy of a sequence of counts (0.0 if they sum to 0)."""
            total = sum(numList)
            if total == 0:
                return 0.0
            entropy = 0.0
            for num in numList:
                freq = float(num) / total
                if freq > 0:
                    entropy -= freq * np.log(freq)
            return entropy

        def save_and_close(ax, suffix):
            """Save the figure owning `ax` under the given suffix and close it."""
            fig = ax.get_figure()
            fig.savefig(fig_base + suffix + fig_ext)
            plt.close(fig)

        print('Making Shannon Diversity boxplot for all samples')

        # Calculate shannon entropy for each sample
        SDIs = pd.DataFrame(
            {'SDI': [ShannonIndex(row) for _, row in self.abun_df.iterrows()]},
            index=self.abun_df.index,
        )
        # Add metadata labels to the df containing SDIs
        SDIs = pd.concat([SDIs, self.meta_df], axis=1)
        SDIs['SDI'] = SDIs['SDI'].astype('float64')
        self.SDI = SDIs

        # Split the output name into base and extension once, up front
        fig_base = fig_ext = None
        if fig_title:
            fig_base, dot, fig_ext = fig_title.rpartition('.')
            if not dot:
                fig_base, fig_ext = fig_title, 'png'

        # First plot SDI of all samples
        if fig_title:
            sb.violinplot(x=SDIs['SDI'], inner=None, saturation=0.35)
            ax = sb.stripplot(x=SDIs['SDI'], jitter=True, size=5, linewidth=0.6)
            save_and_close(ax, '_all.violinplot.')
            # Do the boxplot
            sb.boxplot(x=SDIs['SDI'])
            ax = sb.stripplot(x=SDIs['SDI'], jitter=True, size=5, linewidth=0.6)
            save_and_close(ax, '_all.boxplot.')

        if labels:
            print('Making boxplots separated by labels: ')
            for label in labels:
                print(label + '...')
                SDIs[label] = SDIs[label].astype('category')
                if not fig_title:
                    # no file name to save under, so nothing is drawn
                    continue
                sb.violinplot(x=label, y='SDI', data=SDIs, saturation=0.35, inner=None)
                ax = sb.stripplot(x=label, y='SDI', data=SDIs, jitter=True, size=5, linewidth=0.6)
                save_and_close(ax, '_' + label + '.violinplot.')
                # Boxplot
                sb.boxplot(x=label, y='SDI', data=SDIs, saturation=0.35)
                ax = sb.stripplot(x=label, y='SDI', data=SDIs, jitter=True, size=5, linewidth=0.6)
                save_and_close(ax, '_' + label + '.boxplot.')
Beispiel #15
0
def strip(X, y, description):
    '''
    Save one jittered strip plot per column of X against y (for
    visualizing categorical data).

    Each figure is written to visuals/<column>_<description>_strips and its
    path is printed.
    '''
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for feature_title, column in X.items():
        sns.stripplot(x=column, y=y, jitter=True)
        out_path = 'visuals/'+feature_title+'_'+description+'_strips'
        plt.savefig(out_path)
        print(out_path)
        plt.close()
def bar_box_violin_dot_plots(data, category_col, numeric_col, axes,
                             file_name=None):
    """
    Draw four views of `numeric_col` per `category_col` onto `axes`.

    axes[0] bar plot, axes[1] jittered strip plot, axes[2] box plot (rows
    with a null `numeric_col` removed), axes[3] violin plot with quartile
    lines and count-scaled widths.  `file_name` is accepted but not used.
    """
    # x/y are passed by keyword: recent seaborn releases reject them as
    # positional arguments.
    sns.barplot(x=category_col, y=numeric_col, data=data, ax=axes[0])
    sns.boxplot(x=category_col, y=numeric_col,
                data=data[data[numeric_col].notnull()], ax=axes[2])
    # `kind` belongs to factorplot/catplot, not violinplot, so it is not
    # passed here.
    sns.violinplot(x=category_col, y=numeric_col, data=data,
                   inner="quartile", scale='count', split=True,
                   ax=axes[3])
    sns.stripplot(x=category_col, y=numeric_col, data=data, jitter=True,
                  ax=axes[1])
    sns.despine(left=True)
def plot_domestic_origin(df, predicted=None):
    """
    Show a strip plot of 'DomLifeGross' per 'OriginC'.

    x tick values are multiplied by 1e-6 and printed without decimals, to
    match the "millions" axis label.  `predicted` is accepted but not used.
    """
    ax = plt.subplot(111)
    ax.xaxis.set_major_formatter(
        tkr.FuncFormatter(lambda x, pos: ('%.0f') % (x * 1e-6)))
    sns.stripplot(x="DomLifeGross", y="OriginC", data=df, ax=ax)
    # Label through the Axes / pyplot: the `sns.plt` alias is gone from
    # seaborn.
    ax.set_xlabel("Domestic Lifetime Gross (millions)")
    ax.set_ylabel("Country of Origin")
    sns.despine()
    plt.show()
def conditionPlot(data):
    """
    Show a jittered strip plot of 'Values' per (Gene, Condition), with a
    box plot underneath if the user answers "y" to the prompt.
    """
    pivoted = data.pivot_table('Values', ['Sample'], ['Gene', 'Condition'])

    wants_box = input("Do you want a boxplot to go with your data? (y/n): ") == "y"
    if wants_box:
        sns.boxplot(data=pivoted)

    sns.stripplot(data=pivoted, size = 6, jitter = True, edgecolor = "black")

    plt.ylabel('Values')
    plt.xlabel('Gene/Condition')
    plt.show()
Beispiel #19
0
    def plot_errors_for_elements(self, ax=None, **kwargs):
        """
        Box plot of the relative errors per chemical element, with the
        individual observations overlaid and coloured by structure type.

        The relative error of a row is 100 * (this - ae) / ae; it is
        attributed to every distinct symbol that species_from_formula()
        returns for the row's formula.  Elements are ordered with
        sort_symbols_by_Z().  The tick labels are split between the bottom
        axis (every other tick starting at the first) and a twin top axis
        (the ticks in between).  Returns the figure object obtained from
        get_ax_fig_plt().
        """
        records = []
        for _, entry in self.iterrows():
            rel_err = 100 * (entry["this"] - entry["ae"]) / entry["ae"]
            records.extend(
                dict(element=symbol,
                     rerr=rel_err,
                     formula=entry.formula,
                     struct_type=entry.struct_type)
                for symbol in set(species_from_formula(entry.formula))
            )

        frame = DataFrame(records)
        order = sort_symbols_by_Z(set(frame["element"]))

        import seaborn as sns
        ax, fig, plt = get_ax_fig_plt(ax=ax)

        shared = dict(x="element", y="rerr", data=frame, ax=ax, order=order)

        # Boxes first, then every observation on top
        ax = sns.boxplot(whis=np.inf, color="c", **shared)
        sns.stripplot(hue='struct_type', jitter=0, size=4, color=".3",
                      linewidth=0, palette=sns.color_palette("muted"),
                      **shared)

        sns.despine(left=True)
        ax.set_ylabel("Relative error %")

        # Read the labels and positions before the ticks are replaced below
        tick_labels = ax.get_xticklabels()
        tick_positions = ax.get_xticks()
        first, last = min(tick_positions), max(tick_positions)
        bottom_ticks = range(first, last + 1, 2)
        top_ticks = range(first + 1, last + 1, 2)
        bottom_labels = [tick_labels[pos].get_text() for pos in bottom_ticks]
        top_labels = [tick_labels[pos].get_text() for pos in top_ticks]

        ax.set_xticks(bottom_ticks)
        ax.set_xticklabels(bottom_labels, rotation=90)

        # Twin axis carrying the remaining labels along the top edge
        ax2 = ax.twiny()
        ax2.set_zorder(-1)
        ax2.set_xticks(top_ticks)
        ax2.set_xticklabels(top_labels, rotation=90)
        ax2.set_xlim(ax.get_xlim())

        ax.grid(True)
        return fig
Beispiel #20
0
def plot_scatterBox(df,xData,yData,title,fileName,plotAspect=1,colorVal=None):
    """
    Box plot of `yData` per `xData` with the individual points overlaid.

    The figure is 6*plotAspect wide and 6 high.  A truthy `colorVal` is
    used as the single colour of both layers; otherwise seaborn chooses
    the colours.  The finished figure is handed to process_plot(fileName).
    """
    plt.figure(figsize=(6*plotAspect,6))

    # only pass `color` when one was actually requested
    colour_args = {'color': colorVal} if colorVal else {}

    sns_plot = sns.boxplot(x=xData, y=yData, data=df, **colour_args)
    sns.stripplot(x=xData, y=yData, size=9, data=df,
                  edgecolor='gray', linewidth=1, **colour_args)

    plt.title(title)
    fig = sns_plot.get_figure()
    process_plot(fileName)
Beispiel #21
0
def plot_feature_importance(features, fitted_forest):
    """Draw the feature importances of a fitted random forest as a Cleveland dot plot, largest first."""
    plt.figure()

    importances = fitted_forest.feature_importances_
    # indices from most to least important
    ranking = np.argsort(importances)[::-1]
    names = np.array(features)

    with sns.axes_style("whitegrid"):
        sns.stripplot(y=names[ranking], x=importances[ranking],
                      orient="h", color='red', size=10)

    # keep the automatic upper limit but start the axis at zero
    _, upper = plt.xlim()
    plt.xlim(0, upper)
    plt.grid(axis='y', linestyle=':')
    plt.xlabel('Feature importance score')
Beispiel #22
0
    def plot_hints(self, with_soc=False, **kwargs):
        """
        Box plot (with the individual points overlaid) of the "normal"
        accuracy `ecut` hint per chemical symbol.

        One table row is built for every item of `self` that has a dojo
        report: a few attributes of the item, its latest deltafactor
        results, its latest GBRV results for "fcc" and "bcc", and the
        ecut / pawecutdg hints.  Items without a report are reported and
        skipped.  The table is printed in full, and the figure object
        obtained from get_ax_fig_plt() is returned.
        """
        # Build pandas dataframe with results.
        rows = []
        for p in self:
            if not p.has_dojo_report:
                cprint("Cannot find dojo_report in %s" % p.basename, "magenta")
                continue
            report = p.dojo_report
            row = {att: getattr(p, att) for att in ("basename", "symbol", "Z", "Z_val", "l_max")}

            # Get deltafactor data with/without SOC
            row.update(report.get_last_df_results(with_soc=with_soc))

            # Merge the GBRV results of *each* structure type; the update
            # used to sit after the loop, so only "bcc" was ever kept.
            for struct_type in ["fcc", "bcc"]:
                row.update(report.get_last_gbrv_results(struct_type, with_soc=with_soc))

            # Get the hints
            hint = p.hint_for_accuracy(accuracy="normal")
            row.update(dict(ecut=hint.ecut, pawecutdg=hint.pawecutdg))

            rows.append(row)

        import pandas as pd
        frame = pd.DataFrame(rows)

        def print_frame(x):
            """Print `x` without row or column truncation."""
            with pd.option_context('display.max_rows', len(x),
                                   'display.max_columns', len(list(x.keys()))):
                print(x)

        print_frame(frame)

        import seaborn as sns
        ax, fig, plt = get_ax_fig_plt(ax=None)

        # Box plot
        ax = sns.boxplot(x="symbol", y="ecut", data=frame, ax=ax,
                         whis=np.inf, color="c")
        # Add in points to show each observation
        sns.stripplot(x="symbol", y="ecut", data=frame, ax=ax,
                      jitter=True, size=5, color=".3", linewidth=0)

        sns.despine(left=True)
        # NOTE(review): the plotted quantity is `ecut`, yet the axis is
        # labelled as a relative error - confirm the intended label.
        ax.set_ylabel("Relative error %")
        ax.grid(True)

        return fig
Beispiel #23
0
def outcomeBoxplot(cyDf, cyVar, outcomeVar, printP=True, axh=None):
    """
    Box plot with jittered points of `cyVar`, split by the 0/1 column
    `outcomeVar`.

    The axes `axh` (default: the current axes) are cleared first.  With
    `printP`, the p-value of a rank-sum test between the two groups (rows
    with missing values dropped) is written at the top of the axes.  The
    figure is shown at the end.
    """
    if axh is None:
        axh = plt.gca()
    axh.cla()
    sns.boxplot(y=cyVar, x=outcomeVar, data=cyDf, ax=axh, order=[0,1])
    sns.stripplot(y=cyVar, x=outcomeVar, data=cyDf, jitter=True, ax=axh, order=[0,1])
    # Work on `axh` directly: the pyplot helpers act on the *current* axes,
    # which need not be the axes handed in by the caller.
    axh.set_xticks([0, 1])
    axh.set_xticklabels(['False', 'True'])
    if printP:
        tmp = cyDf[[cyVar, outcomeVar]].dropna()
        _, pvalue = stats.ranksums(tmp[cyVar].loc[tmp[outcomeVar] == 1],
                                   tmp[cyVar].loc[tmp[outcomeVar] == 0])
        annParams = dict(textcoords='offset points', xytext=(0,-5), ha='center', va='top', color='black', weight='bold', size='medium')
        axh.annotate('p = %1.3g' % pvalue, xy=(0.5, axh.get_ylim()[1]), **annParams)
    plt.show()
Beispiel #24
0
def plotCrossCompartmentBoxplot(cyDfA, cyDfB):
    """
    Box plot with jittered points of the cross-correlations between the
    (name-sorted) columns of `cyDfA` and `cyDfB`, grouped into pairs whose
    row and column label are equal ('Same') and all other pairs
    ('Different').  Draws into the cleared current figure with the y-axis
    fixed to (-1, 1).
    """
    rho, pvalue, qvalue = crosscorr(cyDfA[sorted(cyDfA.columns)],
                                    cyDfB[sorted(cyDfB.columns)])

    same, different = [], []
    for row_label, col_label in itertools.product(rho.index, rho.columns):
        bucket = same if row_label == col_label else different
        bucket.append(rho.loc[row_label, col_label])

    groups = ['Same'] * len(same) + ['Different'] * len(different)
    table = pd.DataFrame({'Group': groups, '$\\rho$': same + different})

    plt.clf()
    layer_args = dict(x='Group', y='$\\rho$', data=table)
    sns.boxplot(**layer_args)
    sns.stripplot(jitter=True, **layer_args)
    plt.xlabel('')
    plt.ylim((-1,1))
    plt.tight_layout()
def parseExcelManual(filename):
    """
    Interactively plot one column of an Excel sheet against another.

    The column names found in `filename` are printed and the user picks
    the x and the y column (re-prompted until a valid name is entered).
    Rows with missing values are dropped and the data is pivoted so that
    every distinct x value becomes one column of y values.  Optionally the
    values are normalised by their maximum and a boxplot is added beneath
    the strip plot.  The reshaped table is passed to exportExcel() before
    the plot is shown.
    """
    df = pd.read_excel(filename)
    print("We found the following columns:")
    print(df.columns.values)
    print("")

    def ask_for_column(prompt):
        # keep asking until the answer names an existing column
        choice = input(prompt)
        while choice not in df.columns.values:
            print("Invalid column name")
            choice = input("Please enter a valid column name: ")
        return choice

    xcol = ask_for_column("Which column would you like to occupy the x-axis?: ")
    ycol = ask_for_column("Which column would you like to occupy the y-axis?: ")

    title = ycol + " vs " + xcol

    # keep only the two chosen columns and the rows where both have a value
    reorg = pd.DataFrame(data={xcol: df[xcol], ycol: df[ycol]})
    reorg = reorg.dropna(axis = 0)

    # one column per x value, one row per original row
    reorg = reorg.pivot_table(ycol, xcol, reorg.index).T

    box = input("Do you want a boxplot? (y/n): ")
    normalize = input("Do you want to normalize the y-axis? (y/n): ")
    if normalize == 'y':
        # maximum over all values, with NaN counted as zero
        peak = np.amax(reorg.fillna(0).values)
        reorg = reorg / peak
        title = title + " Normalized"

    if box == "y":
        makeBoxplot(reorg)
    set1 = sns.color_palette("Set1", n_colors=8, desat=.9)
    sns.stripplot(data=reorg, size = 7, jitter = True, palette = set1)

    plt.ylabel(ycol)
    plt.xlabel(xcol)
    plt.title(title)
    plt.gca().set_ylim(bottom = 0)

    exportExcel(reorg)

    plt.show()
Beispiel #26
0
def Create_Plot(X,y):
    """
    Save a jittered strip plot of `y` against `X` as a PDF.

    Axis labels are the `.name` of each input with underscores replaced by
    spaces; the title is "<y> vs <x>" and the file is written under Plots/
    using the title without spaces.
    """
    xlab = X.name.replace("_", " ")
    ylab = y.name.replace("_", " ")
    figlab = ylab + " vs " + xlab
    filelab = "Plots/" + figlab.replace(" ", "") + ".pdf"
    f, ax = plt.subplots(figsize=(5, 5))
    sns.stripplot(x = X, y = y, jitter = True, size = 5, linewidth = 0.1, ax = ax)
    # Label through the Axes: the `sns.plt` alias is gone from seaborn.
    ax.set_title(figlab)
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    savefig(filelab)
    def BB_vs_Sidechain():
        """
        Plot point-mutant RMSD against binned backbone RMSD, once per DDG type.

        Works on `bb_vs_sidechain_df` and `output_pdf` from the enclosing
        scope and modifies the DataFrame in place: it adds a 'BB Group'
        column and one '<DDG type> Group' column for each of
        'Experimental DDG' and 'Predicted DDG'.  For each DDG type a figure
        with a box plot per BB group and a strip plot coloured by that DDG
        group is saved to `output_pdf`.
        """
        # Make bins for BB RMSDs
        number_of_bins = 5
        bin_size = len(bb_vs_sidechain_df['WT-Mutant Backbone RMSD']) / number_of_bins + 1

        # Assign arbitrary bin identifiers for BB Group
        # NOTE(review): the bin id is derived from the index label, so this
        # presumably expects an integer index 0..n-1 - TODO confirm.
        for index, row in bb_vs_sidechain_df.iterrows():
            bb_vs_sidechain_df.loc[index, 'BB Group'] = ((index + 1) // bin_size)
        # Find bin boundaries for BB group and add to dict
        # (label = RMSD of the first and of the last row of each group;
        # presumably the frame is sorted by backbone RMSD - TODO confirm)
        bin_rename_dict = {}
        for name, group in bb_vs_sidechain_df.groupby('BB Group'):
            bin_rename_dict[name] = '%s -\n%s' % (group['WT-Mutant Backbone RMSD'].iloc[0], group['WT-Mutant Backbone RMSD'].iloc[len(group) - 1])
        # Rename bin identifiers to bin boundary values in BB group
        for index, row in bb_vs_sidechain_df.iterrows():
            bb_vs_sidechain_df.loc[index, 'BB Group'] = bin_rename_dict[bb_vs_sidechain_df.loc[index, 'BB Group']]

        # Assign bin identifiers for DDG Group
        for DDG_type in ['Experimental DDG', 'Predicted DDG']:
            # Classify each row by the magnitude of its DDG value
            for index, row in bb_vs_sidechain_df.iterrows():
                if row[DDG_type] > 2.5 or row[DDG_type] < -2.5:
                    bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Extra Large DDG (DGG > 2.5 REU or DDG < -2.5 REU)'
                elif row[DDG_type] > 1 or row[DDG_type] < -1:
                    bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Large DDG (2.5 REU > DGG > 1 REU or -2.5 < DDG < -1 REU)'
                elif row[DDG_type] > 0.5 or row[DDG_type] < -0.5:
                    bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Medium DDG (1 REU > DGG > 0.5 REU or -1 < DDG < -0.5 REU)'
                else:
                    bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Small DDG (0.5 REU > DDG > -0.5 REU)'

            # One figure per DDG type: boxes per BB group, points coloured
            # by this DDG type's group
            sns.set_style('white', {'axes.grid': True, 'axes.edgecolor': '0'})
            sns.set_context('paper', font_scale=1.5, rc={'lines.linewidth': 1})

            fig, ax = plt.subplots(figsize=(20, 10))
            fig.suptitle('WT PDB - Mutant PDB Neighborhood Backbone RMSD vs. \nMutant PDB - RosettaOut Point Mutant Residues All-Atom RMSD', fontsize = 24, y=1.0)
            with sns.cubehelix_palette(number_of_bins, start=0.5, rot=-.75):
                sns.boxplot(x=bb_vs_sidechain_df['BB Group'],
                            y=bb_vs_sidechain_df['Point Mutant RMSD'],
                            ax=ax
                            )
            with sns.color_palette("husl", number_of_bins):
                sns.stripplot(x='BB Group',
                              y='Point Mutant RMSD',
                              hue= DDG_type + ' Group',
                              data=bb_vs_sidechain_df,
                              jitter=True,
                              ax=ax
                              )

            ax.set(xlabel='WT PDB - Mutant PDB Neighborhood Backbone RMSD', ylabel='Mutant PDB - RosettaOut Point Mutant Residues All-Atom RMSD')
            output_pdf.savefig(fig, pad_inches=1, bbox_inches='tight')
def pltvar(data, labels, stem):
    """
    Save a box plot with overlaid points of the 'deviation' column of
    `data`, grouped by the column named by the first label.

    `labels` is an (xlabel, ylabel) pair; both are also used, title-cased,
    in the axis captions.  The PNG is written to
    <source>/variance-<xlabel>-<stem>.png, where `source` is a path object
    defined outside this function.
    """
    (xlabel, ylabel) = labels

    # Plot the frame that was passed in; the previous code referenced an
    # undefined name `df` and ignored the `data` argument.
    kwargs = { 'x': xlabel, 'y': 'deviation', 'data': data }
    sns.boxplot(palette="PRGn", whis=np.inf, **kwargs)
    sns.stripplot(jitter=True, size=3, color='.3', linewidth=0, **kwargs)

    ax = plt.gca()
    ax.set_xlabel(xlabel.title() + ' window (minutes)')
    ax.set_ylabel(ylabel.title() + ' window std. dev. (jams/day)')

    fname = '-'.join([ 'variance', xlabel, stem ])
    dest = source.joinpath(fname).with_suffix('.png')
    plt.savefig(str(dest))
    plt.close()
def plot_similardishes(idx,xlim):
    """
    Plot the dishes most similar to dish `idx` by flavour cosine similarity.

    Takes the 20 rows of `yum_ingr2` ranked just below the single highest
    entry of yum_cos[idx] (presumably the dish itself - TODO confirm),
    ordered from most to least similar, and draws them as a horizontal
    strip plot of similarity per rank, coloured by cuisine and annotated
    with the recipe name.  `xlim` sets the x-axis limits.

    `yum_ingr2` and `yum_cos` are read from module scope.
    """
    # .copy() so the helper columns below go onto an independent frame
    match = yum_ingr2.iloc[yum_cos[idx].argsort()[-21:-1]][::-1].copy()
    # Index.values / .loc replace get_values() / .ix, which pandas 1.0 removed
    newidx = match.index.values
    match['cosine'] = yum_cos[idx][newidx]
    match['rank'] = range(1,1+len(newidx))

    fig = plt.figure(figsize=(10,10))
    ax = sns.stripplot(y='rank', x='cosine', data=match, jitter=0.05,
                       hue='cuisine',size=15,orient="h")
    ax.set_title(yum_ingr2.loc[idx,'recipeName']+'('+yum_ingr2.loc[idx,'cuisine']+')',fontsize=18)
    ax.set_xlabel('Flavor cosine similarity',fontsize=18)
    ax.set_ylabel('Rank',fontsize=18)
    ax.yaxis.grid(color='white')
    ax.xaxis.grid(color='white')

    # write each recipe name next to its point
    for label, y, x in zip(match['recipeName'], match['rank'], match['cosine']):
        ax.text(x+0.001, y-1, label, ha = 'left')
    ax.legend(loc = 'lower right',prop={'size':14})
    ax.set_ylim([20,-1])
    ax.set_xlim(xlim)
def log2_oulierfilter(df_by_cell, plot=False):
    """Drop columns whose mean log2 value over the top common genes is low.

    The input is transformed with ``log2(x + 1)`` and handed to
    ``find_top_common_genes``. Columns of that result whose mean exceeds the
    averaged cutoff (mean minus standard deviation) are kept.

    Parameters
    ----------
    df_by_cell : pandas.DataFrame
        Expression table; columns are the units being filtered (presumably
        cells, with genes as rows - TODO confirm).
    plot : bool, default False
        If true, show a box plot of the kept columns and a distribution plot
        of their means.

    Returns
    -------
    tuple of pandas.DataFrame
        ``log2(x + 1)`` of the kept columns, and its transpose. When no common
        genes are found, the unfiltered log2 table and its transpose.
    """
    log2_df = np.log2(df_by_cell+1)
    top_log2 = find_top_common_genes(log2_df)
    if top_log2.empty:
        print("no common genes found")
        return log2_df, log2_df.transpose()
    # The original also called pd.to_numeric() on the whole DataFrame here.
    # That raises TypeError (only 1-d input is accepted) and the result was
    # never used, so the call is removed.
    # Order columns by descending mean; reindex_axis() no longer exists in
    # pandas, reindex(columns=...) is the replacement.
    mean_order = top_log2.mean(axis=0).sort_values(ascending=False).index
    log2_sorted = top_log2.reindex(columns=mean_order)
    xticks = []
    keep_col= []
    log2_cutoff = np.average(log2_sorted)-np.std(log2_sorted)
    avg_cutoff = np.average(log2_cutoff)
    for col, m in zip(log2_sorted.columns.tolist(),log2_sorted.mean()):
        if m > avg_cutoff:
            keep_col.append(col)
            # Tick label shows the column name together with its mean.
            xticks.append(col+' '+str("%.2f" % m))
    filtered_df_by_cell = df_by_cell[keep_col]
    # Zeros are masked to NaN before the log so the plots only use positive values.
    filtered_log2 = np.log2(filtered_df_by_cell[filtered_df_by_cell>0])
    if plot:
        ax = sns.boxplot(data=filtered_log2, whis= .75, notch=True)
        ax = sns.stripplot(x=filtered_log2.columns.values, y=filtered_log2.mean(axis=0), size=4, jitter=True, edgecolor="gray")
        xtickNames = plt.setp(ax, xticklabels=xticks)
        plt.setp(xtickNames, rotation=90, fontsize=9)
        plt.show()
        plt.clf()
        sns.distplot(filtered_log2.mean())
        plt.show()
    log2_expdf_cell = np.log2(filtered_df_by_cell+1)
    log2_expdf_gene = log2_expdf_cell.transpose()
    return log2_expdf_cell, log2_expdf_gene
def plot_results(
    df,
    reg_weight_col,
    out_dir,
    dataset,
):
    """Plot every metric in DIS_METRICS against *reg_weight_col* and save the figures.

    Three 3x4 figures are produced: violin plots, box plots, and the
    per-group mean of each metric. They are written to *out_dir* as
    ``<dataset>_violin.png``, ``<dataset>_box.png`` and ``<dataset>_mean.png``.
    For each metric the per-model means are also printed, highest first.
    """
    grid_spec = dict(nrows=3, ncols=4, figsize=(30, 30))
    fig_violin, axes_violin = plt.subplots(**grid_spec)
    fig_box, axes_box = plt.subplots(**grid_spec)
    fig_mean, axes_mean = plt.subplots(**grid_spec)

    panels = zip(
        DIS_METRICS,
        axes_violin.flatten(),
        axes_box.flatten(),
        axes_mean.flatten(),
    )
    for metric, ax_violin, ax_box, ax_mean in panels:
        metric_df, metric = get_metric_df(df, metric)

        print()
        model_means = metric_df.groupby(MODEL_COL_STR)[metric].mean().reset_index()
        print(model_means.sort_values(metric, ascending=False))

        ordered_df = metric_df.sort_values(reg_weight_col)

        sns.violinplot(data=ordered_df, x=reg_weight_col, y=metric, cut=0, ax=ax_violin)
        plt.setp(ax_violin.get_xticklabels(), rotation=45)

        sns.boxplot(data=ordered_df, x=reg_weight_col, y=metric, ax=ax_box)
        plt.setp(ax_box.get_xticklabels(), rotation=45)

        # Collapse to one mean value per regularisation weight.
        weight_means = ordered_df.groupby(reg_weight_col)[metric].mean()
        weight_labels = ["{:.2E}".format(weight) for weight in weight_means.index.values]
        sns.stripplot(x=weight_labels, y=weight_means.values, ax=ax_mean, size=25)
        ax_mean.set_ylabel(metric)
        plt.setp(ax_mean.get_xticklabels(), rotation=45)

    figures = (fig_violin, fig_box, fig_mean)
    for figure, kind in zip(figures, ('violin', 'box', 'mean')):
        figure.savefig(out_dir / f'{dataset}_{kind}.png')

    for figure in figures:
        plt.close(figure)
Beispiel #32
0
sns.palplot(sns.cubehelix_palette(n_colors=8, start=1.7, rot=0.2, dark=0, light=.95, reverse=True))


# *start* is always between 0 and 3. *rot*, an abbreviation for rotation, is kept between -1 and 1. *reverse* reverses the color ordering, and *hue* controls the saturation of the colors.

# ## Generic Seaborn Plots:

# In[36]:


# Loading up built-in dataset:
tips = sns.load_dataset("tips")

# Creating Strip plot for day-wise revenue:
sns.stripplot(x="day", y="total_bill", data=tips, color="g")


# This does the job for us but let us try to get better results by plotting each day in different color instead of same color. For this, we shall replace `color` parameter with `palette` parameter:

# In[40]:


# Set Theme:
sns.set_style('whitegrid')

# Creating Swarm plot for day-wise revenue:
sns.swarmplot(x="day", y="total_bill", data=tips, palette="viridis")


# In[ ]:
            try:
                if len(item_box.find_elements_by_css_selector(".item-sold-out-badge")) > 0:
                    sold = "SOLD"
                else:
                    sold = "NOT SOLD"
                sub_title = item_box.find_element_by_class_name("items-box-body")
                title = sub_title.find_element_by_tag_name("h3").text
                item_price = item_box.find_element_by_css_selector(".items-box-price")
                price_text = item_price.text
                price_text = re.sub(r",", "", price_text).lstrip("¥ ")
                price_text_int = int(price_text)
                print(price_text_int)
                url = item_box.find_element_by_tag_name("a").get_attribute("href")
                data  = pd.Series( [ sold,title,price_text_int,url ], index=df_main.columns )
                grdata = pd.Series( [ sold,price_text_int ], index=df_graf.columns )
                df_main = df_main.append( data, ignore_index=True )
                df_graf = df_graf.append( grdata, ignore_index=True )
            except Exception as e:
                print(e)
    else:
        print('No items anymore...')
        break

# Dump the full scraped table to stdout.
print(df_main)
# Price distribution for each value of the SOLD column.
sns.stripplot(x='SOLD', y='PRICE', data=df_graf)
plt.show()
sns.pairplot(df_graf,hue="SOLD")
plt.show()
print('Writing out to CSV file...')
# utf_8_sig prepends a BOM - presumably so spreadsheet tools detect the
# encoding of the scraped titles; TODO confirm.
df_main.to_csv("pricedata.csv", encoding="utf_8_sig")
print("Done")
# Reference line y = x, drawn from the original voxel counts on both axes.
plt.plot(np.array(all_struct_voxels), np.array(all_struct_voxels), color = "gray", linestyle = "dashdot", linewidth = 1) # identity line
plt.ylabel("Voxels in 80um eroded volume")
plt.xlabel("Voxels in original volume")
# Restrict the view to at most 250000 original / 150000 eroded voxels.
plt.xlim([0,250000]);plt.ylim([0, 150000])
plt.savefig(os.path.join(fig_dst, "voxels_scatter_org_vs_eroded_250000_voxels.pdf"), bbox_inches = "tight")

#%%

# Voxel counts of the missing structures in ascending order, with the names
# reordered by the same permutation.
missing_struct_voxels_sort = np.sort(np.array(missing_struct_voxels))
missing_struct_names_sort = np.array(missing_struct_names)[np.argsort(np.array(missing_struct_voxels))]

# Long-format table: one row per structure, tagged by the list it came from.
# NOTE(review): `+` below is assumed to concatenate two Python lists - confirm.
df = pd.DataFrame()
df["num_voxels"] = missing_struct_voxels+all_struct_voxels
df["type"] = ["eroded"]*len(missing_struct_voxels) + ["original"]*len(all_struct_voxels)

# Individual points with an unfilled box (no fliers, no caps) drawn on top.
sns.stripplot(x = "num_voxels", y = "type", data = df,  color = "crimson", orient = "h")
sns.boxplot(x = "num_voxels", y = "type", data = df, orient = "h", showfliers=False, showcaps=False, 
            boxprops={'facecolor':'None'})
plt.xlim([0, 200000])
plt.xlabel("Total number of voxels in structure")
plt.ylabel("Structures 'zero'ed' out vs. all original structures")
plt.savefig(os.path.join(fig_dst, "boxplot_total_voxels_org_vs_eroded.pdf"), bbox_inches = "tight")

#%%
#export missing structures name, id, and total voxel count

# One row per missing structure; the four source lists are assumed to be
# aligned element by element - TODO confirm.
dataf = pd.DataFrame()
dataf["name"] = missing_struct_names
dataf["id"] = missing_struct_ids
dataf["parent_name"] = missing_struct_parents
dataf["voxels_in_structure"] = missing_struct_voxels
Beispiel #35
0
        return []
    return insert_packet(spreading_factor - 1) + [spreading_factor] + insert_packet(spreading_factor - 1)

# Spreading-factor sequence produced by insert_packet(12), stored as an
# ordered categorical over SF7..SF12.
sf_as_category = pd.Categorical(insert_packet(12), categories=[7, 8, 9, 10, 11, 12], ordered=True)
pyramid = pd.DataFrame({'SF': sf_as_category})
pyramid['seq_num'] = pyramid.index
cmap = sns.color_palette('Blues_d', 6)
fig, ax = plt.subplots(figsize=(4, 3))
plot = sns.scatterplot(x=pyramid.index, y='SF', data=pyramid, hue='SF', legend=False, palette=cmap, ax=ax)
plot.set_title('Spreading factor sequence')
plot.set_ylabel('spreading factor')
plot.set_xlabel('sequence number')
fig.savefig("sf-sequence.svg")

# Packets received by one gateway, indexed by reception time in local time.
lora_mons_static = pd.read_pickle('data/lora_mons_static_clean.pkl.gz')
channel = lora_mons_static.query('gtw_id == "eui-0000024b08030186"')[['received', 'dev_id', 'rssi', 'snr', 'data_rate']].set_index('received').sort_index()
channel.index = channel.index.tz_convert('Europe/Brussels')
# The data rate string carries the spreading factor as "SF<number>BW...".
channel['spreading_factor'] = channel['data_rate'].str.extract('SF([0-9]+)BW').astype(dtype=np.int64)

# Every plot below gets its own figure. Without an explicit `ax`, seaborn
# draws on the current axes, so when run as a script each saved image would
# also contain all the plots drawn before it.
fig, ax = plt.subplots()
sns.stripplot(x='spreading_factor', y='rssi', data=channel, alpha=0.3, ax=ax)
ax.set(ylabel='RSSI (dBm)', xlabel='Spreading Factor', title='Distribution of received packets RSSI')
fig.savefig('rssi_sf.png')

fig, ax = plt.subplots()
sns.stripplot(x='spreading_factor', y='snr', data=channel, alpha=0.3, ax=ax)
ax.set(ylabel='SNR (dB)', xlabel='Spreading Factor', title='Distribution of received packets SNR')
fig.savefig('snr_sf.png')

fig, ax = plt.subplots()
sns.scatterplot(x='snr', y='rssi', data=channel, alpha=0.3, ax=ax)
ax.set(xlabel='SNR (dB)', ylabel='RSSI (dBm)')
fig.savefig('rssi_snr.png')
Beispiel #36
0
    fig.suptitle(f"{name} ({n_verts})", fontsize=40, y=1.04)
    plt.tight_layout()
    stashfig(f"{g}-gridplot-sf-sorted")
    print()

#%%

shuffle_df = pd.DataFrame(shuffled_triu_outs)
true_df = pd.DataFrame(true_triu_outs)
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
ax = sns.stripplot(
    data=shuffle_df,
    x="Graph",
    y="Proportion",
    linewidth=1,
    alpha=0.4,
    jitter=0.3,
    size=5,
    ax=ax,
)
# ax = sns.violinplot(data=shuffle_df, x="Graph", y="Proportion", ax=ax)

ax = sns.stripplot(
    data=true_df,
    x="Graph",
    y="Proportion",
    marker="_",
    linewidth=2,
    s=90,
    ax=ax,
    label="True",
Beispiel #37
0
    pickle.dump(model,f)
# Store column 1 of predict_proba as the score of every data split
# (presumably the positive-class probability - TODO confirm class order).
training['prob']=model.predict_proba(training[features])[:,1]
testing['prob']=model.predict_proba(testing[features])[:,1]
oot_data['prob']=model.predict_proba(oot_data[features])[:,1]
# The bins returned for the training split are passed on to the other two splits.
build_cut,bins = model_method.ks_lift_chart(training['y'],training['prob'],'train')
test_cut=model_method.ks_lift_chart(testing['y'],testing['prob'],'testing',bins=bins)
oot_cut = model_method.ks_lift_chart(oot_data['y'],oot_data['prob'],'oot',bins=bins)

month_ks=model_method.month_ks(testing,'app_date')

feature_results = var_cut.get_feature_result(training[features+['y']],'y')

# The per-variable charts below use the first 10 entries of the importance
# table's index (presumably the most important features - TODO confirm ordering).
importance_df = model_method.get_xgboost_importances(model,return_df=True)
data_var = model_method.var_avg_plot([training,testing,oot_data],importance_df.index.tolist()[:10],q=10)
model_method.var_lift_plot(training['y'],training['xx2337'],'xx2337')

model_method.var_cut_plot([training,testing],importance_df.index.tolist()[:10],q=10)

var_psi_ie = model_method.var_psi_chart(training,testing,importance_df.index.tolist()[:10],'app_date')

#model_method.get_plot_tree(model)
#PSI(testing[testing['y']==1]['prob'],training[training['y']==1]['prob'])
#for i in importance_df.index.tolist()[:10]:
#    print(PSI(testing[i],training[i]))

import seaborn as sns
sns.set_style('whitegrid')
# Values of one feature per application date, with the two targets side by side (dodge).
sns.stripplot(x='app_date',y='xx2392',hue='y',data=training,jitter = True,dodge=True)


Beispiel #38
0
def taxa_abundance_box_plot(
    taxa, metadata=None, hue=None, hue_order=None,
    add_datapoints=False, level=1, by=None, ax=None,
    figsize=None, count=0, exclude_samples=None,
    include_samples=None, exclude_taxa=None, sort_by_names=False,
    sample_names=None, csv_file=None, size=5, pseudocount=False,
    taxa_names=None, brief_xlabels=False, show_means=False,
    meanprops=None, show_others=True, sort_by_mean=True,
    jitter=1, alpha=None, artist_kwargs=None
):
    """Create a taxa abundance box plot.

    +----------------+-----------------------------------------------------+
    | q2-taxa plugin | Example                                             |
    +================+=====================================================+
    | QIIME 2 CLI    | qiime taxa barplot [OPTIONS]                        |
    +----------------+-----------------------------------------------------+
    | QIIME 2 API    | from qiime2.plugins.taxa.visualizers import barplot |
    +----------------+-----------------------------------------------------+

    Parameters
    ----------
    taxa : str or qiime2.Visualization
        Visualization file or object from the q2-taxa plugin.
    metadata : str or qiime2.Metadata, optional
        Metadata file or object.
    hue : str, optional
        Grouping variable that will produce boxes with different colors.
    hue_order : list, optional
        Specify the order of categorical levels of the 'hue' semantic.
    add_datapoints : bool, default: False
        Show datapoints on top of the boxes.
    level : int, default: 1
        Taxonomic level at which the features should be collapsed.
    by : list, optional
        Column name(s) to be used for sorting the samples. Using 'sample-id'
        will sort the samples by their name, in addition to other column
        name(s) that may have been provided. If multiple items are provided,
        sorting will occur by the order of the items.
    ax : matplotlib.axes.Axes, optional
        Axes object to draw the plot onto. If not given, a new figure and
        Axes are created with ``figsize``.
    figsize : tuple, optional
        Width, height in inches. Format: (float, float).
    count : int, default: 0
        The number of taxa to display. When 0, display all.
    exclude_samples : dict, optional
        Filtering logic used for sample exclusion.
        Format: {'col': ['item', ...], ...}.
    include_samples : dict, optional
        Filtering logic used for sample inclusion.
        Format: {'col': ['item', ...], ...}.
    exclude_taxa : list, optional
        The taxa names to be excluded when matched. Case insensitive;
        a column is dropped when its name contains the given text.
    sort_by_names : bool, default: False
        If true, sort the columns (i.e. species) to be displayed by name.
    sample_names : list, optional
        List of sample IDs to be included.
    csv_file : str, optional
        Path of the .csv file to output the dataframe to.
    size : float, default: 5.0
        Radius of the markers, in points.
    pseudocount : bool, default: False
        Add pseudocount to remove zeros.
    taxa_names : list, optional
        List of taxa names to be displayed.
    brief_xlabels : bool, default: False
        If true, only display the smallest taxa rank in the x-axis labels.
    show_means : bool, default: False
        Add means to the boxes.
    meanprops : dict, optional
        The meanprops argument as in matplotlib.pyplot.boxplot.
    show_others : bool, default: True
        Include the 'Others' category.
    sort_by_mean : bool, default: True
        Sort taxa by their mean relative abundance after sample filtration.
    jitter : float, default: 1
        Amount of jitter (only along the categorical axis) to apply.
    alpha : float, optional
        Proportional opacity of the points.
    artist_kwargs : dict, optional
        Keyword arguments passed down to the _artist() method. They override
        the defaults set by this function ('xrot', 'xha', 'xlabel',
        'ylabel', 'xticklabels' and 'remove_duplicates').

    Returns
    -------
    matplotlib.axes.Axes
        Axes object with the plot drawn onto it.

    See Also
    --------
    taxa_abundance_bar_plot
    addpairs

    Examples
    --------
    Below is a simple example showing taxonomic abundance at the phylum
    level (i.e. ``level=2``).

    >>> qzv_file = '/Users/sbslee/Desktop/dokdo/data/moving-pictures-tutorial/taxa-bar-plots.qzv'
    >>> dokdo.taxa_abundance_box_plot(qzv_file, level=2, figsize=(8, 7))
    >>> plt.tight_layout()

    .. image:: images/taxa_abundance_box_plot-1.png

    We can control how many taxa to display with ``count``. Also, we can
    make the x-axis tick labels pretty with ``brief_xlabels``. We can
    manually set the x-axis tick labels by passing ``xticklabels`` through
    ``artist_kwargs``. Lastly, we can select specific taxa to display with
    ``taxa_names``.

    >>> fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10))
    >>> kwargs = {'level' : 2}
    >>> artist_kwargs1 = dict(title='count=4')
    >>> artist_kwargs2 = dict(title='brief_xlabels=True')
    >>> artist_kwargs3 = dict(xticklabels=['A', 'B', 'C', 'D'], title="xticklabels=['A', 'B', 'C', 'D']")
    >>> artist_kwargs4 = dict(title="taxa_names=[...]")
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax1, count=4, artist_kwargs=artist_kwargs1, **kwargs)
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax2, count=4, brief_xlabels=True, artist_kwargs=artist_kwargs2, **kwargs)
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax3, count=4, artist_kwargs=artist_kwargs3, **kwargs)
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax4, taxa_names=['k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Proteobacteria'], artist_kwargs=artist_kwargs4, **kwargs)
    >>> plt.tight_layout()

    .. image:: images/taxa_abundance_box_plot-2.png

    We can group the boxes by a metadata column with ``hue``. For this
    plot, we will draw the y-axis in log scale by passing ``ylog`` through
    ``artist_kwargs``. To do this, we actually need to adjust the y-axis
    limits with ``ymin`` and ``ymax`` (also ``artist_kwargs`` keys), and
    also add a pseudocount of 1 to remove 0s with ``pseudocount`` (because
    0s cannot be shown in log scale). We will also add data points with
    ``add_datapoints=True``.

    >>> artist_kwargs = dict(ylog=True, ymin=0.05, ymax=200, show_legend=True)
    >>> dokdo.taxa_abundance_box_plot(qzv_file,
    ...                               level=2,
    ...                               figsize=(10, 7),
    ...                               hue='body-site',
    ...                               size=3,
    ...                               count=4,
    ...                               pseudocount=True,
    ...                               add_datapoints=True,
    ...                               artist_kwargs=artist_kwargs)
    >>> plt.tight_layout()

    .. image:: images/taxa_abundance_box_plot-3.png
    """
    # Read the table for the requested level from a temporary directory
    # (presumably _parse_input extracts the visualization there - TODO confirm).
    with tempfile.TemporaryDirectory() as t:
        _parse_input(taxa, t)
        df = pd.read_csv(f'{t}/level-{level}.csv', index_col=0)

    # If provided, update the metadata.
    if metadata is None:
        pass
    else:
        mf = dokdo.get_mf(metadata)
        cols = _get_mf_cols(df)
        # Replace the metadata columns stored in the table with the supplied
        # ones; the inner join keeps only samples present in both.
        df.drop(columns=cols, inplace=True)
        df = pd.concat([df, mf], axis=1, join='inner')

    # Expose the index as a column so that 'sample-id' can be used in `by`.
    df["sample-id"] = df.index

    # If provided, sort the samples for display in the x-axis.
    if by:
        df = df.sort_values(by=by)

    # If provided, exclude the specified taxa.
    if isinstance(exclude_taxa, list):
        dropped = []
        for tax in exclude_taxa:
            for col in df.columns:
                # Case-insensitive substring match against the column name.
                if tax.lower() in col.lower():
                    dropped.append(col)
        # A column may match several patterns; drop each one only once.
        dropped = list(set(dropped))
        df = df.drop(columns=dropped)

    # Remove the metadata columns.
    cols = _get_mf_cols(df)
    mf = df[cols]
    df = df.drop(columns=cols)

    df, mf = _filter_samples(df, mf, exclude_samples, include_samples)

    # If provided, only include the specified samples.
    if isinstance(sample_names, list):
        df = df.loc[sample_names]
        mf = mf.loc[sample_names]

    if sort_by_mean:
        df = _sort_by_mean(df)

    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)

    # Add a pseudocount.
    if pseudocount:
        df = df + 1

    # Convert counts to proportions.
    df = df.div(df.sum(axis=1), axis=0)

    df = _get_others_col(df, count, taxa_names, show_others)

    if sort_by_names:
        df = df.reindex(sorted(df.columns), axis=1)

    _taxa_names = df.columns

    # Proportions become percentages, matching the default y-axis label.
    df = df * 100

    # Reshape to long format: one row per (sample, taxon) with the columns
    # 'variable' (taxon) and 'value' (percentage), plus `hue` when given.
    if hue is not None:
        df2 = pd.concat([df, mf[hue]], axis=1, join='inner')
        df2 = pd.melt(df2, id_vars=[hue])
    else:
        df2 = pd.melt(df)



    # Fall back to a white 'x' marker when no mean properties are supplied.
    if meanprops:
        _meanprops = meanprops
    else:
        _meanprops={'marker':'x',
                    'markerfacecolor':'white',
                    'markeredgecolor':'white',
                    'markersize':'10'}

    # Extra keyword arguments for sns.boxplot, only set when means are requested.
    d = {}

    if show_means:
        d['showmeans'] = True
        d['meanprops'] = _meanprops

    sns.boxplot(x='variable',
                y='value',
                hue=hue,
                hue_order=hue_order,
                data=df2,
                ax=ax,
                **d)

    if add_datapoints:
        # The strip plot is drawn with the same hue as the boxes;
        # 'remove_duplicates' is forwarded to _artist() below (presumably to
        # drop the resulting duplicate legend entries - TODO confirm).
        remove_duplicates = True
        # Alternative method: sns.swarmplot()
        sns.stripplot(x='variable',
                      y='value',
                      hue=hue,
                      hue_order=hue_order,
                      data=df2,
                      ax=ax,
                      color='black',
                      size=size,
                      dodge=True,
                      jitter=jitter,
                      alpha=alpha)
    else:
        remove_duplicates = False

    # If provided, output the dataframe as a .csv file.
    if csv_file is not None:
        df3 = pd.concat([df, mf], axis=1, join='inner')
        df3.to_csv(csv_file)

    if brief_xlabels:
        xticklabels = [dokdo.pname(x.get_text()) for x in ax.get_xticklabels()]
    else:
        xticklabels = None

    if artist_kwargs is None:
        artist_kwargs = {}

    # Defaults first, then the caller's entries, so the caller's values win.
    artist_kwargs = {'xrot': 45,
                     'xha': 'right',
                     'xlabel': '',
                     'ylabel': 'Relative abundance (%)',
                     'xticklabels': xticklabels,
                     'remove_duplicates': remove_duplicates,
                     **artist_kwargs}

    # The legend title is always the hue column name, even if the caller set one.
    if hue is not None:
        artist_kwargs['legend_title'] = hue

    ax = _artist(ax, **artist_kwargs)

    return ax
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (20, 10)
plt.style.use('fivethirtyeight')
# x and y are passed by keyword: seaborn 0.12 and later no longer accept the
# data vectors as positional arguments.
sns.boxplot(x=df_hair_dryer['star_rating'], y=df_hair_dryer['length'], palette = 'Blues')
plt.title("Relations between Review Length and Star Rating", fontsize = 50)
plt.show()

# Stripplot
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (20, 10)
plt.style.use('fivethirtyeight')
plt.xlabel('star_rating', fontsize = 50)
plt.ylabel('review_length', fontsize = 50)
plt.xticks(fontsize=40)
plt.yticks(fontsize=40)
sns.stripplot(x=df_hair_dryer['star_rating'], y=df_hair_dryer['length'], palette = 'Reds')
plt.title("Relations between Review Length and Star Rating", fontsize = 60)
plt.show()

'''
---------------------------------2----------------------------------
'''

'''
------------------------------Part a--------------------------------
'''

# Cleaning the reviews
import re
import nltk
nltk.download('stopwords')
# p-value per (link, run): one minus the fractional position of the first
# surrogate that exceeds the measured TE, or 0 when none exceeds it.
# NOTE(review): using the first exceeding index as a rank assumes the
# surrogates are sorted in ascending order along the last axis - confirm.
for l in range(len(links)):
   for r in range(num_runs):
      if 1 in (surrogates[l, r, :] > TE[l, r]):
         p_vals[l, r] = 1-(np.argmax(surrogates[l, r, :] > TE[l, r])/num_surrogates)
      else:
         p_vals[l, r] = 0

print(p_vals)
# Drop the run at index 1 (second column) for every link.
p_vals = np.delete(p_vals, obj = 1, axis = 1)
print(p_vals)

fig, axs = plt.subplots(figsize = (6, 6))
#sns.boxplot(data = np.transpose(p_vals[:, :]), palette = "Set3", linewidth = 2, width = 0.5, fliersize = 4)
# Transposed so that each link becomes one column, i.e. one box.
# fliersize = 0 hides the outlier markers; the strip plot shows every point.
sns.boxplot(data = np.transpose(p_vals[:, :]), palette = "colorblind",
             linewidth = 4, width = 0.5, fliersize = 0)
sns.stripplot(data = np.transpose(p_vals[:, :]), palette = "colorblind",
             linewidth = 3, size = 10)
# Dashed reference line at p = 0.05.
plt.hlines(0.05, -0.5, 5.5, color = "black", linewidth = 2, linestyle='--')
# NOTE(review): six tick positions are hard-coded, and the labels come from
# LINKS while the loop above iterates over `links` - confirm both match.
plt.xticks([0, 1, 2, 3, 4, 5], LINKS)

#plt.xlabel("connection")
plt.ylabel("p value")
plt.ylim([-0.1, 1.19])

#for i in [0, 1, 2, 3, 5]:
# A green check mark is drawn above each of the six columns.
for i in range(6):
   plt.scatter(i, 1.1, s=1000, c='green', marker='$✓$')
#for i in [4]:
#   plt.scatter(i, 1.1, s=1000, c='red', marker='$×$')


plt.tight_layout()
 def df_function(collection_df, attribute, ax):
     """Draw SCORE against *attribute* on *ax*, coloured by the enclosing ``split``.

     The categories on the x-axis are the sorted unique values of *attribute*.
     """
     category_order = sorted(collection_df[attribute].unique())
     sns.stripplot(data=collection_df, x=attribute, y=SCORE, hue=split,
                   order=category_order,
                   jitter=1, dodge=True, alpha=0.5, ax=ax)
 def df_function(collection_df, ax):
     """Draw SCORE against the enclosing ``attribute`` on *ax*, coloured by ``split``.

     Both the x categories and, when ``split`` is set, the hue levels are
     shown in sorted order.
     """
     plot_options = {}
     if split:
         # Only fix the hue order when there is a hue column to sort.
         plot_options['hue_order'] = sorted(collection_df[split].unique())
     category_order = sorted(collection_df[attribute].unique())
     sns.stripplot(data=collection_df, x=attribute, y=SCORE, hue=split,
                   order=category_order,
                   jitter=1, dodge=True, alpha=0.5, ax=ax,
                   **plot_options)
Beispiel #43
0
 ax = axs[row, 0]
 # sns.violinplot(
 #     data=neuron_df[neuron_df["neuron_type"].isin(row_neuron_types)],
 #     x="neuron_type",
 #     y=f"component_score_{i}",
 #     hue="neuron_type",
 #     palette=neuron_type_palette,
 #     ax=ax,
 #     inner=None,
 # )
 sns.stripplot(
     data=neuron_df[neuron_df["neuron_type"].isin(row_neuron_types)],
     x="neuron_type",
     y=f"component_score_{i}",
     hue="neuron_type",
     hue_order=row_neuron_types,  # ensures sorting stays the same
     order=row_neuron_types,  # ensures sorting stays the same
     palette=neuron_type_palette,
     ax=ax,
     s=2,
 )
 ax.get_legend().remove()
 ax.set(xlim=(-1, n_per_row),
        ylim=(y_min, y_max),
        xlabel="",
        ylabel="",
        yticks=[])
 ax.axhline(0, color="black", linestyle=":", linewidth=1)
 ax.tick_params(length=0)
 plt.setp(ax.get_xticklabels(), rotation=45)
 for tick in ax.get_xticklabels():
def visulaization(cv_df):
    """Show the accuracy of each model as a box plot overlaid with the individual points.

    *cv_df* must provide the columns 'model_name' and 'accuracy'.
    """
    plt.subplots(figsize=(30,30))
    shared_args = dict(x='model_name', y='accuracy', data=cv_df)
    sns.boxplot(**shared_args)
    sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2,
                  **shared_args)
    plt.show()
Beispiel #45
0
# Drop the first column of density and the first row of the counts.
density = density[:, 1:]
counts_per_struct = counts_per_struct[1:, :]
#%%

#boxplots for counts
import seaborn as sns

# Order structures by median count, largest first (argsort is ascending and
# [::-1] reverses it), then keep the first 10 names.
order = np.argsort(np.median(counts_per_struct.T, axis=0))[::-1]
sois_sort = np.array(nuclei)[order][:10]

#boxplots of percent counts
# NOTE(review): the data plotted is `pcounts` while the x label below reads
# "# Neurons" - confirm which quantity is intended.
plt.figure(figsize=(5, 4))
df = pd.DataFrame(pcounts)
df.columns = nuclei
# Individual points, with an unfilled box (no fliers, no caps) on top.
g = sns.stripplot(data=df, color="dimgrey", orient="h", order=sois_sort)
sns.boxplot(data=df,
            orient="h",
            showfliers=False,
            showcaps=False,
            boxprops={'facecolor': 'None'},
            order=sois_sort)
plt.xlabel("# Neurons")
plt.ylabel("Subnucleus")
plt.savefig(os.path.join(fig_dst, "thal_counts_boxplots.pdf"),
            bbox_inches="tight")

#%%

#boxplots of density
#first, rearrange structures in ASCENDING order (will be plotted as descending, -_-) by density and counts
#explore the number of distinct values of the categorical variable Genre
# (the original line was missing its closing parenthesis: a SyntaxError)
len(mov.Genre.unique())

#filter the dataframe by genre
genres = ['action', 'adventure', 'animation', 'comedy', 'drama']
mov2 = mov[mov.Genre.isin(genres)]

#filter the mov2 dataframe by studio
studios = ['Buena Vista Studios', 'Fox', 'Paramount Pictures', 'Sony', 'Universal', 'WB']
mov3 = mov2[mov2.Studio.isin(studios)]

#check how the filters worked
print (mov3.Genre.unique())
print (mov3.Studio.unique())
print (len(mov3))

#define the style
sns.set(style="darkgrid", palette="muted", color_codes=True)

#plot the boxplots
ax = sns.boxplot(data=mov3, x='Genre', y='Gross % US', orient='v', color='lightgray', showfliers=False)
plt.setp(ax.artists, alpha=0.5)

#add in points to show each observation
sns.stripplot(x='Genre', y='Gross % US', data=mov3, jitter=True, size=6, linewidth=0, hue = 'Studio', alpha=0.7)

ax.axes.set_title('Domestic Gross % by Genre',fontsize=30)
ax.set_xlabel('Genre',fontsize=20)
ax.set_ylabel('Gross % US',fontsize=20)

#define where to place the legend
ax.legend(bbox_to_anchor=(1.05, 1), loc=2)
})
ObjectiveC = pd.DataFrame({
    'Linguagem de Programação':
    np.repeat('Objective-C', 40),
    'Quantidade de Palavras':
    (322, 443, 446, 462, 710, 219, 446, 463, 461, 461, 764, 1059, 37, 446, 446,
     37, 446, 37, 39, 866, 462, 446, 37, 446, 666, 462, 461, 446, 461, 39, 462,
     443, 37, 443, 8, 446, 446, 461, 324, 461)
})

# Stack the five per-language tables. DataFrame.append() was removed in
# pandas 2.0; pd.concat() gives the same result, including the repeated
# per-table row index.
df = pd.concat([MATLAB, Julia, Clojure, Perl, ObjectiveC])

# boxplot
ax = sns.boxplot(x='Linguagem de Programação',
                 y='Quantidade de Palavras',
                 data=df)
# add stripplot
ax = sns.stripplot(x='Linguagem de Programação',
                   y='Quantidade de Palavras',
                   data=df,
                   color="orange",
                   jitter=0.2,
                   size=2.5)

# add title
plt.title(
    "Boxplot da contagem de palavras das 5 linguagens de programação com menos códigos de conduta",
    loc="left")

# show the graph
plt.show()
Beispiel #48
0
from pydataset import data
import seaborn as sns

# NOTE(review): `np` is used below but is not imported in this fragment -
# confirm it is imported earlier in the file.
df = data('mtcars')
df

#%%quantiles
# 11 evenly spaced probabilities from 0 to 1, i.e. the deciles.
intervals = np.linspace(0, 1, 11)
intervals
df.mpg.sort_values()
np.sort(df.mpg)[16]  # element at position 16 of the ascending mpg values
df.quantile(q=0.5, axis=0)  #columns
df.quantile(q=intervals, axis=0)  #columns
df.boxplot()
df.boxplot(column=['mpg'])
ax = sns.stripplot(x="gear", y="mpg", data=df)

#quantiles
# Third and first quartile of hp; their difference is the interquartile range.
q3, q1 = np.percentile(df['hp'], [75, 25])
q3, q1
q3 - q1

# Same quantity computed with scipy.
from scipy import stats
IQR = stats.iqr(df['hp'])
IQR


# Helper computing the interquartile range of a sample
def find_iqr(x):
    """Return the interquartile range of *x* (75th minus 25th percentile)."""
    upper_quartile, lower_quartile = np.percentile(x, [75, 25])
    return upper_quartile - lower_quartile
Beispiel #49
0
    412, 413, 414, 415, 417, 418, 419, 421, 422, 423, 425, 426, 426, 427, 427,
    429, 430, 431, 432, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442,
    444, 445, 446, 447, 448, 449, 450, 453, 454, 455, 457, 458, 459, 460, 460,
    461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 473, 474, 475, 476,
    477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491,
    492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
    507, 508, 509, 510, 511, 512, 513, 514, 540
]

df = pd.DataFrame(pd_data)

# Build the lookup set once so each membership test is a hash lookup instead
# of a scan over the whole list of loci.
targeted_loci = set(our_loci)
# The locus number is what remains after dropping 12 characters from each end
# of the locus string.
df["targeted"] = [int(locus[12:-12]) in targeted_loci for locus in df["locus"]]

# df["evalue"].plot.kde()
# plt.xlim(0.1,0)
# plt.savefig("./tblastx_evals.png")
# plt.clf()

#evalue distribution
sns.stripplot(data=df, x="targeted", y="evalue", alpha=0.5)
# Log scale; non-positive e-values are clipped rather than raising an error.
plt.yscale('log', nonpositive='clip')
plt.tight_layout()
plt.savefig("./tblastx_evals.png")
plt.clf()

#bitscore distribution
sns.stripplot(data=df, x="targeted", y="bitscore", alpha=0.5)
plt.tight_layout()
plt.savefig("./tblastx_bitscores.png")
plt.clf()
Beispiel #50
0
    X,
    y,
    cv=RepeatedKFold(n_splits=5, n_repeats=5),
    return_estimator=True,
    n_jobs=-1,
)
# One row per estimator fitted during cross-validation: its regressor
# coefficients multiplied by the per-feature standard deviation of
# X_train_preprocessed.
coefs = pd.DataFrame(
    [
        est.named_steps["transformedtargetregressor"].regressor_.coef_
        * X_train_preprocessed.std(axis=0)
        for est in cv_model["estimator"]
    ],
    columns=feature_names,
)
plt.figure(figsize=(9, 7))
# Points show the individual fits; the boxes summarise their spread per feature.
sns.stripplot(data=coefs, orient="h", color="k", alpha=0.5)
sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5)
# Vertical reference line at zero.
plt.axvline(x=0, color=".5")
plt.xlabel("Coefficient importance")
plt.title("Coefficient importance and its variability")
# Widen the left margin (presumably to fit the feature names - TODO confirm).
plt.subplots_adjust(left=0.3)

# %%
# The problem of correlated variables
# -----------------------------------
#
# The AGE and EXPERIENCE coefficients are affected by strong variability which
# might be due to the collinearity between the 2 features: as AGE and
# EXPERIENCE vary together in the data, their effect is difficult to tease
# apart.
#
Cars.head(10)

Cars.describe()
Cars.choice.value_counts()

# Independent variables inspected per category of choice. One shared list
# keeps the box plots and the strip plots in step: the original strip plots
# drew "cost.carpool" twice, never drew "cost.bus", and referenced a
# misspelled "time.cars" column.
feature_columns = [
    "cost.car",
    "cost.carpool",
    "cost.bus",
    "cost.rail",
    "time.car",
    "time.bus",
    "time.rail",
]

# Boxplot of independent variable distribution for each category of choice
for column in feature_columns:
    sns.boxplot(x="choice", y=column, data=Cars)

# Scatter plot for each categorical choice of car
for column in feature_columns:
    sns.stripplot(x="choice", y=column, jitter=True, data=Cars)

# Scatter plot between each possible pair of independent variable and also histogram for each independent variable
sns.pairplot(
    Cars, hue="choice"
)  # With showing the category of each car choice in the scatter plot
sns.pairplot(Cars)  # Normal

# Correlation values between each independent features
Cars.corr()
def _plotProfilePanel(ax, profileData, night, event, title, ylabel):
    """Draw one genotype strip plot for ``event`` on the given axes.

    The values are fetched with ``getProfileValues`` and its "value",
    "genotype" and "exp" entries are used as y, x and hue respectively.
    """
    profileValueDictionary = getProfileValues(profileData=profileData,
                                              night=night,
                                              event=event)
    y = profileValueDictionary["value"]
    x = profileValueDictionary["genotype"]
    group = profileValueDictionary["exp"]

    print("y: ", y)
    print("x: ", x)
    print("group: ", group)
    experimentType = Counter(group)
    print("Nb of experiments: ", len(experimentType))

    ax.set_xlim(-0.5, 1.5)
    # Pad the y range by 20 % of the largest value on both sides.
    ax.set_ylim(min(y) - 0.2 * max(y), max(y) + 0.2 * max(y))
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, and the keyword form also works on older releases.
    sns.stripplot(x=x, y=y, jitter=True, hue=group, s=5, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend().set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)


def plotProfileDataDuration(profileData, night, valueCat):
    """Plot one strip plot per behavioural event and save the figure as a PDF.

    A 5 x 6 grid of axes is filled row by row: first one panel for every entry
    of ``behaviouralEventOneMouse[:-2]`` (the event key is the event name
    concatenated with ``valueCat``), then one extra panel for the
    "totalDistance" value.

    Parameters
    ----------
    profileData :
        Profile data container, forwarded unchanged to ``getProfileValues``.
    night :
        Night identifier, forwarded to ``getProfileValues`` and used in the
        figure title and the output file name.
    valueCat : str
        Suffix appended to each event name; also used in the title, the y-axis
        labels and the output file name.

    The figure is written to ``FigProfile{valueCat}_Events_night_{night}.pdf``
    in the current working directory and then closed.  Nothing is returned.
    """
    fig, axes = plt.subplots(nrows=5, ncols=6, figsize=(14, 12))
    # Row-major flat view: panel k sits at row k // 6, column k % 6, which is
    # the order the original row/col counters walked through the grid.
    panels = axes.ravel()

    fig.suptitle(t="{} of events (night {})".format(valueCat, night),
                 y=1.2,
                 fontweight='bold')

    #plot the data for each behavioural event
    events = behaviouralEventOneMouse[:-2]
    for panelIndex, behavEvent in enumerate(events):
        event = behavEvent + valueCat
        print("event: ", event)
        _plotProfilePanel(ax=panels[panelIndex],
                          profileData=profileData,
                          night=night,
                          event=event,
                          title=behavEvent,
                          ylabel="{} (frames)".format(valueCat))

    #plot the data for the total distance traveled
    _plotProfilePanel(ax=panels[len(events)],
                      profileData=profileData,
                      night=night,
                      event="totalDistance",
                      title="Activity",
                      ylabel="total distance (m)")

    fig.tight_layout()
    fig.savefig("FigProfile{}_Events_night_{}.pdf".format(valueCat, night),
                dpi=100)
    plt.close(fig)
Y


# In[61]:


import seaborn as sns
import matplotlib.pyplot as plt
# Strip plot of every feature column of X against the target Y on a 3 x 3 grid.
plt.figure(figsize=(20,15), facecolor='white')
plotnumber = 1
# The grid has 3 * 3 = 9 cells.  The guard must compare against this, not
# against len(X) (the number of rows), otherwise a tenth column would make
# plt.subplot raise.
max_plots = 3 * 3

for column in X:
    if plotnumber <= max_plots:
        ax = plt.subplot(3,3,plotnumber)
        # Keyword arguments: recent seaborn releases reject positional x/y.
        sns.stripplot(x=Y, y=X[column])
    plotnumber+=1
plt.show()


# In[45]:


from sklearn.model_selection import train_test_split
# Hold out 30 % of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=.30,random_state=355)


# In[46]:


from sklearn.tree import DecisionTreeClassifier
Beispiel #54
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import seaborn as sns
# Use seaborn's "whitegrid" theme for the plots below.
sns.set_theme(style="whitegrid")
# Load seaborn's example "tips" dataset.
tips = sns.load_dataset("tips")
# One-dimensional strip plot of the total_bill column.
ax = sns.stripplot(x=tips["total_bill"])

# In[70]:

# Show the installed seaborn version.
sns.__version__

# In[16]:

import pandas as pd

# In[46]:

# Rows whose `day` value is "Sun".
tips[tips.day == "Sun"]

# In[57]:

# Pick four rows by integer position.
rec = tips.iloc[[77, 90, 19, 1]]

# In[58]:

# In[59]:

# Inspect the type of the object returned by load_dataset.
type(tips)
Beispiel #55
0
def rank_genes_groups_violin(adata,
                             groups=None,
                             n_genes=20,
                             use_raw=None,
                             split=True,
                             scale='width',
                             strip=True,
                             jitter=True,
                             size=1,
                             computed_distribution=False,
                             ax=None,
                             show=None,
                             save=None):
    r"""Plot ranking of genes for all tested comparisons.

    One violin plot (optionally overlaid with a strip plot) is drawn per
    group, comparing the group against the reference stored in
    ``adata.uns['rank_genes_groups']['params']``.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        Annotated data matrix.
    groups : list of `str`, optional (default: `None`)
        List of group names.
    n_genes : `int`, optional (default: 20)
        Number of genes to show.
    use_raw : `bool`, optional (default: `None`)
        Use `raw` attribute of `adata` if present. Defaults to the value that
        was used in :func:`~scanpy.api.tl.rank_genes_groups`.
    split : `bool`, optional (default: `True`)
        Whether to split the violins or not.
    scale : `str` (default: 'width')
        See `seaborn.violinplot`.
    strip : `bool` (default: `True`)
        Show a strip plot on top of the violin plot.
    jitter : `int`, `float`, `bool`, optional (default: `True`)
        If set to 0, no points are drawn. See `seaborn.stripplot`.
    size : `int`, optional (default: 1)
        Size of the jitter points.
    computed_distribution : `bool`, optional (default: `False`)
        Set to `True` if you want to use the scaled and shifted distribution
        previously computed with the `compute_distribution` in
        :func:`scanpy.api.tl.rank_genes_groups`
    ax : `matplotlib.Axes`, optional (default: `None`)
        A `matplotlib.Axes` object.
    show : `bool`, optional (default: `None`)
        Show the plot, do not return axis.
    save : `bool` or `str`, optional (default: `None`)
        If `True` or a `str`, save the figure. A string is appended to the
        default filename. Infer the filetype if ending on \{'.pdf', '.png', '.svg'\}.

    Raises
    ------
    ValueError
        If `computed_distribution` is `True` but the per-gene distribution
        columns are not present in ``adata.obs``.
    """
    import seaborn as sns
    from ..tools import rank_genes_groups
    params = adata.uns['rank_genes_groups']['params']
    groups_key = str(params['groupby'])
    if use_raw is None:
        use_raw = bool(params['use_raw'])
    reference = str(params['reference'])
    groups_names = (adata.uns['rank_genes_groups']['names'].dtype.names
                    if groups is None else groups)
    if isinstance(groups_names, str): groups_names = [groups_names]
    for group_name in groups_names:
        keys = []
        gene_names = adata.uns['rank_genes_groups']['names'][
            group_name][:n_genes]
        if computed_distribution:
            # Build the key set once instead of once per gene.
            obs_keys = set(adata.obs_keys())
            for gene_counter, gene_name in enumerate(gene_names):
                identifier = rank_genes_groups._build_identifier(
                    groups_key, group_name, gene_counter, gene_name)
                # We are already inside the `computed_distribution` branch, so
                # only the presence of the column has to be checked (the
                # former extra test referenced an undefined name).
                if identifier not in obs_keys:
                    raise ValueError(
                        'You need to set `compute_distribution=True` in '
                        '`sc.tl.rank_genes_groups()`.')
                keys.append(identifier)
        else:
            keys = gene_names
        # make a "hue" option!
        df = pd.DataFrame()
        for key in keys:
            if adata.raw is not None and use_raw:
                X_col = adata.raw[:, key].X
            else:
                X_col = adata[:, key].X
            if issparse(X_col): X_col = X_col.toarray().flatten()
            df[key] = X_col
        df['hue'] = adata.obs[groups_key].astype(str).values
        # Single-step .loc assignment: chained `df['hue'][mask] = ...` may
        # write to a temporary copy and silently leave `df` unchanged.
        if reference == 'rest':
            df.loc[df['hue'] != group_name, 'hue'] = 'rest'
        else:
            df.loc[~df['hue'].isin([group_name, reference]), 'hue'] = np.nan
        df['hue'] = df['hue'].astype('category')
        df_tidy = pd.melt(df, id_vars='hue', value_vars=keys)
        x = 'variable'
        y = 'value'
        hue_order = [group_name, reference]
        # Keep the caller's `ax` argument untouched: rebinding it here would
        # make every later group draw on the first group's axes.
        group_ax = sns.violinplot(x=x,
                                  y=y,
                                  data=df_tidy,
                                  inner=None,
                                  hue_order=hue_order,
                                  hue='hue',
                                  split=split,
                                  scale=scale,
                                  orient='vertical',
                                  ax=ax)
        if strip:
            group_ax = sns.stripplot(x=x,
                                     y=y,
                                     data=df_tidy,
                                     hue='hue',
                                     dodge=True,
                                     hue_order=hue_order,
                                     jitter=jitter,
                                     color='black',
                                     size=size,
                                     ax=group_ax)
        group_ax.set_xlabel('genes')
        group_ax.set_title('{} vs. {}'.format(group_name, reference))
        group_ax.legend_.remove()
        if computed_distribution:
            group_ax.set_ylabel('z-score w.r.t. to bulk mean')
        else:
            group_ax.set_ylabel('expression')
        group_ax.set_xticklabels(gene_names, rotation='vertical')
        writekey = 'rank_genes_groups_' + groups_key + '_' + group_name
        utils.savefig_or_show(writekey, show=show, save=save)
Beispiel #56
0
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model,
                                 features,
                                 labels,
                                 scoring='accuracy',
                                 cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
# Collect the per-fold accuracies gathered in `entries` into one frame with
# one row per (model, fold).
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Draw the box plot
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
# Draw the individual fold accuracies as jittered points on the same axes.
sns.stripplot(x='model_name',
              y='accuracy',
              data=cv_df,
              size=8,
              jitter=True,
              edgecolor="gray",
              linewidth=2)
plt.show()

# Fit the linear SVC model
model = LinearSVC()
# Stratified 70/30 split; the row index of `data_after_stop` is split along
# with the features and labels so the resulting rows can be traced back.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features,
    labels,
    data_after_stop.index,
    test_size=0.3,
    stratify=labels,
    random_state=0)
model.fit(X_train, y_train)
def window_boxplot_fepD_vs_BW(metadata, 
                              features, 
                              feat='motion_mode_paused_fraction',
                              windows=None,
                              save_dir=None):    
    """Box plot of one feature for 'BW' vs 'fepD' at each window.

    Box plots are grouped by window with one box per bacterial strain.  The
    individual points are overlaid as a strip plot, drawn once per imaging
    date in a date-specific shade of grey.  P-values read from
    ``stats_dir / 'pairwise_ttests' / 'fepD_window_results.csv'`` are
    annotated above each window.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must provide the 'bacteria_strain', 'window' and 'date_yyyymmdd'
        columns; joined with ``features`` on the index.
    features : pandas.DataFrame
        Must provide the column named by ``feat``.
    feat : str
        Name of the feature column to plot.
    windows : list, optional
        Windows to plot, in plotting order.  Every entry must occur in
        ``metadata['window']``.  Defaults to all windows, sorted.
    save_dir : path-like, optional
        If given, the figure is saved to
        ``save_dir/<n>_windows/<feat>.png``; otherwise it is shown.
    """
    import seaborn as sns
    from matplotlib import transforms
    from matplotlib import pyplot as plt

    plot_df = metadata[['bacteria_strain','window','date_yyyymmdd']].join(features[[feat]])
    
    # Computed once rather than once per requested window.
    available_windows = sorted(plot_df['window'].unique())
    if windows is not None:
        assert all(w in available_windows for w in windows)
        plot_df = plot_df[plot_df['window'].isin(windows)]
    else:
        windows = available_windows
    
    bacteria_strain_list = ['BW', 'fepD']
    
    plt.close('all')
    fig, ax = plt.subplots(figsize=(max(8,len(windows)),8))
    sns.boxplot(x='window', 
                y=feat, 
                order=windows,
                hue='bacteria_strain', 
                hue_order=bacteria_strain_list, 
                dodge=True,
                ax=ax, 
                palette='tab10', 
                showfliers=False,
                data=plot_df)
    dates = list(plot_df['date_yyyymmdd'].unique())
    date_col_dict = dict(zip(dates, sns.color_palette('Greys', n_colors=len(dates))))
    for date in dates:
        # Give both strains the grey assigned to this date via an explicit
        # per-call palette.  The earlier `color=sns.set_palette(...)` passed
        # None (set_palette returns nothing) and only coloured the points as
        # a side effect of overwriting seaborn's global default palette.
        sns.stripplot(x='window',
                      y=feat,
                      order=windows,
                      hue='bacteria_strain',
                      hue_order=bacteria_strain_list,
                      dodge=True,
                      ax=ax,
                      s=3, marker='D',
                      palette=[date_col_dict[date]] * len(bacteria_strain_list),
                      data=plot_df[plot_df['date_yyyymmdd']==date])
    
    # scale plot y-axis
    scale_outliers = False
    if scale_outliers:
        grouped_strain = plot_df.groupby('window')
        y_bar = grouped_strain[feat].median() # median is less skewed by outliers
        Q1, Q3 = grouped_strain[feat].quantile(0.25), grouped_strain[feat].quantile(0.75)
        IQR = Q3 - Q1
        plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))
 
    # load t-test results for fepD vs BW at each window
    t_test_path = stats_dir / 'pairwise_ttests' / 'fepD_window_results.csv'
    ttest_df = pd.read_csv(t_test_path, index_col=0)
    pvals = ttest_df[[c for c in ttest_df if 'pvals_' in c]]

    # annotate p-values
    # x in data coordinates, y in axes coordinates: the brackets stay at the
    # top of the axes regardless of the y-range.
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    for ii, window in enumerate(windows):
        p = pvals.loc[feat, 'pvals_{}'.format(window)]
        text = ax.get_xticklabels()[ii]
        # sanity check: tick ii really belongs to this window
        assert text.get_text() == str(window)
        p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
        plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], 
                 [0.98, 0.99, 0.99, 0.98], lw=1.5, c='k', transform=trans)
        ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans)

    # legend and labels
    # keep only the first handle per strain (those of the box plot); the strip
    # plots add one more set of handles per date
    n_labs = len(bacteria_strain_list)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:n_labs], labels[:n_labs], fontsize=12, frameon=False, loc=(1.01, 0.9),
              handletextpad=0.2)
    ax.set_xlabel('')
    ax.set_xticklabels([WINDOW_DICT_STIM_TYPE[w] for w in windows])
    ax.set_ylabel(feat.replace('_',' '), fontsize=12, labelpad=10)

    plt.subplots_adjust(right=0.85)
    
    if save_dir is not None:
        save_path = Path(save_dir) / '{}_windows'.format(len(windows)) / '{}.png'.format(feat)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300)
    else:
        plt.show()    
      
    return
Beispiel #58
0
def _ask_choice(prompt, choices, retry_message):
    """Prompt on stdin until the reply (case-insensitive) is one of *choices*."""
    while True:
        answer = input(prompt).strip().lower()
        if answer in choices:
            return answer
        print(retry_message)


def _drop_outliers(frame, n_sd=2, skip=('Unnamed: 0', )):
    """Return *frame* without rows lying n_sd or more SDs from a column mean.

    Numeric columns are processed one after another, so the mean and standard
    deviation of later columns are computed on the rows that survived the
    earlier ones.  Missing values are not treated as outliers, and columns
    without spread (SD of 0 or undefined) are left alone.
    """
    kept = frame
    for col in kept.select_dtypes(include='number').columns:
        if col in skip:
            continue
        mean = kept[col].mean()
        sd = kept[col].std()
        if not sd > 0:
            continue
        in_range = (kept[col] - mean).abs() < n_sd * sd
        kept = kept[in_range | kept[col].isna()]
    return kept


def _plot_dots(dotplot_df, fluoro, overlay=None):
    """Show the dot plot of one fluorophore, optionally over a box/violin plot."""
    if overlay == "box":
        sns.boxplot(x="Condition", y=fluoro, data=dotplot_df, hue='Timepoint')
    elif overlay == "violin":
        sns.violinplot(x="Condition",
                       y=fluoro,
                       data=dotplot_df,
                       hue='Timepoint')
    sns.stripplot(x='Condition',
                  y=fluoro,
                  hue="Timepoint",
                  data=dotplot_df,
                  jitter=True,
                  dodge=True,
                  edgecolor='w',
                  linewidth=0.5)
    plt.ylim(0, None)
    plt.ylabel(fluoro + " Intensity (AU)")
    plt.title(fluoro)
    plt.show()


def generate_graphs(fname, fluoros, tps, conditions):
    """
    Takes transposed data and generates all sets of 
    dot plots and scatter plots across all conditions,
    fluorophores, and timepoints

    fname is the transposed intermediate Excel workbook; every sheet is read
    as one data frame and assigned to each condition whose name occurs
    (case-insensitively) in the sheet name.  The first len(tps) sheets of each
    condition are labelled with the corresponding entry of tps.

    fluoros are the column names to plot, tps the timepoint labels.
    conditions is accepted for interface compatibility; the conditions that
    are actually plotted are parsed from the sheet names.

    The user is asked on stdin whether to filter outliers, whether to draw a
    trend line on the scatter plots and whether to overlay a box or violin
    plot on the dot plots.  All figures are shown interactively; nothing is
    returned.
    """
    yes_no = ("y", "n")
    retry_yes_no = "You did not type Y or N. Please reenter. \n"
    n = len(tps)

    #Retrieve data for each condition, timepoint, and fluorophore.

    #Step 0: Generate scatter list of permutations of fluorophores to graph
    scatter_list = list(itertools.combinations(fluoros, 2))
    print("Fluorophores to be plotted against each other: ")
    for pair in scatter_list:
        print(pair[0], " vs. ", pair[1], "\n")

    #Step 1: Load the tranposed file
    #Read Transposed Intermediate excel
    #Returns a dictionary - the keys are the sheet names, and the values are the sheets as dataframes.
    #The workbook is opened once and closed again as soon as it has been read.
    with pd.ExcelFile(fname) as xls:
        df_dic = pd.read_excel(xls, sheet_name=None)
        #parse conditions from sheetnames
        sheet_conds, _, _ = parse_sheetnames(xls)
    #De-duplicate while keeping first-seen order, so the plots come out in the
    #same order on every run (a set has no defined order).
    sheet_conds = list(dict.fromkeys(sheet_conds))

    #Step 2: Generate dictionary of all dfs grouped by condition
    #plot_dic = {'BM' : [bm1.df, bm2.df, ...], ...}
    plot_dic = {
        cond: [
            sheet_df for sheet_name, sheet_df in df_dic.items()
            if cond.lower() in sheet_name.lower()
        ]
        for cond in sheet_conds
    }

    #Step 3: Remove statistical outliers and add Timepoint column to each dataframe
    #FILTER OUTLIERS OR NOT:
    filter_outliers = _ask_choice(
        "Would you like to filter outliers from your plots?\nOutliers are values >2 SD from the mean. Answer Y or N: ",
        yes_no, retry_yes_no) == "y"
    for sheets_list in plot_dic.values():
        if filter_outliers:
            #Store the filtered frames back into the list; filtering a local
            #variable only would leave the plotted data untouched.
            sheets_list[:] = [_drop_outliers(sheet) for sheet in sheets_list]
        for i in range(n):
            #Remove unnamed column (a new frame is built, the sheet read from
            #the workbook is left as it is)
            df = sheets_list[i].drop(columns='Unnamed: 0', errors='ignore')
            df['Timepoint'] = tps[i]
            sheets_list[i] = df

    #Step 4: Plot Scatter plots with or without trendline
    #TREND LINE OR NOT:
    trend_line = _ask_choice("Would you like a trend line? Answer Y or N: ",
                             yes_no, retry_yes_no) == "y"
    if scatter_list:
        for cond in sheet_conds:
            cond_df = pd.concat(plot_dic[cond])
            for pair in scatter_list:
                if trend_line:
                    #lmplot == scatter plot with trendline
                    sns.lmplot(x=pair[1],
                               y=pair[0],
                               hue='Timepoint',
                               data=cond_df,
                               ci=None,
                               scatter_kws={'edgecolor': "white"})
                else:
                    #relplot == scatter plot without trendline
                    sns.relplot(x=pair[1],
                                y=pair[0],
                                hue='Timepoint',
                                data=cond_df,
                                kind='scatter')
                plt.xlabel(pair[1] + " Intensity (AU)")
                plt.ylabel(pair[0] + ' Intensity (AU)')
                plt.xlim(0, None)
                plt.ylim(0, None)
                plt.title(cond)
                plt.show()

    #Step 5: Restructure data frames for Dot Plots
    #Add condition column to each dataframe
    #assign() returns a labelled copy, so a sheet that belongs to several
    #conditions keeps one correctly labelled copy per condition.
    labelled = [
        cond_df.assign(Condition=cond) for cond, sheets in plot_dic.items()
        for cond_df in sheets
    ]
    #Concatenate everything to the master dotplot df in one go
    dotplot_df = pd.concat(labelled) if labelled else pd.DataFrame()

    #Step 6: Plot Dot Plots
    print("\nDot plots to be plotted: ")
    for f in fluoros:
        print(f"{f}\n")
    #ADD BOXPLOT OR VIOLINPLOT OR NOT:
    overlay = None
    wants_overlay = _ask_choice(
        "Would you like a box plot or violin plot overlaid on the dot plots? Answer Y or N: ",
        yes_no, retry_yes_no) == "y"
    if wants_overlay:
        overlay = _ask_choice(
            "Please enter box for box plot, and enter violin for violin plot: ",
            ("box", "violin"),
            "You did not type box or violin. Please reenter. \n")
    for f in fluoros:
        _plot_dots(dotplot_df, f, overlay)
 def df_function(collection_df, other, ax):
     """Draw a dodged strip plot of SCORE on ``ax``.

     ``other`` and the enclosing scope's ``attribute`` are used as the x and
     hue variables; the enclosing ``reverse`` flag swaps the two roles.
     """
     if reverse:
         x_attr, hue_attr = attribute, other
     else:
         x_attr, hue_attr = other, attribute
     # Categories on the x axis are shown in sorted order.
     category_order = sorted(collection_df[x_attr].unique())
     sns.stripplot(
         data=collection_df,
         x=x_attr,
         y=SCORE,
         hue=hue_attr,
         order=category_order,
         jitter=0.1,
         dodge=True,
         alpha=0.5,
         ax=ax,
     )
Beispiel #60
0
# Pairwise scatter plots of the selected features coloured by 'Survived',
# with shaded KDE curves on the diagonal and small (s=10) markers.
g = sns.pairplot(train[features_of_interest], hue='Survived', palette = 'seismic',
                 diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
# Remove the x tick labels from every panel.
g.set(xticklabels=[])


# ## 4.2 Breakdown by Categories
# The correlation is a nice start. Now let's show how survival changes with some of these categories

# In[ ]:


# Plot: fare per person for each title, coloured by survival
sns.set_style('white')
fig = plt.figure(figsize=(12,12))
# `dodge` is the current name of the former `split` argument, which seaborn
# no longer accepts; False keeps both hue levels in one strip per title.
ax = sns.stripplot(x='Title', y='fare_pp', data=train, jitter=0.2,
                  alpha=0.9, hue='Survived', dodge=False, palette="RdBu")

# Label
title = plt.title("Titles and Money", fontsize=14, fontweight='bold')
# Lift the title slightly above the axes.
title.set_position([.5, 1.03])
plt.ylabel('Fare per Person ($)', fontsize=11, fontweight='bold')
plt.xlabel('Title', fontsize=11, fontweight='bold')
# Restrict the y axis to the range -1 .. 100.
ax.set_ylim(-1,100);

# Y-Axis Ticks
def dollars(x, pos):
    """Return tick value ``x`` as a dollar string with two decimals.

    ``pos`` (the tick position) is part of the formatter callback signature
    and is not used.
    """
    amount = '%1.2f' % x
    return '$' + amount
# Wrap `dollars` in a FuncFormatter and install it as the major tick
# formatter of the y axis.
formatter = FuncFormatter(dollars)
ax.yaxis.set_major_formatter(formatter)