Ejemplo n.º 1
0
def ca_box_plot_shopping():
    """Draw, save and show a box plot of the purchasing index of two groups.

    Each of the two space-separated input files is read without a header and
    reduced to the ratio ``column 2 / column 1``.  The ratios from the ``+1``
    file are labelled 'Extroverts' and those from the ``-1`` file
    'Introverts'.  The figure is written to 'figure/purchase_box.eps' and
    then displayed.
    """
    # Load the data and derive one ratio series per input file.
    sources = (
        'data/split_class/large_IGNORE_425_shopping_+1.txt',
        'data/split_class/large_IGNORE_425_shopping_-1.txt',
    )
    ratios = []
    for source in sources:
        frame = pd.read_csv(source, sep=' ', header=None)
        ratios.append(frame[2] / frame[1])

    label_size = 20
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=ratios, fliersize=0.1, width=0.3)

    plt.xticks((0, 1), ('Extroverts', 'Introverts'), fontsize=label_size)
    plt.yticks(fontsize=label_size)
    plt.ylabel("Purchasing Index", fontsize=label_size)
    plt.ylim(0, 0.12)

    plt.savefig('figure/purchase_box.eps', dpi=300)
    plt.show()
Ejemplo n.º 2
0
def plot_return_quantiles(returns, df_weekly, df_monthly, ax=None, **kwargs):
    """
    Draw side-by-side box plots of daily, weekly and monthly returns.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, non-cumulative.
    df_weekly : pd.Series
        Weekly returns of the strategy, non-cumulative.
    df_monthly : pd.Series
        Monthly returns of the strategy, non-cumulative.
    ax : matplotlib.Axes, optional
        Axes to draw on; the current axes are used when omitted.
    **kwargs, optional
        Forwarded to ``sns.boxplot``.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    target_ax = plt.gca() if ax is None else ax

    series_by_period = [returns, df_weekly, df_monthly]
    sns.boxplot(data=series_by_period, ax=target_ax, **kwargs)

    target_ax.set_xticklabels(['daily', 'weekly', 'monthly'])
    target_ax.set_title('Return quantiles')
    return target_ax
def plot_retest_data(retest_data, size=4.6, save_dir=None):
    """Plot the 'icc3.k' values of ``retest_data`` per 'Measure Category'.

    The lower axes hold one horizontal box per category; the upper axes show
    a shaded density per category over the same x-range.

    NOTE(review): ``Task_N``, ``Survey_N`` and ``dpi`` are not defined in this
    function and are presumably module-level names -- confirm.

    Args:
        retest_data: table with 'icc3.k' and 'Measure Category' columns.
        size: base size; the figure is ``(size, size*.75)`` and font sizes,
            paddings and line widths are multiples of it.
        save_dir: if truthy, passed to ``plt.savefig`` as the output target.
    """
    survey_color = sns.color_palette('Reds_d', 3)[0]
    task_color = sns.color_palette('Blues_d', 3)[0]
    colors = [survey_color, task_color]
    fig = plt.figure(figsize=(size, size*.75))
    label_size = size*2

    # box plots, created while the 'white' seaborn style is active
    with sns.axes_style('white'):
        box_ax = fig.add_axes([.15, .1, .8, .5])
        sns.boxplot(x='icc3.k', y='Measure Category', ax=box_ax,
                    data=retest_data,
                    palette={'Survey': survey_color, 'Task': task_color},
                    saturation=1, width=.5, linewidth=size/4)
    box_ax.text(0, 1, '%s Task measures' % Task_N,
                color=task_color, fontsize=label_size)
    box_ax.text(0, 1.2, '%s Survey measures' % Survey_N,
                color=survey_color, fontsize=label_size)
    box_ax.set_ylabel('Measure category', fontsize=label_size, labelpad=size)
    box_ax.set_xlabel('Intraclass correlation coefficient',
                      fontsize=label_size, labelpad=size)
    box_ax.tick_params(labelsize=size*1.5, pad=size, length=2)
    for spine in box_ax.spines.values():
        spine.set_linewidth(size/5)

    # density curves on a second axes that reuses the box plot's x-limits
    dist_ax = fig.add_axes([.15, .6, .8, .4])
    dist_ax.set_xlim(*box_ax.get_xlim())
    dist_ax.set_xticklabels('')
    dist_ax.tick_params(length=0)
    grouped = retest_data.groupby('Measure Category')
    for idx, (_, subset) in enumerate(grouped):
        sns.kdeplot(subset['icc3.k'], color=colors[idx], ax=dist_ax,
                    linewidth=size/3, shade=True, legend=False)
    upper = dist_ax.get_ylim()[1]
    dist_ax.set_ylim((0, upper))
    dist_ax.axis('off')

    if save_dir:
        plt.savefig(save_dir, dpi=dpi, bbox_inches='tight')
Ejemplo n.º 4
0
def ca_box_plot_driving():
    """Show a box plot of the drive index of two groups.

    Rows of 'data/drive_index.txt' (comma-separated, no header) are split on
    column 1: value 0 is labelled 'Extroverts', value 1 'Introverts'.  For
    each group, column 9 is kept where it is at most 0.2 and plotted.
    """
    # Load the data once, then derive the filtered series for each group.
    table = pd.read_csv('data/drive_index.txt', header=None)
    groups = []
    for flag in (0, 1):
        values = table[table[1] == flag][9]
        groups.append(values[values <= 0.2])

    plt.figure(figsize=(8, 4))
    sns.boxplot(data=groups, width=0.3)

    plt.xticks((0, 1), ('Extroverts', 'Introverts'))
    plt.ylabel("Drive Index")
    plt.ylim(0, 0.015)

    plt.show()
Ejemplo n.º 5
0
def plot_op(operation):
    """Plot the query times of one operation for all models as a box plot.

    Columns 1-3 of the results CSV (``RESULT_FOLDER + RESULT_FILE``) are read
    and renamed to 'mo', 'node' and 'time'.  The rows whose 'mo' equals one
    of the first four entries returned by ``mo(operation)`` are plotted on a
    log-scaled y-axis and the figure is saved as
    ``RESULT_FOLDER + operation + '.png'``.

    Parameters
    ----------
    operation : str
        Operation name; passed to ``mo`` and used in the output file name.
    """
    df = pd.read_csv(RESULT_FOLDER + RESULT_FILE, usecols=[1, 2, 3])
    print(df.columns)
    df.columns = ['mo', 'node', 'time']

    ele = mo(operation)

    # DataFrame.append was removed in pandas 2.0; concatenating the four
    # selections keeps the same row order the chained appends produced.
    qpare = pd.concat([df[df.mo == ele[k]] for k in range(4)])

    f, ax = plt.subplots()
    ax.set(yscale="log")

    ax.set_title('Query time')
    # NOTE(review): the style is set after the axes already exist, so it
    # probably does not restyle this figure -- confirm the intended order.
    sns.set_style("whitegrid")
    sns.boxplot(x='mo', y='time', data=qpare, ax=ax)
    ax.set_xlabel("model-operation")
    ax.set_ylabel("time [s]")

    # seaborn no longer exposes pyplot as ``sns.plt``; use the figure itself
    # and close it so repeated calls do not accumulate open figures.
    f.savefig(RESULT_FOLDER + operation + '.png')
    plt.close(f)
Ejemplo n.º 6
0
    def MAF_comparison_boxplot(self):
        """Save and show one grouped MAF box plot per population level.

        For every ``(level, frame)`` pair from ``self._generate_maf_long_df()``
        the rows are restricted to the populations listed for that level and
        plotted as MAF by population with one box per panel.  Each figure is
        saved in ``self.PLOTS_DIR`` as ``MAF_comparison__<level>``.
        """
        maf_frames = self._generate_maf_long_df()

        wanted_populations = {
            "superpopulation": ['AFR', 'EUR', 'AMR'],
            "population": Dataset.used_populations(),
        }
        for level, frame in maf_frames.items():
            keep = frame["population"].isin(wanted_populations[level])
            frame = frame[keep]

            # the per-population plot has more groups, so it gets more width
            if level == "population":
                width = 13
            else:
                width = 7
            fig = plt.figure(figsize=(width, 4))
            ax = fig.add_subplot(1, 1, 1)

            # keep only the colors of panels that occur in the filtered data
            present_panels = frame["panel"].unique()
            panel_palette = []
            for panel, color in panel_colors().items():
                if panel in present_panels:
                    panel_palette.append(color)

            sns.boxplot(data=frame, x="population", y="MAF", hue="panel",
                        ax=ax, linewidth=0.3, showcaps=False,
                        showfliers=False,
                        palette=sns.color_palette(panel_palette), width=0.70)

            self._boxplot_aesthetics(ax)

            target = join(self.PLOTS_DIR,
                          "MAF_comparison__{}".format(level))
            plt.savefig(target, bbox_inches="tight")
            plt.show()
Ejemplo n.º 7
0
def trust_perspectives_wrt_someone(trust_frame, wrt='targets'):
    """
    Generates a 'matrix' of box plots of trust assessments of each node's
    perspective from every other one, grouped by 'var'.

    One row of axes is drawn per 'var' group and one column per value of the
    compared level ('target' for ``wrt='targets'``, 'observer' for
    ``wrt='observer'``).

    :param trust_frame: frame that is unstacked on the base level and stacked
        on the compared level; its column count sets the number of plot
        columns
    :param wrt: 'targets' (objective perspective) or 'observer' (subjective
        perspective)
    :return: the matplotlib figure holding the grid
    :raises ValueError: if ``wrt`` is neither 'targets' nor 'observer'
    """
    if wrt == 'targets':
        base, comp, perspective = "observer", "target", "objective"
    elif wrt == 'observer':
        base, comp, perspective = "target", "observer", "subjective"
    else:
        # previously an unknown value surfaced later as an UnboundLocalError
        raise ValueError(
            "wrt must be 'targets' or 'observer', got {0!r}".format(wrt))

    # A scalar level keeps the group keys scalar; a one-element list yields
    # one-element tuples on newer pandas, which float(var) cannot convert.
    groups = trust_frame.unstack(base).stack(comp).groupby(level='var')
    n_nodes = trust_frame.shape[1]

    # squeeze=False keeps ``ax`` two-dimensional even for one row or column
    f, ax = plt.subplots(len(groups), n_nodes,
                         figsize=(16, 2 * len(groups)),
                         sharey=True, squeeze=False)
    plt.subplots_adjust(hspace=0.2, wspace=0.05, top=0.951)
    for i, (var, group) in enumerate(groups):
        for j, (jvar, jgroup) in enumerate(group.groupby(level=comp)):
            sns.boxplot(jgroup, ax=ax[i][j], **_boxplot_kwargs)
            if not i:  # first row carries the column titles
                ax[i][j].set_title(jvar)
        # Explicit loop: map() is lazy on Python 3, so the former
        # ``map(lambda a: a.set_xlabel(""), ax[i])`` never cleared anything.
        for row_ax in ax[i]:
            row_ax.set_xlabel("")
        if i + 1 < len(groups):
            ax[i][0].set_xlabel(base.capitalize())
        ax[i][0].set_ylabel("{0:.4f}".format(float(var)))
    f.suptitle(
        "Plots of Per-Node {0} Trust Values".format(perspective.capitalize()),
        fontsize=24)
    return f
Ejemplo n.º 8
0
def do_nb_linear(case, models, name, fun):
    """Box-plot the confidence-interval bounds of the estimates per model.

    For every ``(model, n0)`` pair the "Newb" results for 50 individuals and
    100 SNPs are read from ``case`` and passed through ``fun``.  Pairs for
    which ``fun`` returns no values are skipped.  The upper and the lower
    interval bounds of the remaining pairs are drawn as two sets of box
    plots, with the x tick labels taken from ``Nbs[(model, n0)]``.

    Parameters
    ----------
    case : mapping
        Indexed as ``case["Newb"][(model, n0)][(None, 50, 100, "SNP")]``;
        each entry unpacks to ``vals, ci, r2, sr2, j, ssize``.
    models : iterable of (model, n0) pairs
    name : unused
    fun : callable
        Called as ``fun(n0, bname, nindivs, vals, ci, r2=, sr2=, j=)`` and
        expected to return ``(vals, ci)`` with ``ci`` a sequence of
        ``(bottom, top)`` pairs.
    """
    fig, ax = plt.subplots(figsize=(16, 9))
    nbs = []
    tops = []
    bottoms = []
    nindivs = 50
    for model, n0 in models:
        nb = Nbs[(model, n0)]
        vals, ci, r2, sr2, j, ssize = \
            case["Newb"][(model, n0)][(None, nindivs, 100, "SNP")]
        vals, ci = fun(n0, get_bname(model), nindivs, vals, ci,
                       r2=r2, sr2=sr2, j=j)
        if len(vals) == 0:
            continue
        bottom, top = zip(*ci)
        nbs.append(nb)
        tops.append(top)
        bottoms.append(bottom)
    # pylab.yscale('log')
    sns.boxplot(tops, notch=0, sym="", ax=ax)
    sns.boxplot(bottoms, ax=ax)
    ax.set_xticks(1 + np.arange(len(nbs)))
    ax.set_xticklabels([str(nb) for nb in nbs])
    ax.set_ylim(0, max(nbs))
    # raw strings: "\h" is an invalid escape sequence in a normal literal
    ax.set_ylabel(r"$\hat{N}_{e}$", fontsize=32)
    ax.set_xlabel("Target (simulated) ${N}_{b}$", fontsize=32)
Ejemplo n.º 9
0
def fig_boxplotcomparison(regular, phospho, sheet, column, bounds, figpath):
    """
    Figure 2c from Aurora paper.

    Stacks the ``sheet`` and ``column`` values of ``regular`` and ``phospho``
    into one long table tagged by peptide type, draws one box per type with
    dashed red lines at ``bounds[0]`` and ``bounds[1]``, and saves the figure
    under ``figpath + "162_BoxplotNorm_<column>"``.
    """
    labelled_values = [
        (regular[sheet], "peptides"),
        (regular[column], "norm_peptides"),
        (phospho[sheet].values, "phospho"),
        (phospho[column].values, "norm_phospho"),
    ]
    xvals = []
    yvals = []
    for values, label in labelled_values:
        xvals.extend(values)
        yvals.extend([label] * len(values))

    df = pd.DataFrame([xvals, yvals]).transpose()
    df.columns = [column, "peptide type"]

    bmap = brewer2mpl.get_map('Paired', 'Qualitative', 6).mpl_colors
    f, ax = plt.subplots(1, figsize=(11.69, 8.27))
    sns.boxplot(x="peptide type", y=column, data=df, palette=bmap)
    ax.set(xticks=[0, 1, 2, 3],
           xticklabels=["peptides",
                        "peptides \n(normalized)",
                        "phospho-\npeptides",
                        "phosphopeptides \n(normalized)"])
    ax.set(ylim=(-2, 2), ylabel="log2 (fold change)", title=column)
    # lower bound first, then upper bound
    for bound in (bounds[0], bounds[1]):
        ax.axhline(bound, ls="--", lw=2, color="red", alpha=0.7)
    ax.yaxis.set_major_locator(MaxNLocator(4))
    sns.despine()
    cutils.save_fig(f, figpath+"162_BoxplotNorm_{}".format(column))
Ejemplo n.º 10
0
    def plot_boxes(self, peaks):
        """Save a horizontal boxplot of the cope values around each peak.

        The cope image named by ``self.inputs.cope_file`` is averaged inside
        each labelled sphere returned by ``self._peaks_to_spheres(peaks)``.
        The per-peak values are plotted, written to 'peak_boxplot.png' in the
        working directory, and the path is appended to ``self.out_files``.
        """
        copes = nib.load(self.inputs.cope_file).get_data()
        sphere_labels = self._peaks_to_spheres(peaks).get_data()
        n_peaks = len(peaks)
        dists = np.zeros((copes.shape[-1], n_peaks))
        # sphere labels are matched starting at 1
        for label, _ in enumerate(peaks, 1):
            dists[:, label - 1] = copes[sphere_labels == label].mean(axis=0)

        fig_height = float(n_peaks) / 3 + 0.33
        with sns.axes_style("whitegrid"):
            f, ax = plt.subplots(figsize=(9, fig_height))

        try:
            # keyword interface (seaborn >= 0.6)
            sns.boxplot(data=dists, palette="husl", orient="h", ax=ax)
            labels = np.arange(n_peaks) + 1
        except TypeError:
            # older interface (seaborn < 0.6): columns, palette and labels
            # are all reversed together
            reversed_pal = sns.husl_palette(dists.shape[1])[::-1]
            sns.boxplot(dists[:, ::-1], color=reversed_pal, ax=ax,
                        vert=False)
            labels = np.arange(n_peaks)[::-1] + 1

        sns.despine(left=True, bottom=True)
        ax.axvline(0, c=".3", ls="--")
        ax.set(yticklabels=labels, ylabel="Local Maximum",
               xlabel="COPE Value")

        png_path = op.realpath("peak_boxplot.png")
        self.out_files.append(png_path)
        f.savefig(png_path, bbox_inches="tight")
        plt.close(f)
Ejemplo n.º 11
0
def do_cohort(case, model, N0, nindiv, corr_name):
    """Box-plot the estimates of every cohort for one model and correction.

    For each entry of the module-level ``cohorts`` the results for
    ``(model, N0)`` with ``nindiv`` individuals and 100 SNPs are read from
    ``case``.  If ``get_corrs`` yields a correction named ``corr_name``, its
    values and confidence intervals replace the raw ones.  The plot shows
    one box per cohort, red crosses at the 90th percentile of the upper
    bounds and the 10th percentile of the lower bounds, and a black plus at
    ``hmean(vals)``.

    Parameters
    ----------
    case : mapping
        Indexed as ``case[cohort][(model, N0)][(None, nindiv, 100, "SNP")]``.
    model, N0 : key into ``Nbs`` and ``case``
    nindiv : int
        Number of individuals sampled; also shown in the annotation.
    corr_name : str
        Name of the correction to apply; also shown in the title.

    Returns
    -------
    fig : matplotlib figure
    """
    last = 0.5
    fig, ax = plt.subplots(figsize=(16, 9))
    nb = Nbs[(model, N0)]
    fig.suptitle("Nb: %d - different cohorts - 100 SNPs - %s" %
                 (nb, corr_name), fontsize=24)
    box_vals = []
    labels = []
    tops = []
    bottoms = []
    hmeans = []
    bname = get_bname(model)

    for cohort in cohorts:
        vals, ci, r2, sr2, j, ssize = \
            case[cohort][(model, N0)][(None, nindiv, 100, "SNP")]
        # use the first correction with the requested name, if there is one
        for cname, corrections in get_corrs(N0, bname, nindiv, vals,
                                            ci, r2, sr2, j):
            if cname != corr_name:
                continue
            vals, ci = corrections
            break
        box_vals.append(vals)
        hmeans.append(hmean(vals))
        bottom, top = zip(*ci)
        # missing bounds are replaced by a large sentinel value
        top = [100000 if x is None else x for x in top]
        bottom = [100000 if x is None else x for x in bottom]
        tops.append(np.percentile(top, 90))
        bottoms.append(np.percentile(bottom, 10))
        if cohort == 'c2c':
            labels.append("2 cohorts")
        elif cohort == 'c3c':
            labels.append("3 cohorts")
        else:
            labels.append("%s" % cohort)
        if cohort == cohorts[-1]:
            pos = len(labels) + 0.5
            ax.axvline(pos, color="k", lw=0.2)
            ax.text(last + (pos - last) / 2, 0,
                    "%d Individuals sampled" % nindiv,
                    ha="center", va="bottom", size=24,
                    rotation="horizontal")
            last = pos
    ax.set_ylim(0, nb * 3)
    # raw string: "\h" is an invalid escape sequence in a normal literal
    ax.set_ylabel(r'$\hat{N}_{e}$', fontsize=32)
    ax.axhline(nb, color="k", lw=0.3)
    sns.boxplot(box_vals, notch=0, sym="", ax=ax)
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels, fontsize=24)
    ax.plot([1 + x for x in range(len(tops))], tops, "rx")
    ax.plot([1 + x for x in range(len(bottoms))], bottoms, "rx")
    ax.plot([1 + x for x in range(len(hmeans))], hmeans, "k+")
    yticks = [0, nb // 2, nb, 2 * nb, 3 * nb]
    ax.set_yticks(yticks)
    ax.set_yticklabels([str(y) for y in yticks], fontsize=14)
    return fig
def stratify_numtot_age(classifier, numtot_dict, class_name, class2_enter, mirna2age, finname):
    """Box-plot a per-miRNA quantity against miRNA age and save it as a PDF.

    Every key of ``numtot_dict`` that also has an age in ``mirna2age`` becomes
    one row holding its value, a label telling whether the key occurs among
    the flattened values of ``classifier``, and its age.  The Spearman
    correlation between value and age is printed, and a box plot of the value
    by age is written to ``'../figures/<finname>.pdf'``.

    Parameters
    ----------
    classifier : dict
        Its values are flattened with ``flatten`` to decide class membership.
    numtot_dict : dict
        Maps a miRNA to the plotted quantity.
    class_name : str
        Inserted into the 'In miRNA ...' / 'Not in miRNA ...' labels.
    class2_enter : str
        Column name (and y-axis label) of the plotted quantity.
    mirna2age : dict
        Maps a miRNA to its age.
    finname : str
        Base name of the output PDF.
    """
    # seaborn no longer re-exports pyplot as ``sns.plt``
    import matplotlib.pyplot as plt

    class_vals = flatten(classifier.values())

    pd_precursor = []
    for val in numtot_dict:
        if val not in mirna2age:
            continue
        if val in class_vals:
            label = 'In miRNA %s' % (class_name)
        else:
            label = 'Not in miRNA %s' % (class_name)
        pd_precursor.append([numtot_dict[val], label, mirna2age[val]])

    db = pd.DataFrame(pd_precursor,
                      columns=[class2_enter, 'miRNA Class', 'Age (MY)'])
    # single-argument call form prints the same on Python 2 and Python 3
    print(spearmanr(db[class2_enter].tolist(), db['Age (MY)'].tolist()))

    sns.boxplot(x='Age (MY)', y=class2_enter, showfliers=False, data=db)
    plt.savefig('../figures/%s.pdf' % (finname), bbox_inches='tight')
    plt.close()
Ejemplo n.º 13
0
def plot_return_quantiles(returns, df_weekly, df_monthly, ax=None, **kwargs):
    """Box-plot the daily, weekly and monthly return distributions.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    df_weekly : pd.Series
        Weekly returns of the strategy, noncumulative.
         - See timeseries.aggregate_returns.
    df_monthly : pd.Series
        Monthly returns of the strategy, noncumulative.
         - See timeseries.aggregate_returns.
    ax : matplotlib.Axes, optional
        Axes upon which to plot; defaults to the current axes.
    **kwargs, optional
        Passed to seaborn plotting function.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.

    """
    if ax is None:
        ax = plt.gca()

    # pair each label with its series so the two lists cannot drift apart
    labelled = (
        ("daily", returns),
        ("weekly", df_weekly),
        ("monthly", df_monthly),
    )
    sns.boxplot(data=[series for _, series in labelled], ax=ax, **kwargs)
    ax.set_xticklabels([label for label, _ in labelled])
    ax.set_title("Return quantiles")
    return ax
def posterior_predictive_bin_fracs(post_bin_counts, bin_counts):
    """Compare observed bin fractions with their posterior distribution.

    Both count tables are turned into per-row fractions.  The mean and the
    standard deviation of the posterior fractions are computed within each
    value of index level 1 and drawn as box plots; the mean and standard
    deviation of the observed fractions are overlaid as black dots.

    Parameters
    ----------
    bin_counts : pd.DataFrame
        Observed counts, one column per bin.
    post_bin_counts : pd.DataFrame
        Posterior counts with a MultiIndex on the rows; rows are grouped on
        index level 1.

    Returns
    -------
    ax1, ax2 : matplotlib axes
        The axes of the mean (top) and standard deviation (bottom) panels.
    """
    # compare with mean and variance in category fractions
    bin_fracs = bin_counts.div(bin_counts.sum(axis=1), axis=0)
    mean_bin_frac = bin_fracs.mean(axis=0)
    std_bin_frac = bin_fracs.std(axis=0)

    # get expected mean and expected variance from MCMC samples; the row
    # totals are computed once instead of once per column
    post_bin_fracs = post_bin_counts.div(post_bin_counts.sum(axis=1), axis=0)

    # The ``level=`` argument of mean/std was removed in pandas 2.0.  An
    # unsorted groupby on the same level is the replacement and keeps the
    # group order of the old call.
    by_level = post_bin_fracs.groupby(level=1, sort=False)
    post_bin_mean = by_level.mean()
    post_bin_std = by_level.std()

    plt.figure()
    ax1 = plt.subplot(211)
    sns.boxplot(post_bin_mean, ax=ax1)
    ax1.plot(1 + np.arange(len(mean_bin_frac)), mean_bin_frac, 'ko')
    ax1.set_ylabel('Mean over data')
    ax2 = plt.subplot(212)
    sns.boxplot(post_bin_std, ax=ax2)
    ax2.plot(1 + np.arange(len(std_bin_frac)), std_bin_frac, 'ko')
    ax2.set_ylabel('Std over data')
    ax2.set_xlabel('Bin ID')
    plt.tight_layout()

    return ax1, ax2
Ejemplo n.º 15
0
def plotResults(tr, resultKey='resultInputPsf', doRates=False, title='', asHist=False, doPrint=True, actuallyPlot=True):
    """Tabulate, and optionally plot, detection counts for four methods.

    Parameters
    ----------
    tr : iterable
        Test results.  ``None`` entries and entries whose ``t[resultKey]`` is
        falsy are skipped; the rest are read as
        ``t[resultKey][method]['FN' | 'FP' | 'TP']``.
    resultKey : str
        Key selecting which result set of each entry to use.
    doRates : bool
        Divide all three tables by ``FN + TP`` of the raw counts.
    title : str
        Title placed on both panels.
    asHist : bool
        Draw per-method histograms instead of violin/swarm/box plots.
    doPrint : bool
        Print the column means of the three tables.
    actuallyPlot : bool
        When False, return as soon as the tables are built; matplotlib and
        seaborn are then neither imported nor restyled.

    Returns
    -------
    (TP, FP, FN) : tuple of pd.DataFrame
        One column per method, one row per retained result.
    """
    methods = ['ALstack', 'ZOGY', 'SZOGY', 'ALstack_decorr']
    tr = [t for t in tr if t is not None and t[resultKey]]

    def _table(kind):
        # one column per method holding that count for every result
        return pd.DataFrame({key: np.array([t[resultKey][key][kind] for t in tr])
                             for key in methods})

    FN = _table('FN')
    FP = _table('FP')
    TP = _table('TP')
    title_suffix = 's'
    if doRates:
        # Fix the denominator from the raw counts first.  Dividing FN in
        # place and then reusing it made FP and TP use a wrong denominator.
        # The float cast also avoids integer division on Python 2.
        n_real = (FN + TP).astype(float)
        FN = FN / n_real
        FP = FP / n_real
        TP = TP / n_real
        title_suffix = ' rate'
    if doPrint:
        # single-argument form works on both Python 2 and Python 3
        print('FN:\n%s' % FN.mean())
        print('FP:\n%s' % FP.mean())
        print('TP:\n%s' % TP.mean())

    if not actuallyPlot:
        return TP, FP, FN

    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.style.use('ggplot')

    import seaborn as sns
    sns.set(style="whitegrid", palette="pastel", color_codes=True)

    matplotlib.rcParams['figure.figsize'] = (18.0, 6.0)
    fig, axes = plt.subplots(nrows=1, ncols=2)

    # (table, axes, label stem, panel-specific violin options)
    panels = (
        (TP, axes[0], 'True positive', {'bw': 0.25, 'alpha': 0.5}),
        (FP, axes[1], 'False positive', {'bw': 0.5}),
    )
    for frame, axis, label, violin_kw in panels:
        if not asHist:
            sns.violinplot(data=frame, cut=True, linewidth=0.3, scale='width',
                           ax=axis, **violin_kw)
            if frame.shape[0] < 500:
                sns.swarmplot(data=frame, color='black', size=3, alpha=0.3, ax=axis)
            sns.boxplot(data=frame, saturation=0.5, boxprops={'facecolor': 'None'},
                        whiskerprops={'linewidth': 0}, showfliers=False, ax=axis)
            plt.setp(axis, alpha=0.3)
            axis.set_ylabel(label + title_suffix)
            axis.set_title(title)
        else:
            for t in frame:
                sns.distplot(frame[t], label=t, norm_hist=False, ax=axis)
            axis.set_xlabel(label + title_suffix)
            axis.set_title(title)
            axis.legend(loc='upper left', shadow=True)

    return TP, FP, FN
Ejemplo n.º 16
0
def compare_seq_counts_among_param_sets(out_dir,
                                        qual_vals,
                                        length_vals,
                                        ylim=None):
    """Box-plot sequence counts per data type for every parameter set.

    Each combination of ``qual_vals`` and ``length_vals`` names a
    sub-directory ``minqual<q>_minlength<l>/`` of ``out_dir``.  The rows
    returned by ``sequence_counts`` for that directory are tagged with the
    parameter-set name and plotted as count by data type, one box color per
    parameter set.

    Args:
        out_dir: parent directory; a trailing '/' is optional.
        qual_vals: iterable of minimum-quality values (integers).
        length_vals: iterable of minimum-length values (integers).
        ylim: optional y-limits passed to ``plt.ylim``.
    """
    base_dir = out_dir.rstrip('/')+'/'
    headers = ['smpl', 'data type', 'count', 'param set']

    rows = []
    for param_set in product(qual_vals, length_vals):
        workdir = base_dir+'minqual%i_minlength%i/' % param_set
        label = 'minqual%i_minlength%i' % param_set
        for counts in sequence_counts(workdir):
            rows.append(counts+[label])

    df = pd.DataFrame(rows, columns=headers)

    sns.set(style="ticks")
    # nested boxplot: one group per data type, one hue per parameter set
    sns.boxplot(x="data type", y="count", hue="param set", data=df,
                palette="PRGn")
    plt.xticks(rotation=30)
    if ylim:
        plt.ylim(ylim)
def stripplot_mean_score(df, save_path, atlas=None, suffix=None, x=None,
                         y=None, hue=None, style='whitegrid', fontsize=14,
                         jitter=.2, figsize=(9, 3), leg_pos=2, axx=None):
    """Draw a box plot with an overlaid strip plot of ``x`` against ``y``.

    The 'no'/'yes' values in column ``y`` are replaced by readable labels,
    the 'dataset' column is upper-cased, and the plot is drawn on ``axx``
    with a grey vertical line at x=0.  The display name of ``atlas`` is
    written above the axes and the x tick labels are formatted as
    percentages with an explicit '+' on positive values.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns named by ``x``, ``y`` and 'dataset'.
    atlas : str, optional
        Atlas key ('kmeans', 'ica', 'dictlearn', 'basc'); unknown keys are
        shown as given and ``None`` draws no title.
    x, y, hue : str
        Column names passed to seaborn.
    jitter : float
        Passed to ``sns.stripplot``.
    fontsize : int
        Legend font size.
    axx : matplotlib.Axes, optional
        Axes to draw on; defaults to the current axes.
    save_path, suffix, style, figsize, leg_pos :
        Accepted for interface compatibility; not used here.

    NOTE(review): ``datasets_palette`` is not defined in this function and is
    presumably a module-level name -- confirm.
    """
    aliases = {'kmeans': 'K-Means',
               'ica': 'GroupICA',
               'dictlearn': 'Dictionary Learning',
               'basc': 'BASC'}
    # the former if/else on ``atlas`` assigned the same mapping in both arms
    new_names = {'no': 'Without\n regions extracted',
                 'yes': 'With\n regions extracted'}

    def change_label_name(row, label):
        row[label] = new_names[row[label]]
        return row

    df = df.apply(lambda row: change_label_name(row, y), axis=1)

    # change the name of the dataset to upper
    df['dataset'] = df['dataset'].str.upper()

    rc('xtick', labelsize=12)
    rc('ytick', labelsize=16)
    rc('axes', labelweight='bold')
    rc('legend', fontsize=fontsize)

    if axx is None:
        axx = plt.gca()

    # draw a default vline at x=0 that spans the yrange
    axx.axvline(x=0, linewidth=4, zorder=0, color='0.6')

    sns.boxplot(data=df, x=x, y=y, fliersize=0, linewidth=2,
                boxprops={'facecolor': '0.5', 'edgecolor': '.0'},
                width=0.5, ax=axx)

    sns.stripplot(data=df, x=x, y=y, hue=hue, edgecolor='gray',
                  size=5, split=True, palette=datasets_palette, jitter=jitter,
                  ax=axx)

    axx.set_xlabel('')
    axx.set_ylabel('')
    # The title used the undefined names ``key`` and ``ax``; the function's
    # own ``atlas`` and ``axx`` are what it has to work with.
    panel_title = aliases.get(atlas, atlas)
    if panel_title is not None:
        axx.text(.5, 1.02, panel_title, transform=axx.transAxes, size=15,
                 ha='center')

    # make the positive labels with "+"; raw strings because "\%" is an
    # invalid escape sequence in a normal literal
    tick_labels = []
    for tick in axx.get_xticks():
        sign = '+' if tick > 0 else ''
        tick_labels.append(sign + str(tick) + r'$\%$')
    axx.set_xticklabels(tick_labels)
Ejemplo n.º 18
0
def make_plot(X_train, y_train, X, y, test_data, model, model_name, features, response):
    """Save diagnostic plots for one fitted model.

    Three image files are written, all prefixed with ``model_name``:
    a 2x2 panel for the fifth column of ``X`` (regression, box plot, squared
    residuals, two-variable interaction), a correlation plot of
    ``test_data``, and a coefficient plot of ``diagnosis ~ features``.

    NOTE(review): ``sns.interactplot``, ``sns.corrplot`` and ``sns.coefplot``
    appear to exist only in old seaborn releases -- confirm the pinned
    version before touching the seaborn calls.

    Parameters
    ----------
    X_train, y_train : training data passed to ``model.fit``.
    X, y : data that is plotted and predicted on.
    test_data : table used for the regression, correlation and coefficient
        plots.
    model : estimator exposing ``fit`` and ``predict``.
    model_name : str
        'linear' or 'logistic' selects the interaction plot; any other value
        leaves the fourth panel empty.  Also used as file-name prefix.
    features : list of str
        Terms of the coefficient-plot formula.
    response : unused
    """
    feature = X.columns
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=False)
    sns.regplot(X[feature[4]], y, test_data, ax=ax1)
    sns.boxplot(X[feature[4]], y, color="Blues_r", ax=ax2)
    model.fit(X_train, y_train)
    sns.residplot(X[feature[4]], (model.predict(X) - y) ** 2, color="indianred", lowess=True, ax=ax3)
    # ``==`` instead of ``is``: identity of string literals is an
    # implementation detail and must not decide which plot is drawn
    if model_name == 'linear':
        sns.interactplot(X[feature[3]], X[feature[4]], y, ax=ax4, filled=True, scatter_kws={"color": "dimgray"}, contour_kws={"alpha": .5})
    elif model_name == 'logistic':
        pal = sns.blend_palette(["#4169E1", "#DFAAEF", "#E16941"], as_cmap=True)
        levels = np.linspace(0, 1, 11)
        sns.interactplot(X[feature[3]], X[feature[4]], y, ax=ax4, levels=levels, cmap=pal, logistic=True)
    ax1.set_title('Regression')
    ax2.set_title(feature[4]+' Value')
    ax3.set_title(feature[4]+' Residuals')
    ax4.set_title('Two-value Interaction')
    f.tight_layout()
    plt.savefig(model_name+'_'+feature[4], bbox_inches='tight')

    # Multi-variable correlation significance level
    f, ax = plt.subplots(figsize=(10, 10))
    cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
                              "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)
    sns.corrplot(test_data, annot=False, diag_names=False, cmap=cmap)
    ax.grid(False)
    ax.set_title('Multi-variable correlation significance level')
    plt.savefig(model_name+'_multi-variable_correlation', bbox_inches='tight')

    # complete coefficient plot - believe this is only for linear regression
    sns.coefplot("diagnosis ~ "+' + '.join(features), test_data, intercept=True)
    plt.xticks(rotation='vertical')
    plt.savefig(model_name+'_coefficient_effects', bbox_inches='tight')
Ejemplo n.º 19
0
    def run(self):
        """Evaluate three training sets on the test set and box-plot errors.

        Reads the three input CSVs (two header rows each): the training set,
        the test set, and a training set without covariance shift.  The
        training set is evaluated as is ("adapted") and with its "weights"
        column dropped ("source"); the third set is evaluated as "target".
        The errors are plotted per noise level and saved to the task output.
        """
        # get data
        df_train = pd.read_csv(self.input()[0].path, header=[0, 1])
        df_test = pd.read_csv(self.input()[1].path, header=[0, 1])
        df_train_no_covariance_shift = pd.read_csv(self.input()[2].path,
                                                   header=[0, 1])

        evaluation_setups = [EvaluationStruct("Proposed", rf)]

        def _evaluate(train_frame, tag):
            # score one training set against the test set and tag the result
            scored = evaluate_data(train_frame, noise_levels,
                                   df_test, noise_levels,
                                   evaluation_setups=evaluation_setups)
            scored["data"] = tag
            return scored

        # evaluate the different methods
        df_adapted = _evaluate(df_train, "adapted")
        df_no_adaptation = _evaluate(df_train.drop("weights", axis=1),
                                     "source")
        df_no_covariance_shift = _evaluate(df_train_no_covariance_shift,
                                           "target")
        df = pd.concat([df_adapted, df_no_adaptation, df_no_covariance_shift])

        # plot it
        sns.boxplot(data=df, x="noise added [sigma %]", y="Errors",
                    hue="data", hue_order=["source", "adapted", "target"],
                    fliersize=0)
        # tidy up plot
        plt.ylim((0, 40))
        plt.legend(loc='upper left')

        # finally save the figure
        plt.savefig(self.output().path, dpi=500)
Ejemplo n.º 20
0
def plot_perf_stats(returns, factor_returns, ax=None):
    """Draw a horizontal box plot of bootstrapped performance metrics.

    The distributions come from ``timeseries.perf_stats_bootstrap``; its
    'kurtosis' column is dropped before plotting.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    factor_returns : pd.DataFrame
        data set containing the Fama-French risk factors. See
        utils.load_portfolio_risk_factors.
    ax : matplotlib.Axes, optional
        Axes upon which to plot; defaults to the current axes.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    target_ax = ax if ax is not None else plt.gca()

    stats_frame = timeseries.perf_stats_bootstrap(
        returns, factor_returns, return_stats=False)
    stats_frame = stats_frame.drop('kurtosis', axis='columns')

    sns.boxplot(stats_frame, orient='h', ax=target_ax)

    return target_ax
Ejemplo n.º 21
0
def main():
    """Assign a period to every metadata row and optionally plot boxplots.

    The tab-separated metadata table is read, trailing whitespace is
    stripped from its column names and a Julian-day column is added.
    ``match_ranges`` then matches the rows against the module-level
    ``ranges`` on Chla, Temperature and Phosphate.  The resulting "Period"
    column is written to stdout; with ``--plot`` one PDF per entry of
    ``plotvars`` is saved.
    """
    cli = ArgumentParser()
    cli.add_argument("-m", "--metadata", type=str, required=True,
                     help="Metadata table")
    cli.add_argument("-r", "--rangedef", type=str,
                     help="Range definitions. Not implemented yet.")
    cli.add_argument("-p", "--plot", action="store_true",
                     help="Produce boxplots of each period")
    args = cli.parse_args()

    # Read metadata and tidy the column names
    meta = pd.read_csv(args.metadata, header=0, index_col=0, sep="\t")
    meta.rename(columns=lambda name: name.rstrip(), inplace=True)

    # Add Julian day column
    meta = addJulDay(meta)

    # Match ranges
    matched = match_ranges(ranges, meta,
                           keys=["Chla", "Temperature", "Phosphate"])

    # Write definitions
    matched["Period"].to_csv(sys.stdout, sep="\t")

    if not args.plot:
        return
    for var in plotvars:
        sns.boxplot(data=matched, x="Period", y=var, order=order)
        plt.savefig(var+".pdf", bbox_inches="tight")
        plt.close()
Ejemplo n.º 22
0
def make_plots(groups):
    """Write the four summary figures for the shot-group table.

    Saves 'points.png' (strip plot of moa by ammo), 'boxplot.png' (box plot
    of the same), 'avg_moa.png' (bar plot of mean by ammo) and 'qqplot.png'
    (distribution and probability plot of the non-null 'standard' values).

    Parameters
    ----------
    groups : pd.DataFrame
        Must contain the columns 'ammo', 'moa', 'mean' and 'standard'.
    """
    # x and y are passed by keyword: seaborn >= 0.12 rejects them as
    # positional arguments, while older versions accept the keywords too.
    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")

    plt.clf()
    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")

    plt.clf()
    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")

    plt.clf()
    std = groups["standard"]
    std = std[std.notnull()]

    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
Ejemplo n.º 23
0
def _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys, y_keys, ax,
                                     cmap, n_cat=5, plottype="box"):
    """
    Plot a categorical variable and a continuous variable against each
    other. Types of plots include box plot, violin plot, strip plot and swarm
    plot.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot the bubble plot into

    cmap : matplotlib.cm.colormap
        A matplotlib colormap to use for shading the bubbles

    n_cat : int
        The number of categories; used for creating the colour map

    plottype : {"box" | "violin" | "strip" | "swarm"}
        The type of plot to produce; default is a box plot

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    """
    if x_keys is xlabel:
        keys = y_keys
    elif y_keys is ylabel:
        keys = x_keys
    else:
        raise Exception("Something went terribly, horribly wrong!")

    current_palette = sns.color_palette(cmap, n_cat)
    if plottype == "box":
        sns.boxplot(x=xlabel, y=ylabel, data=df, order=keys,
                    palette=current_palette, ax=ax)
    elif plottype == "strip":
        sns.stripplot(x=xlabel, y=ylabel, data=df, order=keys,
                      palette=current_palette, ax=ax)
    elif plottype == "swarm":
        sns.swarmplot(x=xlabel, y=ylabel, data=df, order=keys,
                      palette=current_palette, ax=ax)
    elif plottype == "violin":
        sns.violinplot(x=xlabel, y=ylabel, data=df, order=keys,
                       palette=current_palette, ax=ax)
    else:
        raise Exception("plottype not recognized!")

    return ax
Ejemplo n.º 24
0
def hist_boxplot(data, obs = None):
    """
        Draw a histogram (top) and horizontal boxplots (bottom) of the
        columns of a pandas.DataFrame.

        :param data: The data to be plotted, typically forecast and reference distributions.
        :type data: pandas.DataFrame
        :param obs: Observed value drawn as a dashed vertical line labelled 'Observed' on both subplots if not None. Defaults to None.
        :type obs: float

        :returns: (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    fig, axes = plt.subplots(nrows=2, ncols=1)
    hist_ax = axes[0]
    box_ax = axes[1]

    data.plot(kind='hist', bins=25, alpha=0.6, ax=hist_ax)
    sns.boxplot(data=data, orient='h', ax=box_ax)

    if obs is not None:
        for panel in axes:
            # span the panel's current y-range with the marker line
            y_low, y_high = panel.get_ylim()
            marker = panel.vlines(obs, y_low, y_high, linestyle='dashed')
            marker.set_label('Observed')
            panel.legend()

    fig.tight_layout()

    return fig, axes
def plot_entropies(results, rotate='oblimin', 
                   dpi=300, figsize=(20,8), ext='png', plot_dir=None): 
    """ Plots real vs. null entropy distributions as grouped boxplots
    
    Args:
        results: a dimensional structure results object; ``results.EFA.results``
            must contain the keys 'entropies_<rotate>' and
            'null_entropies_<rotate>'
        rotate: rotation name used to build those two keys
        dpi: the final dpi for the saved image
        figsize: (width, height) passed to ``plt.figure``
        ext: the extension for the saved figure
        plot_dir: the directory to save the figure. If None, the figure is
            neither saved nor closed
    """
    EFA = results.EFA
    # Work on copies so the stored results do not gain a 'group' column.
    entropies = EFA.results['entropies_%s' % rotate].copy()
    null_entropies = EFA.results['null_entropies_%s' % rotate].copy()
    entropies.loc[:, 'group'] = 'real'
    null_entropies.loc[:, 'group'] = 'null'
    # `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
    stacked = pd.concat([entropies, null_entropies], axis=0)
    # Long form: one row per (group, EFA column, entropy value).
    long_form = stacked.melt(id_vars='group',
                             var_name='EFA',
                             value_name='entropy')
    with sns.plotting_context('notebook', font_scale=1.8):
        f = plt.figure(figsize=figsize)
        sns.boxplot(x='EFA', y='entropy', data=long_form, hue='group')
        plt.xlabel('# Factors')
        plt.ylabel('Entropy')
        plt.title('Distribution of Measure Specificity across Factor Solutions')
        if plot_dir is not None:
            f.savefig(path.join(plot_dir, 'entropies_across_factors.%s' % ext), 
                      bbox_inches='tight', dpi=dpi)
            plt.close()
Ejemplo n.º 26
0
def boxplotify(df, feature, path, title, save=True):
    """Draw boxplots of ``feature`` per decade, split by chart status, and show them.

    Args:
        df: table providing the 'decade' and 'charted' columns plus ``feature``.
        feature: name of the column plotted on the y axis.
        path: value substituted for the ``file`` field of the module-level
            ``staticDir`` template to build the output location.
        title: figure super-title.
        save: when true, the figure is written to disk before being shown.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    fig.suptitle(title, fontsize=20)

    # 'charted' value 0 is drawn red, 1 green.
    status_colors = {0: 'r', 1: 'g'}
    boxplot(
        x=df['decade'],
        y=df[feature],
        hue=df['charted'],
        palette=status_colors,
        linewidth=2,
        ax=ax,
    )

    # Hand-built legend entries instead of the automatically generated ones.
    legend_entries = [
        Patch(color='g', label='Yes'),
        Patch(color='r', label='No'),
    ]
    plt.legend(
        handles=legend_entries,
        title="Charted",
        loc=2,
        bbox_to_anchor=(1, 1),
        ncol=1,
        shadow=True,
    )

    if save:
        fig.savefig(staticDir.format(file=path))

    plt.show()
Ejemplo n.º 27
0
def caixas(exames=None):
    """Save one boxplot PNG per exam code.

    For every code, the rows of the module-level ``dfcontmelt`` whose EXAME
    equals that code are plotted (MEDIDA against HORA, coloured by LOCAL) and
    written to ``boxplot<code>.png`` in the current working directory.

    Args:
        exames: iterable of exam codes. When None, the built-in list of
            twelve codes is used.
    """
    if exames is None:
        # Immutable default kept inside the function to avoid a shared
        # mutable default argument.
        exames = ("BAC", "RBC", "MUC", "CAOXD", "HYA", "PAT",
                  "WBC", "EPI", "TRI", "URI", "YEA", "AMO")
    for exame in exames:
        # Drop whatever figure is current so each file holds a single plot.
        plt.clf()
        plt.close()
        filename = f"boxplot{exame}.png"
        subset = dfcontmelt[dfcontmelt.EXAME == exame]
        sns.boxplot(x="HORA", y="MEDIDA", hue="LOCAL", data=subset, palette="Blues", sym="")
        plt.savefig(filename)
Ejemplo n.º 28
0
def plot_times(config, segment_id, distribution):
    """
    Generates a plot to visualize the performance of the current athlete at a specific segment
    in comparison to other athletes.
    :param config: Config object providing API access via security token
    :param segment_id: ID of the strava segment in question
    :param distribution: Whether to plot the time distribution over efforts instead of a boxplot
    :return: None; the plot is displayed with ``plt.show()``
    """
    client = config.client
    ridden_segs = read_data(DATAFILE)

    all_efforts = client.get_segment_efforts(segment_id)
    # Elapsed time of every effort returned for the segment, in seconds.
    X = np.array([e.elapsed_time.total_seconds() for e in all_efforts])
    # Times stored for this segment in the local data file.
    Y = np.array(list(ridden_segs[segment_id].times))
    if distribution:
        plt.xlabel('Time in seconds')
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # kept here to leave the rendered output unchanged.
        sns.distplot(X, hist=False, rug=True)
        sns.distplot(Y, hist=False, rug=True)
        plt.show()
    else:
        plt.ylabel('Time in seconds')
        # X and Y generally have different lengths. A list keeps them as two
        # separate samples (one box each); np.array([X, Y]) would be ragged
        # (an error on NumPy >= 1.24) or, for equal lengths, a 2-D table that
        # seaborn reads column-wise.
        sns.boxplot(data=[X, Y], orient='v')
        plt.show()
Ejemplo n.º 29
0
def main():
    """Plot output size per pipeline version, zipped versus unzipped.

    The run-sizes CSV named on the command line is generated with
    ``scan_run_folders`` when it does not exist yet, then loaded, its version
    labels are shortened, and the 'outputs' and 'zipped' columns are stacked
    into one long table drawn as horizontal boxplots.
    """
    args = parse_args()
    run_sizes_path: Path = args.run_sizes_csv
    if not run_sizes_path.exists():
        with run_sizes_path.open('w') as run_sizes_csv:
            scan_run_folders(args.runs, run_sizes_csv, args.group_size)

    runs = pd.read_csv(run_sizes_path)
    # Shorten the version labels; replacements are applied in this order.
    label_rewrites = (
        ('version_', 'v'),
        ('-UPDATED-BOWTIE', ''),
        ('RC1', 'r1'),
    )
    for old_text, new_text in label_rewrites:
        runs['version'] = runs['version'].str.replace(old_text, new_text)
    runs = runs.sort_values('version')

    # One copy of the table per size column, tagged with where the size came from.
    all_runs = pd.concat([
        runs.assign(size=runs['outputs'], type='unzipped'),
        runs.assign(size=runs['zipped'], type='zipped'),
    ])

    sns.boxplot(x='size', y='version', hue='type', data=all_runs)
    plt.xlabel('Output size (MB)')
    plt.title('MiSeq Disk Usage')

    plt.show()
Ejemplo n.º 30
0
def plotboxplots(cufflinks_t, target, out):
    """Plot fold-change distributions from a Cufflinks FPKM table and mark one gene on them.

    Two boxplots sharing a y axis are drawn from the filtered fold changes
    returned by ``returnfilterfc``: values <= -1 on the left, values >= 1 on
    the right. The marked gene's fold change is drawn as a dashed red
    horizontal line on the panel matching its sign.

    Args:
        cufflinks_t (str/path): Tab delimited FPKM table.
        target (str): Gene of interest to compare fold change distributions to.
            NOTE(review): this argument is currently not used; the 'Dmpk'
            column is hard-coded below. Confirm before relying on it.
        out (str): Path handed to ``plt.savefig``. The image is written in
            EPS format under exactly this name.

    Returns:
        Nothing. Saves the figure to ``out``.

    """
    fc_filt, samples = returnfilterfc(cufflinks_t)
    dmpk = array(samples['Dmpk']).astype(float)
    # log10 of the mean of the last two entries minus log10 of the mean of
    # the first three entries.
    fc = log10(dmpk[-2:].mean()) - log10(dmpk[:3].mean())
    neg_filt = fc_filt[fc_filt <= -1]
    pos_filt = fc_filt[fc_filt >= 1]
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
    # The marker line goes on the panel whose sign matches the fold change.
    symb = ax2 if fc >= 0 else ax1
    ax1.set_xlabel('Negative Fold change')
    ax2.set_xlabel('Positive Fold change')
    # Values are passed as `y=`: the meaning of the first positional argument
    # of sns.boxplot differs between seaborn versions, while `y=` always
    # yields a vertical box on the shared y axis.
    sns.boxplot(y=neg_filt, showfliers=False, color='Orange', notch=True, orient='v', ax=ax1)
    sns.boxplot(y=pos_filt, showfliers=False, color='Grey', notch=True, orient='v', ax=ax2)
    symb.axhline(y=fc, linewidth=.8, color='red', linestyle='dashed')
    red_patch = mpatches.Patch(color='red', label='DMPK \nFold change = %s' % (round(fc, 2)))
    plt.legend(handles=[red_patch], loc=(.09, .01))
    plt.suptitle('Fold Change Distributions')
    plt.savefig(out, format='eps', dpi=1000)
Ejemplo n.º 31
0
matplotlib.rc('font', **font)

# import mpld3

# mpld3.enable_notebook()

sns.set_style('white')

df = pd.read_csv('Predictions.csv')
# Mean MSE of each feature set, repeated on every row of that feature set.
# groupby/transform computes it in one pass instead of re-filtering the whole
# frame once per row.
df['Average'] = df.groupby('Features')['MSE'].transform('mean')
df = df.sort_values('Average')
# First row of every feature set, in the sorted order.
unique_df = df.drop_duplicates(['Features'])

plt.subplots(figsize=(12, 8))
chart = sns.boxplot(x='Features', y='MSE', data=df, linewidth=1.0,
                    fliersize=2)  # , inner=None
# dup_df = df.drop_duplicates(subset=['Features'], keep='first', inplace=False)
# chart = sns.swarmplot(x='Features', y='MSE', data=df, hue='Correlation', linewidth=1.0, palette='Reds')
palette = matplotlib.cm.get_cmap('BrBG')
min_val = -1.0  #unique_df['Correlation'].min()
max_val = 1.0  #unique_df['Correlation'].max()
# print(min_val)
# print(max_val)
for i, box in enumerate(chart.artists):
    corr = unique_df.iloc[i]['Correlation']
    box.set_facecolor(palette((corr - min_val) / (max_val - min_val)))
    #if corr < 0.0:
    #        box.set_facecolor(palette((-corr / (2 * min_val) + 0.5)))
    #else:
    #        box.set_facecolor(palette((corr / (2 * max_val) + 0.5)))
def result_plots(df):
    """Create four result figures from ``df`` using the pyplot state machine.

    Figures, in creation order:
      1. boxplot of 'CRF' per 'BLEL' for rows with ``delta_t == 30``
      2. boxplot of 'CRF' per 'BLEL', split by 'delta_t' (30 and 15)
      3. scatter of 'CRF' against 'n_chargers' for rows with ``delta_t == 30``
      4. boxplot of 'share_of_demand' / 'share_of_demand_z' per 'BLEL'

    The figures are neither saved nor shown here and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'BLEL', 'CRF', 'delta_t', 'n_chargers',
        'share_of_demand' and 'share_of_demand_z'.
    """
    
    # Figure dimensions; divided by 72 below to obtain inches
    # (presumably given in points - TODO confirm).
    plot_width = 255*2
    box_plot_len = 170*2
    font_size = 12
    
    # CRF over BLEL
    plt.figure(constrained_layout=True, figsize=(plot_width/72,box_plot_len/72))
    plt.grid(axis='y',color='gray', linestyle='-', linewidth=0.5)
    sn.boxplot(x = 'BLEL', 
               y = 'CRF', 
               linewidth = 0.5,
               color = 'aliceblue',
               flierprops={'markersize':2,'markeredgecolor': 'black'},
               showmeans=True,
               meanprops={'marker':"x",'markeredgecolor': 'black'},
               whis = (5,95),
               data = df[df['delta_t'] == 30],
               )
    # Tick labels are hard-coded for six categories
    # (assumes BLEL takes exactly these six values in this order - TODO confirm).
    plt.xticks([0,1,2,3,4,5],['30', '50', '70', '80', '90', '100'], fontsize = font_size)
    plt.yticks(fontsize = font_size)
    
    plt.xlabel('Bus-line electrification level in %', fontsize = font_size)
    plt.ylabel('CRF in %', fontsize = font_size)
    plt.yticks([0,2,4,6,8,10,12])
    
    # CRF over BLEL comparison between two peak averaging durations    
    plt.figure(constrained_layout=True, figsize=(plot_width/72,box_plot_len/72))
    plt.grid(axis='y',color='gray', linestyle='-', linewidth=0.5)
    Comparison = sn.boxplot(x = 'BLEL', 
               y = 'CRF',
               hue = 'delta_t',
               hue_order = [30,15],
               linewidth = 0.5,
               color = 'skyblue',
               flierprops={'markersize':2,'markeredgecolor': 'black'},
               showmeans=True,
               meanprops={'marker':"x",'markeredgecolor': 'black'},
               whis = (5,95),
               data = df,
               )
    plt.xticks([0,1,2,3,4,5],['30', '50', '70', '80', '90', '100'], fontsize = font_size)
    plt.yticks(fontsize = font_size)
    plt.xlabel('Bus-line electrification level in %', fontsize = font_size)
    plt.ylabel('CRF in %', fontsize = font_size)
    plt.yticks([0,2,4,6,8,10,12,14,16])
    # Replace the automatic legend (titled after the hue column) with
    # explicit labels, listed in hue_order.
    Comparison.legend_.set_title('')
    handles, _ = Comparison.get_legend_handles_labels()
    Comparison.legend(handles,['Δt = 30 minutes', 'Δt = 15 minutes'],fontsize = font_size)
    
    # Scatter plot (CRF over number of chargers)
    plt.figure(constrained_layout=True, figsize=(plot_width/72,box_plot_len/72))
    plt.grid(axis='y',color='gray', linestyle='-', linewidth=0.5)
    plt.scatter(x='n_chargers',
                y='CRF', 
                s = 12,
                marker="o", 
                edgecolors='black', 
                c = 'aliceblue',
                data = df[df['delta_t'] == 30],
                )
    plt.xlabel('Number of chargers', fontsize = font_size)
    plt.ylabel('CRF in %', fontsize = font_size)
    plt.xticks([0,5,10,15,20,25,30], fontsize = font_size)
    plt.yticks([0,2,4,6,8,10,12])
    plt.yticks(fontsize = font_size)
    
    # Share of demand charge over BLEL
    # Long form: one row per (BLEL, variable, value) so that the two share
    # columns can be used as the hue of a single boxplot.
    df_modified = pd.melt(df, id_vars=['BLEL'], value_vars=['share_of_demand', 'share_of_demand_z'])
    # Rename the melted column names to the labels shown in the legend.
    df_modified.loc[df_modified['variable']=='share_of_demand','variable'] = 'with SES'
    df_modified.loc[df_modified['variable']=='share_of_demand_z','variable'] = 'without SES'
    
    plt.figure(constrained_layout=True, figsize=(plot_width/72,box_plot_len/72))
    plt.grid(axis='y',color='gray', linestyle='-', linewidth=0.5)
    sn.boxplot(x='BLEL', 
               y='value', 
               hue='variable',
               hue_order=['without SES','with SES'],
               linewidth = 0.5,
               flierprops={'markersize':2,'markeredgecolor': 'gray', 'linewidth':0.2},
               showmeans=True,
               meanprops={'marker':"x",'markeredgecolor': 'black'},
               whis = (5,95),
               data = df_modified,
               )
    plt.xticks([0,1,2,3,4,5],['30', '50', '70', '80', '90', '100'], fontsize = font_size)
    plt.yticks(fontsize = font_size)
    plt.xlabel('Bus-line electrification level in %', fontsize = font_size)
    plt.ylabel(r'$C_{demand}/C^*_{tot}$', fontsize = font_size)
    plt.ylim(0,0.5)
    plt.legend(ncol=1, fontsize = font_size)
Ejemplo n.º 33
0
    axes[0].set_xticklabels(xlabels,rotation=45, horizontalalignment='right')
    axes[0].tick_params(labelsize=8)

    ############################ plot TS start times ##################
    ts_stat = get_gpats_start_end_duration(  get_gpats_data(cur_dir,sta=sta,res='1min') )
    print(ts_stat.info())
    print(ts_stat)

    # round duration to closest 1hour !!
    ts_stat['duration'] = round((ts_stat['last'] - ts_stat['first'])/np.timedelta64(1, 'h') , 1)

    # convert start and end time to numeric
    ts_stat['first'] =  ts_stat['first'].dt.strftime('%H:%M').apply(conversion)
    ts_stat['last'] =  ts_stat['last'].dt.strftime('%H:%M').apply(conversion)

    sns.boxplot(data=ts_stat, x=ts_stat.index.month, y='first', linewidth=2, ax=axes[1])
    axes[1].set_ylabel('Onset (UTC)', color='g', fontsize=15)

    xlabels=[dict_mon[x+1] for x in axes[1].get_xticks()]
    axes[1].set_xticklabels(xlabels,rotation=45, horizontalalignment='right')
    axes[1].tick_params(labelsize=8)
    axes[1].set_xlabel('', color='g', fontsize=15)


    ############################ plot TS end times ##################

    sns.boxplot(data=ts_stat, x=ts_stat.index.month, y='last', linewidth=2, ax=axes[2])
    axes[2].set_ylabel('Finish (UTC)', color='g', fontsize=15)

    xlabels=[dict_mon[x+1] for x in axes[2].get_xticks()]
    axes[2].set_xticklabels(xlabels,rotation=45, horizontalalignment='right')
Ejemplo n.º 34
0
# EDA
data["Status"].value_counts()


data["Country"].value_counts()


plt.figure(figsize = (10, 8))

data.boxplot('Life expectancy ')
plt.show()


plt.figure(figsize = (10, 8))

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally.
sns.boxplot(x = "Status", y = 'Life expectancy ', data = data)  # shows that life expectancy is higher in developed countries.
plt.xlabel("Status", fontsize = 16)
# The y axis shows the 'Life expectancy ' column, so label it accordingly.
plt.ylabel("Life expectancy", fontsize = 16)

plt.show()


data_corr = data[["Life expectancy ",
                 "Adult Mortality",
                 "Schooling",
                 "Total expenditure",
                 "Diphtheria ",
                 "GDP",
                 "Population"]].corr()

data_corr
Ejemplo n.º 35
0
### 3. HANDLING OUTLIERS, EXTREME VALUES & SKEWNESS############################

df.shape  #rows: 10296

df['1stPolYear'].describe()
#Drop values >2016, as the database comes from 2016
df = df.drop(df[df['1stPolYear'] > 2016].index)
sns.kdeplot(df['1stPolYear']).set_title('1st Policy Year')

df['BirthYear'].describe()
#Drop values <1900
df = df.drop(df[df['BirthYear'] < 1900].index)
df['BirthYear'].hist(bins=50).set_title('Birth Year')

df['GrossMthSalary'].describe()
sns.boxplot(x=df['GrossMthSalary'])
#Drop Salary>30000
df = df.drop(df[df['GrossMthSalary'] > 30000].index)
df['GrossMthSalary'].hist(bins=50).set_title('Gross  Monthly Salary')

#Drop CustMonetVal< -2000
df['CustMonetVal'].describe()
sns.boxplot(x=df['CustMonetVal'])
df = df.drop(df[df['CustMonetVal'] < -2000].index)

#Drop ClaimRate > 3
df['ClaimRate'].describe()
sns.boxplot(x=df['ClaimRate'])
df = df.drop(df[df['ClaimRate'] > 3].index)

df['PremLOBMotor'].describe()
Ejemplo n.º 36
0
    paralist = [p for p in Hist_df.columns if p != 'label']
    #    paralist = ['total_distance','avg_moving_speed01' , 'linearity']  #testlist
    #    paralist = [p for p in Hist_df.columns if p.startswith('avg_moving_speed')]  #testlist
    print(str(len(paralist)) + ' boxplots will be created\nfinished boxplots:',
          end=' ')

    # create graphs for each parameter
    for i, para in enumerate(paralist):
        plt.figure(para)

        # ignore 0 values in avg_moving_speed
        if para.startswith('avg_moving_speed'):
            Hist_df[para] = [x if x > 0 else np.nan for x in Hist_df[para]]

        # generate boxplot with overlying datapoints
        ax = sns.boxplot(x='label', y=para, data=Hist_df, showfliers=False)
        ax = sns.swarmplot(x='label',
                           y=para,
                           data=Hist_df,
                           color='black',
                           alpha=0.5)
        # graph formatting
        ax.set_ylim(ax.get_ylim()[0], ax.get_ylim()[1] * 1.075)
        ax.xaxis.label.set_visible(False)

        # perform comparitive statistics on each plot and state them above plot
        Anov_F, Anov_p = anova(*[list(g[para]) for g in samples])
        kw_H, kw_p = kwtest(*[list(g[para]) for g in samples])
        plt.title(
            f'ANOVA p={round(Anov_p,p_dec)}; Kruskal-Wallis p={round(kw_p,p_dec)}'
        )
 ve.append(k)

amostra_paci_2['Subject_ID']=base_ids2['Subject_ID']
amostra_paci_2['fl_severidade']=base_ids2['fl_severidade']

df_vetor={'score':ve}
df_vetor_final=pd.DataFrame(df_vetor,columns=['score'])
print(df_vetor_final)
print()
print(base_unificada5_filtrada)
print()
amostra_paci_2['score']=df_vetor_final['score']
amostra_paci_2=amostra_paci_2.sort_values('score',ascending=False)
amostra_paci_2.to_csv("base_scorada_amostra.csv") 
import seaborn as sns 
sns.boxplot(x=amostra_paci_2['score'])
plt.savefig('boxplot_random.png')
plt.plot(amostra_paci_2['score'],'*')
plt.savefig('distribuicao_random.png')

###################  GRADIENT BOOSTING 
############################################
########
from sklearn.ensemble import GradientBoostingClassifier
tuned_parameters1= {
    "loss":["deviance","exponential"],
    "learning_rate": [0.05,0.075],
    "min_samples_split": np.linspace(0.1, 0.5, 6),
    "min_samples_leaf": np.linspace(0.1, 0.5, 6),
    "max_depth":[4,5,6],
    "max_features":["log2","sqrt"],
# list of categorical cols
categorical_cols = list(df.select_dtypes(include=['object']))
categorical_cols

print("\n>> Dtypes:\n{}".format(df.dtypes))

df.describe()

print(df["Y"].value_counts())

# Numerical data analysis
# For each numeric feature: a distribution plot on its own figure, followed
# by a boxplot of the feature split by the target column "Y".
for numeric_col in ("nr.employed", "euribor3m", "cons.conf.idx"):
    plt.figure(figsize=(10,8))
    sns.distplot(df[numeric_col])

    get_ipython().run_line_magic('matplotlib', 'inline')
    sns.boxplot(data=df, x="Y", y=numeric_col)
    plt.show()
Ejemplo n.º 39
0
# The percentage of data retained from the initial dataset
len(lead_df) / initial[0] * 100

# - We have 70.88% of rows which is quite enough for analysis

# ### Data Visualization

# #### Univariate Analysis

# In[30]:

# Plotting the numerical variables
plt.figure(figsize=(14, 10))
plt.subplot(2, 3, 1)
sns.boxplot(lead_df['Total Time Spent on Website'])
plt.subplot(2, 3, 2)
sns.boxplot(lead_df['TotalVisits'])
plt.subplot(2, 3, 3)
sns.boxplot(lead_df['Page Views Per Visit'])
plt.show()

# The columns `TotalVisits` and `Page Views Per Visit` have outliers in it and needs to be treated

# **Handling the Outliers**

# In[31]:

# Capping the outliers to its 99th quantile value in Total Visits column
quant = lead_df['TotalVisits'].quantile([0.99])
lead_df['TotalVisits'] = np.clip(
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 29 16:24:29 2018

@author: Raktim Mondol
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(style="whitegrid")

data = pd.read_csv('./figure_data/log_loss_data.csv')

ax = sns.boxplot(x="Metrics",
                 y="Score",
                 hue="Method",
                 data=data,
                 palette="Set2",
                 linewidth=2)

fig = ax.get_figure()
fig.savefig("./saved_figures/box_plot_log_loss.png", dpi=300)
Ejemplo n.º 41
0
cols = ['temp', 'atemp', 'windspeed', 'humidity']

pp = sns.pairplot(df[cols],
                  diag_kws=dict(shade=True),
                  diag_kind="kde",
                  kind="reg")

fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
fig.suptitle('Correlação das variáveis numéricas',
             fontsize=14,
             fontweight='bold')

# In[80]:

sns.boxplot(data=df[['temp', 'atemp', 'humidity', 'windspeed', 'count']],
            orient='h')
fig = plt.gcf()
fig.set_size_inches(12, 6)
fig.suptitle('Análise de Outliers', fontsize=14, fontweight='bold')

# In[81]:

fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
fig.set_size_inches(12, 18)

sns.factorplot(x="month",
               y="count",
               data=df,
               kind='bar',
               size=5,
               aspect=1.5,
Ejemplo n.º 42
0
def explore_integrity(interpolated_acti, path):
	""" Explore integrity of data after linear interpolation across trials

	Two figures are saved into ``path``:
	'explore_integrity1.png' shows, for each integer threshold 0..49, how many
	ROI and how many trials contain at least one maximum above the threshold;
	'explore_integrity2.png' shows the per-trial distribution of maxima and a
	heat map of the maxima exceeding the last threshold.

	Arguments:
		interpolated_acti {array} -- activation tensor corrected via linear
			interpolation; 3-D, with ROI on axis 0 and trials on axis 2
			(axis 1 is reduced with a NaN-ignoring maximum)
		path {str} -- directory receiving the two PNG files

	Returns:
		None
	"""
	_, _, K = interpolated_acti.shape
	n_thresholds = 50

	# Maximum over axis 1 for every (ROI, trial) pair. It does not depend on
	# the threshold, so it is computed once rather than once per threshold.
	max_dist = np.nanmax(interpolated_acti, axis = 1)

	nb_flag_roi = []
	nb_flag_trial = []
	for tol in range(n_thresholds):
		max_mask = max_dist > tol
		# A ROI (row) or trial (column) is flagged as soon as one of its
		# entries exceeds the threshold.
		nb_flag_roi.append(int(np.count_nonzero(max_mask.any(axis = 1))))
		nb_flag_trial.append(int(np.count_nonzero(max_mask.any(axis = 0))))

	# The heat map uses the mask of the last threshold evaluated.
	big_flag = max_mask * max_dist

	fig = plt.figure(figsize=(10,5))

	fig.add_subplot(1,2,1)
	plt.plot(np.arange(n_thresholds), nb_flag_roi)
	plt.title('ROI lost')
	plt.xlabel('Threshold')
	plt.ylabel('Number of ROI')

	fig.add_subplot(1,2,2)
	plt.plot(np.arange(n_thresholds), nb_flag_trial)
	plt.title('Trials lost')
	plt.xlabel('Threshold')
	plt.ylabel('Number of trials')
	plt.savefig(os.path.join(path, 'explore_integrity1.png'))


	fig = plt.figure(figsize = (25,15))

	fig.add_subplot(2,1,1)
	# NOTE(review): x has K entries while y is a list of K arrays; verify
	# this call against the installed seaborn version.
	sns.boxplot(x = np.arange(K), y = [max_dist[:,i] for i in range(K)])
	plt.xticks(np.arange(K)[::40],  np.arange(K)[::40], rotation = 'horizontal')
	plt.xlabel('Trials', {'fontsize': 'large', 'fontweight' : 'roman'})
	plt.ylabel('Distribution of maxima across ROI', {'fontsize': 'large', 'fontweight' : 'roman'})

	fig.add_subplot(2,1,2)
	plt.imshow(big_flag, cmap = 'hot')
	plt.xticks(np.arange(K)[::40],  np.arange(K)[::40], rotation = 'horizontal')
	plt.xlabel('Trials', {'fontsize': 'large', 'fontweight' : 'roman'})
	plt.ylabel('ROI', {'fontsize': 'large', 'fontweight' : 'roman'})

	plt.savefig(os.path.join(path, 'explore_integrity2.png'))
Ejemplo n.º 43
0
palb_risk=mer[mer['IndivID'].isin(map(str,list(palb_ind)))]
chek_risk=mer[mer['IndivID'].isin(map(str,list(chek_ind)))]
atm_risk=mer[mer['IndivID'].isin(map(str,list(atm_ind)))]

palb_risk['Version'] = palb_risk['Version'].str.replace('v4beta14','PALB2')
chek_risk['Version'] = chek_risk['Version'].str.replace('v4beta14','CHEK2')
atm_risk['Version'] = atm_risk['Version'].str.replace('v4beta14','ATM')

mer_ver=pd.concat([chek_risk,atm_risk,palb_risk], axis=0)

mer_ver['Age']=mer_ver['Age'].astype('int64')
mer_ver['age_range']=pd.cut(mer_ver['Age'],bins=[20,30,40,50,60,70,80])
mer_ver['BrCaRisk%']=mer_ver['BrCaRisk%'].astype('float')

mer_ver=mer_ver.rename({'Version':'Genes'}, axis='columns')


bx = sns.boxplot(x="age_range", y="ratio", hue="Genes",
                  data=mer_ver, palette='colorblind', sym='').set_title('Risk Ratio')
plt.savefig('risk_ratio.png',dpi=500)


out_data_v4['BrCaRisk%']=out_data_v4['BrCaRisk%'].astype('float')
out_data_v4['Age']=out_data_v4['Age'].astype('int64')
top_ext_v4=out_data_v4.loc[(out_data_v4['Age']==80)&
                        (out_data_v4['BrCaRisk%']>70), ('FamID', 'BrCaRisk%')]

bottom_ext=out_data_v4.loc[(out_data_v4['Age']==80)&
                        (out_data_v4['BrCaRisk%']<2.8), ('FamID', 'BrCaRisk%')]
Ejemplo n.º 44
0
X_train['int.rate'] = X_train['int.rate'].apply(lambda x: x / 100)
X_test['int.rate'] = X_test['int.rate'].str.replace('%', " ").astype(float)
X_test['int.rate'] = X_test['int.rate'].map(lambda x: x / 100)
num_df = X_train.select_dtypes(include=[np.number])
cat_df = X_train.select_dtypes(include='object')
# Code ends here

# --------------
#Importing header files
import seaborn as sns

# Code starts here
cols = num_df.columns
fig, axes = plt.subplots(9, 1, figsize=(10, 10))
for i in range(0, 9):
    sns.boxplot(x=y_train, y=num_df[cols[i]], ax=axes[i])

# --------------
# Code starts here

cols = list(cat_df.columns)
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for i in range(0, 2):
    for j in range(0, 2):
        sns.countplot(x=X_train[cols[i * 2 + j]], hue=y_train, ax=axes[i, j])

# Code ends here

# --------------
#Importing header files
from sklearn.tree import DecisionTreeClassifier
Ejemplo n.º 45
0
from datetime import timedelta
#load the datset
data = pd.read_csv('loan_train.csv')
data.head()
print("csv =  \n", data.head())

#checking null values in the dataset
data.isnull()
print("null = \n", data.isnull())

#droping unwanted data columns
data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
print("after droping column = \n ", data.head(20))

#exploring the data
sns.boxplot(x='education', y='age', data=data)
plt.title("education based on the age ")
#plt.show()

sns.distplot(data['Principal'])
#plt.show()

Var_Corr = data.corr()
sns.heatmap(Var_Corr,
            xticklabels=Var_Corr.columns,
            yticklabels=Var_Corr.columns,
            annot=True)
#plt.show()

sns.countplot(x='Gender', data=data)
plt.title("count of male and female ")
 def visualization(self):
     """
     Build the descriptive-statistics table and the requested plots.

     API request parameters (example values)
         "tableName": "advertising",  # str, database table name
         "X": ["TV", "radio", "newspaper"],  # list, independent variables; with table orientation "h" these are several variable names, with "v" the categorical variable field
         "Y": ["sales"],  # list, dependent variable; used when the table orientation is "v"
         "show_options": ["y_count", "pairs", "corr", "y_corr"], # display options
         "x_count": [], # list, independent variables whose frequency histogram should be shown
         "box": [], # list, independent variables whose box plot should be shown
     :return: on success a dict with "res" (list of rendered items),
         "code" "200" and "msg" "ok!"; if any exception is raised, a dict
         with an empty "data", "code" "500" and the exception args as "msg"
     """
     try:
         res = []
         # Every column is cast to float before any statistic or plot is computed.
         self.table_data = self.table_data.astype("float")
         data = self.table_data.describe()
         # Descriptive statistics table (title reads "descriptive statistical analysis").
         res.append(
             transform_table_data_to_html({
                 "data": data.values.tolist(),
                 "title": "描述性统计分析",
                 "col": data.columns.tolist(),
                 "row": data.index.tolist()
             }))
         # Frequency histogram for each requested independent variable;
         # skipped when the list is missing, empty or starts with a falsy entry.
         if self.config.get("x_count") and self.config.get("x_count")[0]:
             for x in self.config["x_count"]:
                 sns.distplot(self.table_data[x], kde=False)
                 # y-axis label
                 plt.ylabel("frequency")
                 # figure title
                 # plt.title("{} - frequency distribution histogram".format(x))
                 res.append({
                     "title":
                     "{} - 频率分布".format(x),
                     "base64":
                     "{}".format(self.plot_and_output_base64_png(plt))
                 })
         # Frequency histogram of the first dependent variable.
         if "y_count" in self.config["show_options"]:
             sns.distplot(self.table_data[self.config["Y"][0]], kde=False)
             # x-axis label
             plt.xlabel("section")
             # y-axis label
             plt.ylabel("frequency")
             # figure title
             # plt.title("y frequency distribution histogram")
             res.append({
                 "title":
                 "{} - 频率分布".format(self.config["Y"][0]),
                 "base64":
                 "{}".format(self.plot_and_output_base64_png(plt))
             })
         # Box plot for each requested independent variable (same guard as "x_count").
         if self.config.get("box") and self.config.get("box")[0]:
             for x in self.config["box"]:
                 sns.boxplot(self.table_data[x], palette="Set2", orient="v")
                 # figure title
                 # plt.title("{} - Box distribution to check outliers".format(x))
                 res.append({
                     "title":
                     "{} - 箱型图".format(x),
                     "base64":
                     "{}".format(self.plot_and_output_base64_png(plt))
                 })
         # Pairwise relation plot over all columns.
         if "pairs" in self.config["show_options"]:
             sns.pairplot(self.table_data)
             # plt.title("Variable relation in pairs")
             res.append({
                 "title":
                 "变量两两关系图",
                 "base64":
                 "{}".format(self.plot_and_output_base64_png(plt))
             })
         # Annotated heat map of the correlation matrix.
         if "corr" in self.config["show_options"]:
             corr = self.table_data.corr()
             sns.heatmap(corr,
                         xticklabels=corr.columns,
                         yticklabels=corr.columns,
                         linewidths=0.2,
                         cmap="YlGnBu",
                         annot=True)
             # plt.title("Correlation between variables")
             res.append({
                 "title":
                 "相关系数图",
                 "base64":
                 "{}".format(self.plot_and_output_base64_png(plt))
             })
         # Bar chart of each column's correlation with the first dependent
         # variable, sorted in descending order.
         if "y_corr" in self.config["show_options"]:
             self.table_data.corr()[self.config["Y"][0]].sort_values(
                 ascending=False).plot(kind='bar')
             # plt.title("Correlations between y and x")
             res.append({
                 "title":
                 "因变量和各自变量的相关系数图",
                 "base64":
                 "{}".format(self.plot_and_output_base64_png(plt))
             })
         response_data = {"res": res, "code": "200", "msg": "ok!"}
         return response_data
     except Exception as e:
         # Any failure is reported to the caller as an error payload instead of being raised.
         return {"data": "", "code": "500", "msg": "{}".format(e.args)}
#%%

#boxplots
#first, rearrange structures in ASCENDING order (will be plotted as descending, -_-) by density and counts
order = np.argsort(np.median(pcounts, axis=0))[::-1]
#renaming for figure
sois_sort = np.array(sois)[order][:10]

#boxplots of percent counts
plt.figure(figsize=(5, 4))
df = pd.DataFrame(pcounts)
df.columns = sois
g = sns.stripplot(data=df, color="dimgrey", orient="h", order=sois_sort)
sns.boxplot(data=df,
            orient="h",
            showfliers=False,
            showcaps=False,
            boxprops={"facecolor": "None"},
            order=sois_sort)
plt.xlabel("% of neocortical neurons")
plt.ylabel("Region")

#hide the right and top spines
sns.despine(top=True, right=True, left=False, bottom=False)

plt.tick_params(length=6)

plt.savefig(os.path.join(dst, "prv_nc_pcounts_boxplots.pdf"),
            bbox_inches="tight")

#%%
Ejemplo n.º 48
0
clf = SVC()
clf.fit(X_train, y_train)
linear_svc = LinearSVC()

print("Accuracy:{}".format(clf.score(X_test, y_test)))

#Create initial prediction
test = df_test[Numeric_Columns].fillna(-1000)
SubMission['Survived'] = clf.predict(test)

#Make first Submission
SubMission.set_index("PassengerId", inplace=True)
SubMission.to_csv('myFirstSubmission.csv', sep=',')

fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(12, 6))
sns.boxplot(data=df_train, x="Pclass", y="Fare", ax=ax1)
plt.figure(1)
sns.boxplot(data=df_train, x="Embarked", y="Fare", ax=ax2)
plt.show()

embarked = ['S', 'C', 'Q']
# Median fare per embarkation port, indexed by the port label. The lookup
# must be by label: groupby sorts its keys, so a position taken from the
# `embarked` list would pick another port's median.
median_fare_by_port = df_data.groupby('Embarked')['Fare'].median()
for port in embarked:
    if port not in median_fare_by_port.index:
        # No passenger embarked at this port, so there is nothing to impute.
        continue
    fare_to_impute = median_fare_by_port[port]
    df_data.loc[(df_data['Fare'].isnull()) & (df_data['Embarked'] == port),
                'Fare'] = fare_to_impute

#Fare in df_train an df_test
df_train["Fare"] = df_data["Fare"][:891]
df_test["Fare"] = df_data["Fare"][891:]
print("Missing Fares Estimated")
Ejemplo n.º 49
0
a = list(range(1, 10))
for i in b:
    a[i - 1] = stats.percentileofscore(summary['numberall'].to_list(), i)
plt.plot(b, a)

summary['numberall'].hist()
df_total['averagerating'].hist()
plt.scatter(df_total['startyear'], df_total['age'])

#Which age does the director most often produce film
year_a
year_a['startyearlen'][year_a['count'] > 1].describe()
year_a['startyearlen'][year_a['count'] > 1].hist()

#Glimps of the data
sns.boxplot(df['numVotes'])
df['numVotes'].describe()
#The numVotes is heavily tailed, most movies only has less than 300 people to vote
np.corrcoef(df['numVotes'], df['averageRating'])
sns.scatterplot(x='numVotes', y='averageRating', data=df)

# Open the PostgreSQL connection (the connection parameters are elided in
# this excerpt).  The handle is bound to `conn`, the name the cursor call
# below uses; the original bound it to `con` only, which made
# `conn.cursor()` fail with a NameError.
conn = psycopg2.connect(
    #user name
)
# Keep the original spelling bound as well, in case later code refers to it.
con = conn
#create a cursor
cur = conn.cursor()
#execute a query
cur.execute('SELECT title, directors FROM data')
# Fetch the first result row.  The original `cur.fetch` was never called and
# is not a DB-API cursor method; `fetchone()` matches the singular name `row`.
row = cur.fetchone()

#commit the changes
Ejemplo n.º 50
0
# Month-end closing prices: the last observation of every month.  The `how=`
# argument of `resample` no longer exists in pandas, so the aggregation is
# called explicitly.
sampled_by_month = forex_close_price.resample(rule='1m').last()
log_returns = np.log(sampled_by_month / sampled_by_month.shift(1))
pct_returns = sampled_by_month.pct_change()
# The first month has no predecessor, so its return is NaN: drop it exactly
# once from each frame.  (The original sliced `log_returns[1:]` *and* dropped
# the first row, leaving log_returns one month shorter than pct_returns.)
log_returns.drop(log_returns.index[0], inplace=True)
pct_returns.drop(pct_returns.index[0], inplace=True)
# NOTE(review): assigning the NaN-free Series back presumably realigns it on
# the frame's index and re-introduces the NaNs, making this a no-op - confirm.
log_returns.gbpusd = log_returns.gbpusd.dropna()

# --- Monthly log return of every pair for one calendar month ---------------
monthinteger = 9
month = datetime.date(1900, monthinteger, 1).strftime('%B')
# plt.subplots returns (figure, axes), in that order.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
plt.title('Forex seasonality for %s' % month)
plt.ylabel('Monthly log return')
# `.ix` was removed from pandas; a boolean mask is label-based selection.
sns.boxplot(data=log_returns.loc[log_returns.index.month == monthinteger],
            ax=ax)
fig.autofmt_xdate()
fig.savefig(result_dir + 'Monthly log return in %s.png' % month)

# --- Box plot of one pair's monthly log return, January to December --------
for sym in forex_list:
    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    # One array of non-NaN returns per calendar month (1..12).
    monthly_samples = [
        log_returns.loc[log_returns.index.month == m, sym].dropna().values
        for m in range(1, 13)
    ]
    sns.boxplot(data=monthly_samples, ax=ax)
    plt.title('seasonality in %s from 2005-2017' % sym)
    plt.xlabel('Months')
    fig.savefig(result_dir + ('seasonality in %s from 2005-2017.png' % sym))
""" 
# Titles and axis labels are Chinese, so every text element is given the
# `siyuanheiti` font properties object (defined outside this excerpt).
plt.xticks(fontproperties=siyuanheiti)
ax1.set_title('北京各大区二手房每平米单价对比',fontsize=15,fontproperties = siyuanheiti)
ax1.set_xlabel('区域',fontproperties = siyuanheiti)
ax1.set_ylabel('每平米单价',fontproperties = siyuanheiti)

# Number of second-hand homes per region
# NOTE(review): the bar height is read from the 'Price' column of
# df_house_count, which presumably holds the per-region count - confirm.
f2,ax2 = plt.subplots(1,1,figsize=(20,7))
sns.barplot(x='Region', y='Price', palette="Greens_d", data=df_house_count, ax=ax2)
plt.xticks(fontproperties=siyuanheiti)
ax2.set_title('北京各大区二手房数量对比',fontsize=15,fontproperties = siyuanheiti)
ax2.set_xlabel('区域',fontproperties = siyuanheiti)
ax2.set_ylabel('数量',fontproperties = siyuanheiti)

# Total price of second-hand homes per region
f3,ax3 = plt.subplots(1,1,figsize=(20,7))
sns.boxplot(x='Region', y='Price', data=df, ax=ax3)
plt.xticks(fontproperties=siyuanheiti)
ax3.set_title('北京各大区二手房房屋总价',fontsize=15,fontproperties = siyuanheiti)
ax3.set_xlabel('区域',fontproperties = siyuanheiti)
ax3.set_ylabel('房屋总价',fontproperties = siyuanheiti)

# plt.show()

# Analysis of the Size feature
# Left panel: histogram plus KDE of 'Size'; right panel: regression of
# 'Price' on 'Size'.
f4,[ax4,ax5] = plt.subplots(1,2,figsize=(15,5))
sns.distplot(df['Size'],bins=20,ax=ax4,color='r')
sns.kdeplot(df['Size'],shade=True,ax=ax4)
ax4.set_title("北京各大区二手房大小分布",fontproperties=siyuanheiti)
sns.regplot(x='Size',y='Price',data=df,ax=ax5)
ax5.set_title("北京各大区二手房大小与价格分布",fontproperties=siyuanheiti)
Ejemplo n.º 52
0
def _plot_categorical_and_continuous(df,
                                     xlabel,
                                     ylabel,
                                     x_keys,
                                     y_keys,
                                     ax,
                                     cmap,
                                     n_cat=5,
                                     plottype="box"):
    """
    Plot a categorical variable and a continuous variable against each
    other. Types of plots include box plot, violin plot, strip plot and swarm
    plot.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot the bubble plot into

    cmap : matplotlib.cm.colormap
        A matplotlib colormap to use for shading the bubbles

    n_cat : int
        The number of categories; used for creating the colour map

    plottype : {"box" | "violin" | "strip" | "swarm"}
        The type of plot to produce; default is a box plot

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    """
    if x_keys is xlabel:
        keys = y_keys
    elif y_keys is ylabel:
        keys = x_keys
    else:
        raise Exception("Something went terribly, horribly wrong!")

    current_palette = sns.color_palette(cmap, n_cat)
    if plottype == "box":
        sns.boxplot(x=xlabel,
                    y=ylabel,
                    data=df,
                    order=keys,
                    palette=current_palette,
                    ax=ax)
    elif plottype == "strip":
        sns.stripplot(x=xlabel,
                      y=ylabel,
                      data=df,
                      order=keys,
                      palette=current_palette,
                      ax=ax)
    elif plottype == "swarm":
        sns.swarmplot(x=xlabel,
                      y=ylabel,
                      data=df,
                      order=keys,
                      palette=current_palette,
                      ax=ax)
    elif plottype == "violin":
        sns.violinplot(x=xlabel,
                       y=ylabel,
                       data=df,
                       order=keys,
                       palette=current_palette,
                       ax=ax)
    else:
        raise Exception("plottype not recognized!")

    return ax
Ejemplo n.º 53
0
    y_val_fin = pd.concat(y_val_2)
    return X_train_fin, X_test_fin, X_val_fin, y_test_fin, y_train_fin, y_val_fin
        
# Split the data, then give every part a fresh 0..n-1 index and its own copy.
X_train, X_test, X_val, y_test, y_train, y_val = (
    part.reset_index(drop=True).copy()
    for part in test_train_splitter(test_df)
)


# In[35]:


# Box plot of the 'Attr27' training column.
sns.boxplot(X_train['Attr27'])


# In[36]:


# Distribution of the same column.
sns.distplot(X_train['Attr27'])


# In[4]:


# Shapes of all six parts (displayed by the notebook).
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape


# In[5]:
Ejemplo n.º 54
0
# From this plot we can tell that people who did not survive were more likely to belong to third class (the lowest and cheapest class), while people who did survive tended to belong to the higher classes.
# Distribution plot of passenger age
sns.distplot(train['Age'].dropna(), kde=False, bins=30, color='Green')
# Most passengers are somewhere between 20 and 30 years old; the older the age group, the fewer passengers are on board.
# Count plot of passengers with siblings or a spouse on board
sns.countplot(x='SibSp',data=train)
# This plot shows that most people on board had no children, siblings or spouse with them; the second most common value is 1, which is most likely a spouse. There are many single people on board without a spouse or children.
# Distribution plot of the ticket fare
train['Fare'].hist(color='green',bins=40,figsize=(8,4))
# Most ticket prices are between 0 and 50, which makes sense: fares are concentrated at the cheap end because most passengers travel in the cheaper third class.

#%%%Data Cleaning
# We want to fill in the missing ages instead of dropping those rows. One way is to fill in the mean age of all passengers, but we can be smarter and look at the average age per passenger class.
# Box plot with age on the y-axis and passenger class on the x-axis.
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter');

# The wealthier passengers in the higher classes tend to be older, which makes sense. We'll use these average ages to impute Age based on Pclass.
#function 
def impute_age(cols):
    """Return the age from ``cols``, or a class-based default if it is missing.

    Parameters
    ----------
    cols : pd.Series or sequence
        Two values in the order ``(Age, Pclass)``, e.g. one row of
        ``train[['Age', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class (hard-coded values).
    """
    # Read by position.  Plain ``cols[0]`` on a label-indexed Series is
    # deprecated positional access in pandas; ``.iloc`` is the supported
    # spelling, and lists/tuples are still indexed directly.
    values = cols.iloc if hasattr(cols, "iloc") else cols
    age, pclass = values[0], values[1]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

# Now apply that function row by row (axis=1); each row holds Age and Pclass, in that order.
train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
# Now let's check that heatmap again!
Ejemplo n.º 55
0
def plot_NO2_by_district(df):
    """Draw per-district NO2 box plots and save them to ``img/Figure4c.pdf``.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``district`` (x axis), ``no2_2017`` (y axis)
        and ``tipus`` (hue).

    Notes
    -----
    Relies on the module-level ``label_style``, ``ticks_style`` and
    ``legend_style`` objects.  Returns nothing; the figure is written to disk.
    """
    # The figure size belongs to figure creation.  It used to be passed to
    # ``savefig``, which is not where a figure's size is set.
    fig, ax = plt.subplots(figsize=(10, 6))

    flierprops = dict(markerfacecolor='1',
                      markersize=8,
                      marker='o',
                      linestyle='none')
    colors = ['red', 'dodgerblue']

    bplot = sns.boxplot(
        x='district',  # vertical
        y='no2_2017',
        orient='v',
        hue='tipus',
        data=df,
        width=.5,
        palette=colors,
        linewidth=0.5,
        flierprops=flierprops,
        whis=[5, 95],
        order=[
            'Ciutat Vella', 'Eixample', 'Sants-Montjuïc', 'Les Corts',
            'Sarrià-Sant Gervasi', 'Gràcia', 'Horta-Guinardó', 'Nou Barris',
            'Sant Andreu', 'Sant Martí'
        ],
        ax=ax)

    # Recolour every box and the lines that belong to it.
    # NOTE(review): this reads the boxes from ``bplot.artists`` and assumes six
    # lines per box with the median at offset 4, as the original indexing did;
    # confirm against the installed seaborn/matplotlib versions.
    lines_per_box = 6
    median_offset = 4
    for i, box in enumerate(bplot.artists):
        r, g, b, _ = box.get_facecolor()
        col = (r, g, b, .75)  # same hue, 75 % opacity
        box.set_facecolor(col)
        box.set_edgecolor(col)

        first_line = i * lines_per_box
        for j in range(first_line, first_line + lines_per_box):
            line = bplot.lines[j]
            # The median is drawn white so it stays visible on the filled box.
            line_col = '#ffffff' if j == first_line + median_offset else col
            line.set_color(line_col)
            line.set_mfc(line_col)
            line.set_mec(line_col)

    ax.set_xlabel('', **label_style)
    ax.set_ylabel('NO2 levels', **label_style)

    plt.xticks(rotation=90, **ticks_style)
    plt.yticks(**ticks_style)

    # Hide all four spines.
    for side in ("top", "bottom", "right", "left"):
        plt.gca().spines[side].set_visible(False)

    plt.grid(axis='x', alpha=.5, linewidth=.5, color='lightgrey')

    # Keep the tick labels but hide the tick marks on both axes.
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=True)
    plt.tick_params(axis='y',
                    which='both',
                    left=False,
                    right=False,
                    labelleft=True)

    plt.legend(frameon=False, prop=legend_style)

    # Replace the legend entries with English labels, in legend order.
    for t, l in zip(ax.get_legend().texts, ['Traffic', 'Background']):
        t.set_text(l)

    plt.tight_layout()

    plt.savefig('img/Figure4c.pdf', dpi=300)
Ejemplo n.º 56
0
    y_score = res.LLR.values
    y_true = np.array([x.upper() in all_markers_norm
                       for x in res.index]).astype('int')

    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    auc_i = roc_auc_score(y_true, y_score)
    aucs.append([auc_i, 'spatialDE'])

    plt.plot(fpr, tpr, color='green')

# Show the figure drawn so far.
plt.show()

# Turn the collected [auc, method] pairs into a two-column frame.
aucs = pd.DataFrame(aucs, columns=['AUC', 'Method'])

# Box plot of AUC per method, y axis fixed to [0, 1].
sns.boxplot(x='Method', y='AUC', data=aucs)
plt.ylim(0, 1)
plt.show()

# %%

# Start a fresh collection and a fresh figure for the next set of result files.
aucs = []

plt.figure()

for ff in hs_files:
    res = pd.read_table(ff, index_col=0)

    # Score: the 'Z' column; label: 1 when the upper-cased index entry is
    # contained in all_markers_norm, else 0.
    y_score = res.Z.values
    y_true = np.array([x.upper() in all_markers_norm
                       for x in res.index]).astype('int')
Ejemplo n.º 57
0
#f, ax = plt.subplots(figsize=(6, 8))
# Count plot of the income column (the column name has a leading space).
# NOTE(review): no `ax=` is passed, so seaborn draws on the current axes and
# the returned axes object is merely stored in ax[1] - confirm this is intended.
ax[1] = sns.countplot(x=" income", data=dataset, palette="Set1")
ax[1].set_title("Frequency distribution of income variable")
plt.show()

# Distribution of age variable
f, ax = plt.subplots(figsize=(10, 8))
x = dataset['age']
ax = sns.distplot(x, bins=10, color='blue')
ax.set_title("Distribution of age variable")
plt.show()

# Detect outliers in age variable with boxplot
f, ax = plt.subplots(figsize=(10, 8))
x = dataset['age']
ax = sns.boxplot(x)
ax.set_title("Visualize outliers in age variable")
plt.show()

# Visualize income with respect to age variable
f, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(x=" income", y="age", data=dataset)
ax.set_title("Visualize income with respect to age variable")
plt.show()

# Visualize income with respect to age and sex variable
plt.figure(figsize=(8, 6))
ax = sns.catplot(x=" income",
                 y="age",
                 col=" sex",
                 data=dataset,
Ejemplo n.º 58
0
    fig, axs = plt.subplots(nrows=n_rows,
                            ncols=1,
                            figsize=[1.2 * len(dataframes) + 2, 4.0 * n_rows],
                            sharex=True)
    fig.subplots_adjust(hspace=0.5)

    fig.suptitle("DBScan_Clustering result" + titleext,
                 size="xx-large",
                 weight="black")

    # boxplots
    sns.set_style("whitegrid")
    sns.boxplot(ax=axs[0],
                x="Technique",
                y="PuRicall",
                hue="Metric",
                data=redundancy_frame,
                palette="Set2")
    sns.stripplot(ax=axs[0],
                  x="Technique",
                  y="PuRicall",
                  hue="Metric",
                  data=redundancy_frame,
                  palette="Set2",
                  dodge=True,
                  edgecolor="black",
                  linewidth=0.3)

    handles, labels = axs[0].get_legend_handles_labels(
    )  # legend, use to only show half the legend
    axs[0].set_ylabel("Purity & Recall")
Ejemplo n.º 59
0
# Total number of missing cells, then the per-column share of missing values.
# (Bare expressions: their values are only displayed in an interactive session.)
df.isnull().values.sum()
missing_ratio = df.isnull().sum() / len(df)
# Sorted ascending, skipping the ten columns with the smallest share.
missing_ratio.sort_values(ascending=True)[10:]

# check UA_T
null_data = df[df['UA_0.5W_T'].isnull()]

# Drop four rows by MVID
#df = df[(df["MVID"] != 47553) & (df["MVID"] != 46899) & (df["MVID"] != 67598) & (df["MVID"] != 45798)]
df = df[~df["MVID"].isin([47553, 46899, 67598, 45798])]

# check FCO
null_data = df[df['FCO_0.5W_T'].isnull()]

# Fill missing values with the group median: first look at the distribution per CATEGORY
sns.boxplot(x="CATEGORY", y="FCO_0.5W_T", data=df)

# Median of each FCO column per CATEGORY
df[[
    'FCO_0.5W_T', 'FCO_0.5W_M24', 'FCO_0.5W_M26', 'FCO_0.5W_F24',
    'FCO_0.5W_F26', 'CATEGORY'
]].groupby('CATEGORY').median()


# FCO_0.5W_T
def impute_FCO(cols):
    FCO = cols[0]
    GROUP = cols[1]

    if pd.isnull(FCO):
Ejemplo n.º 60
0
final_rcp85 = pd.concat(list_tmp)

# Tag each frame with a numeric id so they can be told apart after stacking:
# 1 = final_histo, 2 = final_rcp45, 3 = final_rcp85.
final_histo = final_histo.assign(Location=1)
final_rcp45 = final_rcp45.assign(Location=2)
final_rcp85 = final_rcp85.assign(Location=3)

cdf = pd.concat([final_histo, final_rcp45, final_rcp85])

# Wide -> long: one row per (Location, temp_bins, value).
# `var_name` must be a scalar for ordinary (non-MultiIndex) columns; the
# one-element list that was passed before is not the documented form.
mdf = pd.melt(cdf, id_vars=['Location'], var_name='temp_bins')

# One group of three boxes (one per Location) for every temperature bin.
ax = sns.boxplot(
    x="temp_bins",
    y="value",
    hue="Location",
    data=mdf,
    showfliers=False,
    palette=[
        sns.xkcd_rgb["medium green"], sns.xkcd_rgb["medium blue"],
        sns.xkcd_rgb["pale red"]
    ],
)  # https://xkcd.com/color/rgb/

#ax.set(ylim=(0, 50))
#plt.legend(title='Smoker', loc='upper left', labels=['RCMs historical', 'RCMs rcp45 scenario', 'RCMs rcp85 scenario'])
# Reuse seaborn's legend handles but label them with `legends`
# (defined outside this excerpt).
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, legends, prop={'size': 15})
ax.set_title('Outaouais Watershed: November to May',
             fontdict={
                 'fontsize': 20,
                 'fontweight': 'bold'
             })