Esempio n. 1
0
def gen_entropy_hist(good, bad):
    """Overlay the entropy distributions of two samples.

    Both inputs are passed through ``gen_entropy_data``. The result for
    ``good`` is drawn in green and the one for ``bad`` in red, and the
    x-axis of each returned axes object is limited to ``(0, 1)``.
    """
    good_entropy = gen_entropy_data(good)
    bad_entropy = gen_entropy_data(bad)
    good_axes = sns.distplot(good_entropy, color='g')
    bad_axes = sns.distplot(bad_entropy, color='r')
    for axes in (good_axes, bad_axes):
        axes.set(xlim=(0, 1))
Esempio n. 2
0
def cycle_time_histogram(cycle_data, bins=30, percentiles=(0.3, 0.5, 0.75, 0.85, 0.95), title=None, ax=None):
    """Draw a histogram of cycle times (in whole days) with percentile markers.

    Parameters
    ----------
    cycle_data : DataFrame
        Must have a ``cycle_time`` column supporting the ``.dt.days``
        accessor. Missing values are dropped.
    bins : int or sequence
        Forwarded to ``sns.distplot``.
    percentiles : sequence of float
        Quantiles (0..1) to mark with a dashed vertical line and a label.
    title : str, optional
        Axes title; left unset when ``None``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created when ``None``.

    Returns
    -------
    The axes that were drawn on.

    Raises
    ------
    UnchartableData
        If fewer than two non-null cycle times are available.
    """
    ct_days = cycle_data['cycle_time'].dropna().dt.days

    if len(ct_days.index) < 2:
        raise UnchartableData("Need at least 2 completed items to draw histogram")

    if ax is None:
        _, ax = plt.subplots()

    sns.distplot(ct_days, bins=bins, ax=ax, axlabel="Cycle time (days)")

    if title is not None:
        ax.set_title(title)

    # Anchor the x-axis at zero while keeping the automatically chosen right edge.
    _, right = ax.get_xlim()
    ax.set_xlim(0, right)

    # Add percentiles. ``Series.items()`` is used because ``iteritems()`` no
    # longer exists in current pandas releases.
    bottom, top = ax.get_ylim()
    for percentile, value in ct_days.quantile(list(percentiles)).items():
        ax.vlines(value, bottom, top - 0.001, linestyles='--', linewidths=1)
        ax.annotate(
            "%.0f%% (%.0f days)" % ((percentile * 100), value,),
            xy=(value, top),
            xytext=(value, top - 0.001),
            rotation="vertical",
            fontsize="small",
            ha="right"
        )

    return ax
Esempio n. 3
0
def plot_frame_displacement(realignment_parameters_file, mean_FD_distribution=None, figsize=(11.7,8.3)):
    """Plot the frame displacement trace of one run plus its distribution.

    The top row shows the trace returned by ``calc_frame_dispalcement`` for
    ``realignment_parameters_file`` next to a vertical distribution plot of
    the same values (sharing the y-limits of the trace).  When
    ``mean_FD_distribution`` is given and non-empty, a second row shows that
    distribution with a vertical line at this run's mean value.

    Parameters
    ----------
    realignment_parameters_file : passed unchanged to ``calc_frame_dispalcement``.
    mean_FD_distribution : sequence, optional
        Group-level values to plot in the second row.
    figsize : tuple
        Figure size forwarded to ``Figure``.

    Returns
    -------
    The created ``Figure``.
    """
    FD_power = calc_frame_dispalcement(realignment_parameters_file)

    # Explicit None/length test: plain truthiness raises ValueError for
    # multi-element numpy arrays and pandas Series.
    has_group_distribution = (mean_FD_distribution is not None
                              and len(mean_FD_distribution) > 0)

    fig = Figure(figsize=figsize)
    FigureCanvas(fig)

    grid = GridSpec(2 if has_group_distribution else 1, 4)

    ax = fig.add_subplot(grid[0,:-1])
    ax.plot(FD_power)
    ax.set_xlim((0, len(FD_power)))
    ax.set_ylabel("Frame Displacement [mm]")
    ax.set_xlabel("Frame number")
    ylim = ax.get_ylim()

    ax = fig.add_subplot(grid[0,-1])
    sns.distplot(FD_power, vertical=True, ax=ax)
    # Align the marginal distribution with the trace next to it.
    ax.set_ylim(ylim)

    if has_group_distribution:
        ax = fig.add_subplot(grid[1,:])
        sns.distplot(mean_FD_distribution, ax=ax)
        ax.set_xlabel("Mean Frame Dispalcement (over all subjects) [mm]")
        MeanFD = FD_power.mean()
        label = "MeanFD = %g"%MeanFD
        plot_vline(MeanFD, label, ax=ax)

    return fig
Esempio n. 4
0
def rysuj_histogram(df, opis):
    """Draw a histogram of ``df.czas_netto_s``, save it as PNG and register it in TRESC.

    The histogram is drawn with a rug plot and no KDE.  A twin x-axis is
    added so the same data can be labelled twice: ticks formatted by
    ``time_ticks`` on the first axes and by ``pace_ticks`` on the twin axes.
    The image is written to ``outputdir + outputdir_rel + "hist-<opis>.png"``
    and an ``<img>`` tag pointing at it is appended to the global ``TRESC``.

    Reads the module-level names ``bins``, ``dT``, ``dpi``, ``outputdir``,
    ``outputdir_rel``, ``time_ticks``, ``pace_ticks`` and ``global_tytul``.

    :param df: object exposing a ``czas_netto_s`` attribute (the plotted values)
    :param opis: text inserted into the output file name
    """
    # Clear whatever figure is current before creating a fresh one below.
    plt.clf()

    # MENU is declared global here but is not used in this function.
    global TRESC, MENU, global_tytul

    fig, ax = plt.subplots(figsize=(11, 5))
    plt.subplots_adjust(bottom=0.18, top=0.85)

    #ax.get_xaxis().tick_bottom()
    #ax.get_yaxis().tick_left()
    
    # First pass: histogram on the current axes, x ticks formatted as time.
    ax1 = sns.distplot(df.czas_netto_s, rug=True, bins=bins, kde=False)
    ax1.xaxis.set_major_formatter(FuncFormatter(time_ticks))
    ax1.xaxis.set_major_locator(MultipleLocator(dT))
    plt.xticks(rotation='vertical')
    ax1.set_xlabel(u"Czas netto")  # label text is Polish for "net time"
    plt.ylabel(u"Zawodników")  # label text is Polish for "competitors"

    # Second pass: a twin x-axis with the same data, ticks formatted as pace.
    # NOTE(review): the second distplot call has no explicit ``ax``, so it
    # draws on whatever pyplot considers current after ``twiny()`` -
    # presumably the twin axes; confirm before reordering these statements.
    ax2 = ax1.twiny()
    ax1 = sns.distplot(df.czas_netto_s, rug=True, bins=bins, kde=False)
    ax2.xaxis.set_major_formatter(FuncFormatter(pace_ticks))
    ax2.xaxis.set_major_locator(MultipleLocator(dT))
    plt.xticks(rotation='vertical')
    ax2.set_xlabel(u"Tempo, min/km")  # label text is Polish for "pace, min/km"
    plt.ylabel(u"Zawodników")

    outFileName = "hist-%s.png" % (opis)
    plt.savefig(outputdir + outputdir_rel + outFileName, dpi=dpi)

    # The HTML fragment references the file name only, not the output directory.
    TRESC += u"<p><img src='%s' alt='%s' /></p>\n" % (outFileName, global_tytul)
    # Clears the figure but does not close it.
    plt.clf()
Esempio n. 5
0
def plot_volume_per_day_hist(transactions, ax=None, **kwargs):
    """
    Draw the distribution of ``transactions.txn_volume``.

    Parameters
    ----------
    transactions : pd.DataFrame
        A strategy's transactions; must expose a ``txn_volume`` attribute.
        See pos.make_transaction_frame(transactions).
    ax : matplotlib.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.
    **kwargs, optional
        Forwarded unchanged to ``sns.distplot``.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were drawn on.
    """
    target_ax = plt.gca() if ax is None else ax

    sns.distplot(transactions.txn_volume, ax=target_ax, **kwargs)
    target_ax.set_title('Distribution of Daily Trading Volume')
    target_ax.set_xlabel('Volume')
    return target_ax
Esempio n. 6
0
    def plot_dfgbrv_dist(self, **kwargs):
        """
        Plot four distribution plots for the deltafactor, deltafactor prime and the
        relative errors for the GBRV fcc, bcc structures.

        Keyword arguments:
            bins: number of histogram bins used for every panel (default: 50).

        Each panel is annotated with the mean (deltafactor columns) or with
        MARE and RMSRE (GBRV columns) of the plotted, NaN-free values.

        Return: `matplotlib` figure.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Read the option once. Popping it inside the loop would apply a
        # user-supplied value to the first panel only.
        bins = kwargs.pop("bins", 50)

        fig, ax_list = plt.subplots(nrows=2, ncols=2, squeeze=True)
        frame = self.get_dfgbrv_dataframe()

        columns = ["deltafactor", "gbrv_fcc", "df_prime", "gbrv_bcc"]
        for ax, col in zip(ax_list.ravel(), columns):
            values = frame[col].dropna()
            sns.distplot(values, ax=ax, rug=True, hist=True, kde=False, label=col, bins=bins)

            # Add text with Mean or (MARE/RMSRE)
            if col in ("deltafactor", "df_prime"):
                text = ["Mean = %.2f" % values.mean()]
            else:
                text = [
                    "MARE = %.2f" % values.abs().mean(),
                    "RMSRE = %.2f" % np.sqrt((values**2).mean()),
                ]

            ax.text(0.8, 0.8, "\n".join(text), transform=ax.transAxes)

        return fig
Esempio n. 7
0
 def explore(self):
     """Draw a 2x2 overview figure of the report data held in ``self.df``.

     Panels (all read from ``self.df``):
       * top-left: distribution of the ``report.duration`` column;
       * top-right: counts of rows whose ``duplicate`` value is missing
         ("UnReported") versus present ("Duplicates");
       * bottom-left: value counts of ``occurcountry``;
       * bottom-right: value counts of the non-null
         ``primarysource.qualification`` codes, mapped to readable names.

     Nothing is returned; the figure is left as the current pyplot figure.
     """
     gs1 = gs.GridSpec(2,2)
     fig = plt.figure(figsize=(15,6))
     # histogram: report duration
     ax1 = fig.add_subplot(gs1[0:1,0])
     sns.distplot(self.df['report.duration'], bins=15, ax=ax1)
     ax1.set_title("patient report maximum delay reception")
     ax1.set_xlabel("Days")
     # bar horizontal: duplicates
     ax2 = fig.add_subplot(gs1[0:1,1])
     duplicates = self.df.duplicate.isnull().value_counts()
     # Label by value, not by position: value_counts() orders by frequency
     # and may return a single row, so positional labels could be swapped
     # or fail with a length mismatch.
     duplicates.index = duplicates.index.map(
         lambda is_missing: "UnReported" if is_missing else "Duplicates")
     duplicates.plot(kind='barh', ax=ax2)
     ax2.set_title("Number of Duplicates Reported")
     # bar horizontal: country occurence
     ax3 = fig.add_subplot(gs1[1:2,0])
     countries = self.df["occurcountry"].value_counts(sort=True, ascending=True)
     countries.plot(kind='barh', ax=ax3)
     ax3.set_title("Countries where reported event occured")
     # bar horizontal: reporting types
     ax4 = fig.add_subplot(gs1[1:2,1])
     reportings = self.df["primarysource.qualification"]
     reportings = reportings[reportings.notnull()].astype('int').value_counts(sort=True, ascending=True)
     labels = {"1": "Physician", "2": "Pharamacist", "3": "Professional", "4": "Lawyer", "5": "Consumer"}
     reportings.index = reportings.index.map(lambda x: labels[str(x)])
     reportings.plot(kind='barh', ax=ax4)
     ax4.set_title("Distribution of Reporting Types")
     plt.tight_layout()
def sb_distplots(plotargs, return_key='close_return', update_type='Revisions'):
    """Plot conditional underpricing distributions. Run set_data(df) first.

    ``plotargs`` is a sequence of ``(df, color, label, height)`` tuples.  For
    each one the ``return_key`` column of ``df`` is drawn on a shared axes
    and annotated with its mean, standard deviation and skewness; ``height``
    is the y-position of the annotation arrow.  A Kruskal-Wallis statistic
    computed over all groups is added to the plot.
    """
    fig, ax = plt.subplots(1, 1, figsize=(16, 5), sharex=True)

    for frame, color, label, height in plotargs:
        kde_label = label + "    Obs={N}".format(N=len(frame))
        sb.distplot(
            frame[return_key],
            ax=ax,
            kde_kws={"label": kde_label, "color": color},
            hist_kws={"histtype": "stepfilled", "color": color},
        )

        returns = frame[return_key]
        mean = returns.mean()
        std = returns.std()
        skew = returns.skew()
        median = returns.median()
        summary = u'μ={:.2f}%,   σ={:.2f},   γ={:.2f}'.format(mean, std, skew)
        arrow = dict(facecolor=cl.rgb2hex(color), width=1.5, headwidth=5, shrink=0.1)
        ax.annotate(
            summary,
            xy=(median + 2, height),
            xytext=(median + 6, height + 0.01),
            arrowprops=arrow,
        )

    groups = [entry[0][return_key] for entry in plotargs]
    H, prob = kruskalwallis(*groups)
    ax.annotate(
        "Kruskal-Wallis: (H={H:.2f}, prob={p:.3f})".format(H=H, p=prob),
        xy=(66, 0.01),
    )

    plt.title("Conditional Underpricing Distributions %s" % update_type)
    plt.ylabel("Density")
    plt.xlim(xmin=-40, xmax=100)
    plt.xlabel("1st Day Returns (%)")
    plt.ylim((0, 0.12))
def DUPLICATE_remove(data, distance_threshold, graph=False):
    """Drop one spot of every mutually-nearest pair closer than a threshold.

    Rows of ``data`` whose point ``(data[i, 2], data[i, 3])`` lies inside the
    polygon built by ``overlapping_grid`` are evaluated; all other rows are
    passed through untouched.  Among the evaluated rows, when two spots are
    each other's nearest neighbour and closer than ``distance_threshold``,
    the first one encountered is kept and its partner is discarded.

    Reads the module-level names ``pixelFormat``, ``pixelSize``, ``nrow``,
    ``ncol`` and ``overlap_region``.

    Parameters
    ----------
    data : 2-D numpy array, one spot per row (columns 2 and 3 are used as
        the point coordinates).
    distance_threshold : number; pairs closer than this are de-duplicated.
    graph : bool; when true, show the distribution of neighbour distances.

    Returns
    -------
    numpy array with the untouched rows followed by the surviving evaluated rows.
    """
    # NOTE(review): assumes DISTANCE_spatial returns, per evaluated row, the
    # distance to and the index of its nearest *other* row as 1-D arrays -
    # confirm against that helper.
    sizeFormat = pixelFormat * pixelSize
    poly = overlapping_grid(nrow, ncol, overlap_region,
                            [sizeFormat, sizeFormat])
    result = np.zeros(data.shape[0], dtype=bool)
    for i in range(data.shape[0]):
        result[i] = poly.contains(Point(data[i, 2], data[i, 3]))
    data_to_evaluate = data[result, :]
    output = data[np.invert(result), :]
    # Calculate Nearest Neighbors distance
    distance, indexes = DISTANCE_spatial(
        data_to_evaluate, data_to_evaluate, neighbors=2)
    if graph:
        # pyplot is used directly: the ``sns.plt`` alias is not available
        # in current seaborn releases.
        plt.grid(False)
        sns.distplot(distance)
        plt.title("%d spots to evaluate" % data_to_evaluate.shape[0])
        sns.despine()
        plt.show()
    # Find which spots is below the distance_threshold
    mask_to_keep = distance > distance_threshold
    to_keep = []
    excluded = set()  # set membership keeps the pairing loop linear
    for source in range(len(indexes)):
        target = indexes[source]
        if (indexes[target] == source
                and distance[source] < distance_threshold
                and source not in excluded):
            to_keep.append(source)
            excluded.add(target)
    mask_to_keep[to_keep] = True
    print(str(len(excluded)) + ' spots discarded')
    data_to_evaluate = data_to_evaluate[mask_to_keep, :]
    output = np.row_stack((output, data_to_evaluate))
    return output
Esempio n. 10
0
def plot_daily_turnover_hist(transactions, positions,
                             ax=None, **kwargs):
    """Draw the distribution of daily turnover rates.

    The turnover series is obtained from
    ``txn.get_turnover(positions, transactions, period=None)``.

    Parameters
    ----------
    transactions : pd.DataFrame
        Prices and amounts of executed trades. One row per trade.
         - See full explanation in tears.create_full_tear_sheet.
    positions : pd.DataFrame
        Daily net position values.
         - See full explanation in tears.create_full_tear_sheet.
    ax : matplotlib.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.
    **kwargs, optional
        Forwarded unchanged to ``sns.distplot``.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were drawn on.

    """

    target_ax = plt.gca() if ax is None else ax
    daily_turnover = txn.get_turnover(positions, transactions, period=None)
    sns.distplot(daily_turnover, ax=target_ax, **kwargs)
    target_ax.set_title('Distribution of Daily Turnover Rates')
    target_ax.set_xlabel('Turnover Rate')
    return target_ax
Esempio n. 11
0
    def plot_returns_cmp(self, only_show_returns=False, only_info=False):
        """Log capital-aware strategy metrics and visualise returns and capital.

        The metrics are always written through ``self.log_func``.  Unless
        ``only_info`` is set, the cumulative algorithm returns are plotted;
        unless ``only_show_returns`` is set as well, a regression plot of
        those returns and a distribution plot of the capital balance follow.
        No benchmark data is used.
        """
        log = self.log_func

        log('买入后卖出的交易数量:{}'.format(self.order_has_ret.shape[0]))

        # (message template, attribute name, scale the value to percent?)
        metrics = (
            ('胜率:{:.4f}%', 'win_rate', True),
            ('平均获利期望:{:.4f}%', 'gains_mean', True),
            ('平均亏损期望:{:.4f}%', 'losses_mean', True),
            ('盈亏比:{:.4f}', 'win_loss_profit_rate', False),
            ('策略收益: {:.4f}%', 'algorithm_period_returns', True),
            ('策略年化收益: {:.4f}%', 'algorithm_annualized_returns', True),
            ('策略买入成交比例:{:.4f}%', 'buy_deal_rate', True),
            ('策略资金利用率比例:{:.4f}%', 'cash_utilization', True),
            ('策略共执行{}个交易日', 'num_trading_days', False),
        )
        for template, attr_name, as_percent in metrics:
            value = getattr(self, attr_name)
            if as_percent:
                value = value * 100
            log(template.format(value))

        if only_info:
            return

        self.algorithm_cum_returns.plot()
        plt.legend(['algorithm returns'], loc='best')
        plt.show()

        if only_show_returns:
            return

        cum_returns = self.algorithm_cum_returns
        sns.regplot(x=np.arange(0, len(cum_returns)), y=cum_returns.values)
        plt.show()

        kde_style = {"lw": 3, "label": "capital blance kde"}
        sns.distplot(self.capital.capital_pd['capital_blance'], kde_kws=kde_style)
        plt.show()
def main_fraction_under_figure(mirna2tar, mirna2age, target2age):
	"""Plot, per miRNA, the fraction of its dated targets that are younger than it.

	For every miRNA present in both ``mirna2tar`` and ``mirna2age`` the ages
	of its targets are looked up in ``target2age`` (targets without an age
	are ignored) and the share with an age below the miRNA's own age is
	computed.  miRNAs with no dated target are skipped, because the fraction
	is undefined for them.  The number of miRNAs used is printed and the
	distribution is saved to ``figures/mirna_age_fraction.pdf``.
	"""
	perc_younger_lst = []
	for mirna in mirna2tar:
		if mirna not in mirna2age:
			continue
		age_set = [target2age[alpha] for alpha in mirna2tar[mirna] if alpha in target2age]
		if not age_set:
			# No dated targets: avoid dividing by zero below.
			continue

		mirna_age = mirna2age[mirna]
		younger = sum(age < mirna_age for age in age_set)
		perc_younger_lst.append(float(younger) / float(len(age_set)))

	print(len(perc_younger_lst))

	sns.distplot(perc_younger_lst)

	plt.gca().set_xlim([0,.6])
	plt.ylabel('Number of miRNAs')
	plt.xlabel('Fraction of Protein Coding Targets Younger than miRNA')
	plt.subplots_adjust(bottom=0.20)
	plt.savefig('figures/mirna_age_fraction.pdf',bbox_inches='tight')
	plt.close()
def count_vote_dist():
    """Count documents per ``total_vote`` bucket, store the counts and plot them.

    Documents of the collection returned by
    ``get_db_inst('AmazonReviews', 'AndroidAPP')`` are counted in 1000
    half-open buckets ``[2*i, 2*i + 2)`` plus one overflow bucket for
    everything at or above the last upper edge.  The bucket bounds and
    counts are written as JSON to
    ``<PROJECT_PATH>/data/amazon_data/vote_counts.json`` and the distribution
    of the counts is shown with seaborn.

    A bucket whose query fails is recorded with a count of 0 (best effort).
    """
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    delta = 2
    x_list = [(i * delta, (i + 1) * delta) for i in range(1000)]
    y_list = []
    for low, high in x_list:
        try:
            count = db_inst.find({"total_vote": {"$gte": low, "$lt": high}}).count()
        except Exception:
            # Best effort: a failing bucket query must not abort the scan.
            count = 0
        else:
            print(count)
        y_list.append(count)
    # "$gte" (not "$gt"): the regular buckets exclude their upper edge, so a
    # value equal to the last edge would otherwise be counted nowhere.
    y_list.append(db_inst.find({"total_vote": {"$gte": x_list[-1][1]}}).count())
    res = {"x": x_list, 'y': y_list}
    out_path = '%s/data/amazon_data/%s' % (PROJECT_PATH, 'vote_counts.json')
    with open(out_path, 'w') as out_file:
        out_file.write(json.dumps(res))
    sns.distplot(y_list)
    plt.show()
Esempio n. 14
0
    def plot_hist(self, struct_type, ax=None, errtxt=True, **kwargs):
        """
        Plot the distribution of the relative error (in percent, with respect
        to the "ae" column) of the "this" and "gbrv_paw" columns for the rows
        whose "struct_type" equals `struct_type`.

        When `errtxt` is true the RMSRE of each code is written on the axes.

        Return: the figure obtained from `get_ax_fig_plt`.
        """
        ax, fig, plt = get_ax_fig_plt(ax)
        import seaborn as sns

        subset = self[self["struct_type"] == struct_type].copy()
        text_y = 0.8
        for code in ("this", "gbrv_paw"):
            rel_err = (100 * (subset[code] - subset["ae"]) / subset["ae"]).dropna()
            sns.distplot(rel_err, ax=ax, rug=True, hist=False, label=code)

            if not errtxt:
                continue

            # Annotate with the RMSRE; each code gets its own text line.
            rmsre = np.sqrt((rel_err**2).mean())
            lines = ["%s RMSRE = %.2f" % (code, rmsre)]
            ax.text(0.6, text_y, "\n".join(lines), transform=ax.transAxes)
            text_y -= 0.1

        ax.grid(True)
        ax.set_xlabel("relative error %")
        ax.set_xlim(-0.8, 0.8)

        return fig
Esempio n. 15
0
def log2_oulierfilter(df_by_cell, plot=False):
    """Drop low-signal columns based on the mean log2 value of common genes.

    ``df_by_cell`` is log2(x + 1) transformed and handed to
    ``find_top_common_genes``.  Columns of that result whose mean exceeds a
    cutoff derived from the overall average minus the standard deviation are
    kept; all other columns are removed from ``df_by_cell``.

    Parameters
    ----------
    df_by_cell : DataFrame
        NOTE(review): presumably genes as rows and cells as columns, going
        by the names used here - confirm against the caller.
    plot : bool
        When true, show a box/strip plot of the kept columns followed by the
        distribution of their mean log2 values.

    Returns
    -------
    tuple
        ``(log2_frame, log2_frame.transpose())`` where ``log2_frame`` is
        log2(x + 1) of the kept columns.  If ``find_top_common_genes``
        returns an empty frame, the unfiltered log2 data is returned in the
        same two layouts.
    """
    log2_df = np.log2(df_by_cell+1)
    top_log2 = find_top_common_genes(log2_df)
    if top_log2.empty:
        print("no common genes found")
        return log2_df, log2_df.transpose()

    # Order columns by decreasing mean.  ``reindex(columns=...)`` replaces
    # ``reindex_axis``, which no longer exists in current pandas releases.
    log_mean = top_log2.mean(axis=0).sort_values(ascending=False)
    log2_sorted = top_log2.reindex(columns=log_mean.index)

    log2_cutoff = np.average(log2_sorted)-np.std(log2_sorted)
    avg_cutoff = np.average(log2_cutoff)

    xticks = []
    keep_col = []
    for col, m in zip(log2_sorted.columns.tolist(), log2_sorted.mean()):
        if m > avg_cutoff:
            keep_col.append(col)
            xticks.append(col+' '+str("%.2f" % m))

    filtered_df_by_cell = df_by_cell[keep_col]
    if plot:
        # Zeros are masked so they do not enter the log2 used for plotting.
        filtered_log2 = np.log2(filtered_df_by_cell[filtered_df_by_cell>0])
        ax = sns.boxplot(data=filtered_log2, whis= .75, notch=True)
        ax = sns.stripplot(x=filtered_log2.columns.values, y=filtered_log2.mean(axis=0), size=4, jitter=True, edgecolor="gray")
        xtickNames = plt.setp(ax, xticklabels=xticks)
        plt.setp(xtickNames, rotation=90, fontsize=9)
        plt.show()
        plt.clf()
        sns.distplot(filtered_log2.mean())
        plt.show()

    log2_expdf_cell = np.log2(filtered_df_by_cell+1)
    log2_expdf_gene = log2_expdf_cell.transpose()
    return log2_expdf_cell, log2_expdf_gene
Esempio n. 16
0
File: visr.py Progetto: CoAxLab/radd
def plot_traces_rts(p, all_traces, rts, names=('A', 'B', 'C', 'D'), tb=1000):
    """Plot simulated traces per panel with an RT distribution on top of each.

    Parameters
    ----------
    p : mapping
        Must provide ``'tr'`` and ``'a'`` entries (their means are used) and
        is forwarded to ``build_multi_axis``.
    all_traces : sequence
        ``all_traces[i][ii]`` is the i-th trace drawn on the ii-th panel.
    rts : dict
        Response times per condition; conditions are processed in sorted key
        order.  Values are multiplied by 1e3 and shifted by the mean of
        ``p['tr']`` times 1e3.
    names : sequence of str
        Subscript used in each panel's mean-RT annotation.
    tb : number
        Ignored: the value is recomputed from the largest shifted RT plus 50
        before it is used.

    Panels with at most one RT get the empty marginal axes but no
    distribution or annotation.
    """
    tr = np.mean(p['tr'])*1e3
    # sorted() accepts a dict directly, so this works with Python 3 key views.
    rtkeys = sorted(rts)
    rt_dists = [np.asarray(rts[k])*1e3-tr for k in rtkeys]
    tb = np.ceil(np.max([np.max(rti) if len(rti)>0 else 0 for rti in rt_dists]))+50
    sns.set(style='white', font_scale=1.5)
    f, axes = build_multi_axis(p, tb=tb)
    clrs = ['#3572C6',  '#c44e52', '#8172b2', '#83a83b']
    for i in range(len(all_traces)):
        for ii, ax in enumerate(axes.flatten()):
            x = np.arange(len(all_traces[i][ii]))
            ax.plot(x, all_traces[i][ii], color=clrs[ii], alpha=.3, lw=.75)

    for i, ax in enumerate(axes.flatten()):
        divider = make_axes_locatable(ax)
        axx = divider.append_axes("top", size=.7, pad=0.01, sharex=ax)
        for spine in ['top', 'left', 'bottom', 'right']:
            axx.spines[spine].set_visible(False)
        axx.set_xticklabels([])
        axx.set_yticklabels([])
        if len(rt_dists[i])<=1:
            continue
        # Label with this panel's own key. The previous ``label=k`` relied on
        # a variable leaked from the list comprehension above.
        sns.distplot(rt_dists[i], ax=axx, label=rtkeys[i], kde=True, hist=True, color=clrs[i], bins=20)
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        text_str = r'$\mu_{%s}=%.fms$'%(names[i], tr+np.mean(rt_dists[i]))
        # ``x`` is the sample index of the last trace drawn above.
        ax.text(x[0]-50, np.mean(p['a'])-.1*np.mean(p['a']), text_str, fontsize=21)
Esempio n. 17
0
 def histogram(self,x=None, y=None, l=None, t=None, **kwargs):
     """
     Shortcut for the histograms available at a given beamline location l
     or time t (both are forwarded to ``to_dataframe``).
     - neither x nor y given: a pair plot over all columns of the table,
       with the x tick labels rotated by 90 degrees
     - only x given: a 1d distribution plot of that column
     - x and y given: a joint plot of the two columns
     - only y given: nothing is drawn
     Extra keyword arguments go to the seaborn call that is used.
     """
     table = self.to_dataframe(l=l, t=t, latex=True)

     if x is None:
         if y is None:
             grid = sns.pairplot(table, **kwargs)
             for panel in grid.axes.flat:
                 plt.setp(panel.xaxis.get_majorticklabels(), rotation=90)
         return

     x_label = self._reformat_label(x)
     if y is None:
         sns.distplot(table[x_label], **kwargs)
         plt.xlabel(x_label)
         return

     y_label = self._reformat_label(y)
     sns.jointplot(x=x_label, y=y_label, data=table, **kwargs)
def make_return_dist_fig(sim_lookup, predictions, pick_K=100, n_bins=200, n_boots=5000):
    """Bootstrap portfolio returns from a simulation lookup table and plot their density.

    ``sim_lookup`` supplies the ``net_ret``, ``weights`` and ``pred`` columns;
    ``predictions`` supplies ``returns``.  The first ``pick_K`` predictions
    are assigned to percentile bins of the simulated predictions, and for
    each of ``n_boots`` resamples one random row offset per pick is added to
    the start of its bin.  The weighted return of each resample is
    annualized with ``LCM.annualize_returns`` and the resulting distribution
    is drawn as a KDE curve.

    Returns the created figure.
    """
    net_returns = sim_lookup['net_ret'].values
    weights = sim_lookup['weights'].values

    # Percentile edges in descending order (100 -> 0).
    percentile_locs = np.linspace(0, 100, n_bins)[::-1]
    bin_edges = np.percentile(sim_lookup['pred'].values, percentile_locs)

    samples_per_bin = len(sim_lookup) / float(n_bins)
    # find bins of first max_K points in prediction
    pred_bins = np.digitize(predictions['returns'] / 100., bin_edges)

    bin_starts = samples_per_bin * pred_bins[:pick_K]
    offsets = np.random.randint(0, samples_per_bin, size=(n_boots, pick_K))
    boot_idx = (bin_starts + offsets).astype(int)

    total_net = np.sum(net_returns[boot_idx], axis=1)
    total_weight = np.sum(weights[boot_idx], axis=1)
    sim_returns = LCM.annualize_returns(total_net / total_weight)

    fig, ax = plt.subplots(figsize=(5.0, 4.0))
    sns.distplot(
        sim_returns,
        bins=100,
        hist=False,
        rug=False,
        ax=ax,
        kde_kws={'color': 'k', 'lw': 3},
    )
    plt.xlabel('Annual returns (%)', fontsize=14)
    plt.ylabel('Probability', fontsize=14)
    plt.title('Estimated portfolio returns', fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.margins(.01, .01)
    plt.tight_layout()
    return fig
Esempio n. 19
0
File: visr.py Progetto: CoAxLab/radd
def plot_rt_dists(simdf, axes=None):
    """Plot a normalised RT histogram for each of the choices A-D.

    Parameters
    ----------
    simdf : DataFrame
        Must have ``choice`` and ``rt`` columns; rows are split by ``choice``.
    axes : array-like of matplotlib Axes, optional
        Axes to draw on, one per target in the order A, B, C, D.  A 2x2 grid
        with a shared x-axis is created when omitted.

    The panels at positions 0 and 2 get a y label, the ones at 2 and 3 an
    x label and tick labels; every panel is limited to 0-900 on the x-axis.
    """
    targets = ['A', 'B', 'C', 'D']
    targetColors = dict(zip(targets, ['#3572C6',  '#c44e52', '#8172b2', '#83a83b']))
    sns.set(style='white')
    if axes is None:
        _, axes = plt.subplots(2, 2, figsize=(9, 6), sharex=True)
    # Always work on the flattened axes we draw on.  The previous code
    # re-read them from the figure handle, which only exists when the axes
    # were created here, so passing ``axes`` in raised a NameError.
    axes = np.asarray(axes).flatten()
    for i, ax in enumerate(axes):
        target = targets[i]
        rts = simdf[simdf.choice==target].rt.values
        sns.distplot(rts, kde=False, hist_kws={'alpha':.9}, norm_hist=True, bins=10, ax=ax, color=targetColors[target])
        top = ax.get_ylim()[1]*.75
        ax.text(750, top,  target, color=targetColors[target], fontsize=19)
    x = np.array([0,300,600,900])
    axes[0].set_ylabel('Probability Mass', fontsize=17)
    axes[2].set_ylabel('Probability Mass', fontsize=17)
    axes[2].set_xlabel('Time (ms)', fontsize=17)
    axes[3].set_xlabel('Time (ms)', fontsize=17)
    for ax in axes:
        ax.set_title('')
        ax.set_xticks(x)
        ax.set_yticklabels('')
        ax.set_xlim(0,900)
    axes[2].set_xticklabels(x, fontsize=12)
    axes[3].set_xticklabels(x, fontsize=12)
    sns.despine()
Esempio n. 20
0
def hist_boxplot(x='', category='', df=pandas.DataFrame(), colors={}, xlim=[], bins=[], alpha=0.9, box_step=0.15, ax=None):
    """Draw cumulative step histograms of ``x`` per category with box plots above.

    Parameters
    ----------
    x : str
        Column of ``df`` holding the values to plot.
    category : str
        Column of ``df`` used to split the data.
    df : pandas.DataFrame
        Source data.
    colors : dict or list
        A dict maps category value -> colour and also selects which
        categories are drawn (in key order).  A list supplies colours that
        are consumed from its end, one per distinct category value.
    xlim : sequence of two numbers
        Value range; it also defines the histogram bins.
    bins :
        Ignored: the bins are always recomputed from ``xlim``.
    alpha : float
        Opacity of the histogram lines.
    box_step : float
        Vertical spacing between the box plots.
    ax : matplotlib Axes
        Axes to draw on (required).

    Returns
    -------
    The axes that were drawn on.
    """
    # A plain list is needed so the tick labels can be concatenated below;
    # adding a pandas Series to a list is an element-wise operation instead.
    category_values = list(df[category].drop_duplicates())
    if isinstance(colors, dict):
        category_values = list(colors.keys())
    # Work on a copy so the caller's colour list is not emptied.
    palette = list(colors) if isinstance(colors, list) else colors

    box_position = 1 + (box_step*len(category_values))
    yticks = [0.0,0.2,0.4,0.6,0.8,1.0]
    bins = numpy.arange(xlim[0]-((xlim[1]-xlim[0])/50), xlim[1]+((xlim[1]-xlim[0])/50), (xlim[1]-xlim[0])/100)
    for cv in category_values:
        if isinstance(palette, dict):
            color = palette[cv]
        elif isinstance(palette, list):
            color = palette.pop()
        values = df.loc[(df[category]==cv),:][x].dropna()
        # Built fresh for every call because distplot may modify the dict.
        hist_kws = {'cumulative':True,'histtype':'step','lw':1,'alpha':alpha}
        seaborn.distplot(values, color=color, kde=False, bins=bins, ax=ax, hist_kws=hist_kws, norm_hist=True, label=cv)
        box = ax.boxplot(values.tolist(), positions=[box_position,], vert=False, showfliers=False, widths=[0.1,])
        for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
            matplotlib.pyplot.setp(box[element], color=color, linestyle='solid')
        yticks.append(box_position)
        box_position = box_position - box_step
    ax.set_xlabel(x)
    ax.set_ylabel('Cumulative frequency')
    ax.set_xlim(numpy.mean([xlim[0],min(bins)]),numpy.mean([xlim[1],max(bins)]))
    ax.set_ylim(-0.02, 1.1+(box_step*len(category_values)))
    ax.set_yticks(yticks)
    # Numeric ticks for the cumulative axis, then one label per box plot.
    yticklabels = [ y for y in yticks if y<=1 ] + category_values
    ax.set_yticklabels(yticklabels)
    return ax
Esempio n. 21
0
def plot_flux_distributions(plt, old_mag, new_mag, old_weighted_rms, new_weighted_rms,
                            faint, bright, old_PA1, new_PA1,
                            name='', outdir='.plots'):
    """Plot the old/new weighted rms against magnitude and as distributions.

    Two figures are created and saved as PDF in ``outdir``:
    ``<name>-photometry-PA1.pdf`` (rms versus magnitude) and
    ``<name>-photometry-rms.pdf`` (rms histograms with a log-normal fit).

    Parameters
    ----------
    plt : matplotlib.pyplot instance
        pyplot instance to plot with
    old_mag : np.array
        old magnitudes
    new_mag : np.array
        new magnitudes
    old_weighted_rms : np.array
        old rms weighted by the mean (rms(data)/mean(data))
    new_weighted_rms : np.array
        new rms weighted by the mean (rms(data)/mean(data))
    faint : float
        Faint end of range that PA1 was computed from; drawn as a vertical line.
    bright : float
        Bright end of range that PA1 was computed from; drawn as a vertical line.
    old_PA1 : float
        Old value of PA1, to plot as horizontal line.
    new_PA1 : float
        New value of PA1, to plot as horizontal line.
    name : str
        Name to include in the names of the saved files.
    outdir : str, optional
        Directory to write the saved plots to.
    """

    import seaborn
    seaborn.set_style('whitegrid')
    import scipy.stats

    old_color = 'blue'
    new_color = 'red'
    series = (
        (old_mag, old_weighted_rms, old_color, 'old'),
        (new_mag, new_weighted_rms, new_color, 'new'),
    )

    # Figure 1: per-source rms as a function of magnitude.
    plt.figure()
    for mag, weighted_rms, color, label in series:
        plt.plot(mag, weighted_rms, '.', color=color, label=label)
    for limit in (faint, bright):
        plt.axvline(limit, ls=':', color=old_color)
    plt.axhline(old_PA1, ls='--', color=old_color)
    plt.axhline(new_PA1, ls='--', color=new_color)
    plt.legend(loc='upper left')
    plt.title('Where is the systematic flux rms limit?')
    plt.xlabel('magnitude')
    plt.ylabel('rms/mean per source')
    pa1_path = os.path.join(outdir, '{}-photometry-PA1.pdf')
    plt.savefig(pa1_path.format(name))

    # Figure 2: distribution of the rms values with a log-normal fit.
    plt.figure()
    for _, weighted_rms, color, label in series:
        seaborn.distplot(weighted_rms, fit=scipy.stats.lognorm, kde=False, label=label, color=color)
    plt.title('Source RMS pre/post-jointcal')
    plt.xlabel('rms(flux)/mean(flux)')
    plt.ylabel('number')
    plt.legend(loc='upper right')
    rms_path = os.path.join(outdir, '{}-photometry-rms.pdf')
    plt.savefig(rms_path.format(name))
Esempio n. 22
0
def skew_plot(app_train, features, filename):
    """
    Visualise the skewness of the given columns of a DataFrame.

    For every column in ``features`` one row with two panels is drawn: the
    distribution with a fitted normal curve, and a probability plot titled
    with the column's skewness.  Rows where the column or ``TARGET`` is
    missing are dropped first.  The figure is saved to ``filename`` and shown.
    """
    n_cols = 2
    n_rows = len(features)
    plt.figure(figsize=(4 * n_cols, 6 * n_rows))

    for row, col in enumerate(features):
        values = app_train[[col, 'TARGET']].dropna()[col]
        left_panel = 2 * row + 1

        plt.subplot(n_rows, n_cols, left_panel)
        sns.distplot(values, fit=stats.norm)
        plt.title(col + ' Original')
        plt.xlabel('')

        plt.subplot(n_rows, n_cols, left_panel + 1)
        stats.probplot(values, plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(values)))
        plt.xlabel('')
        plt.ylabel('')

    plt.tight_layout(h_pad=2.5)
    plt.savefig(filename)
    plt.show()
Esempio n. 23
0
def plot_epi_T1_corregistration(mean_epi_file, wm_file, subject_id, similarity_distribution=None, figsize=(11.7,8.3),):
    """Show the mean EPI image with the white-matter contour drawn on top.

    Parameters
    ----------
    mean_epi_file : path of the image loaded with ``nb.load`` as background.
    wm_file : path of the image whose contour is overlaid in red.
    subject_id : key of this subject in ``similarity_distribution``.
    similarity_distribution : dict, optional
        Maps subject id -> cost value.  When given and non-empty, the
        distribution of all values is plotted above the image with a
        vertical line at this subject's value.
    figsize : tuple
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    The created figure.
    """
    fig = plt.figure(figsize=figsize)

    if similarity_distribution:
        ax = plt.subplot(2,1,1)
        # list(): a Python 3 dict view is not a sequence.
        sns.distplot(list(similarity_distribution.values()), ax=ax)
        ax.set_xlabel("EPI-T1 mincost function (over all subjects)")
        cur_similarity = similarity_distribution[subject_id]
        label = "mincost function = %g"%cur_similarity
        plot_vline(cur_similarity, label, ax=ax)

        ax = plt.subplot(2,1,2)
    else:
        # Subplot indices are 1-based; index 0 is rejected by matplotlib.
        ax = plt.subplot(1,1,1)

    # Load each image once and read both data and affine from it.
    epi_img = nb.load(mean_epi_file)
    func = epi_img.get_data()
    func_affine = epi_img.get_affine()

    wm_img = nb.load(wm_file)
    wm_data = wm_img.get_data()
    wm_affine = wm_img.get_affine()

    slicer = viz.plot_anat(np.asarray(func), np.asarray(func_affine), black_bg=True,
                           cmap = cm.Greys_r,  # @UndefinedVariable
                           figure = fig,
                           axes = ax,
                           draw_cross = False)
    slicer.contour_map(np.asarray(wm_data), np.asarray(wm_affine), linewidths=[0.1], colors=['r',])

    fig.suptitle('coregistration', fontsize='14')

    return fig
Esempio n. 24
0
    def histogramPlot(cls, data, bins, fileLocation, label=''):
        """Draw a plain histogram of ``data`` and hand it to ``cls.saveFigure``.

        The histogram uses ``bins`` bins, has no KDE curve or rug, and
        ``label`` is used as the x-axis label.  The figure and
        ``fileLocation`` are passed to ``cls.saveFigure``.
        """
        import seaborn as sns
        from matplotlib import pyplot as plt

        figure, axes = plt.subplots()
        sns.distplot(data, bins=bins, kde=False, rug=False, axlabel=label, ax=axes)
        cls.saveFigure(figure, fileLocation)
Esempio n. 25
0
def plot_DVARS(title, DVARS_file, mean_DVARS_distribution=None, figsize=(11.7,8.3)):
    """Plot the DVARS trace of one run plus its distribution.

    The values are read from ``DVARS_file`` with ``np.loadtxt``.  The top row
    shows the trace next to a vertical distribution plot of the same values
    (sharing the y-limits of the trace).  When ``mean_DVARS_distribution`` is
    given and non-empty, a second row shows that distribution with a vertical
    line at this run's mean value.

    Parameters
    ----------
    title : str
        Figure title.
    DVARS_file : path of a text file readable by ``np.loadtxt``.
    mean_DVARS_distribution : sequence, optional
        Group-level values to plot in the second row.
    figsize : tuple
        Figure size forwarded to ``Figure``.

    Returns
    -------
    The created ``Figure``.
    """
    DVARS = np.loadtxt(DVARS_file)

    # Explicit None/length test: plain truthiness raises ValueError for
    # multi-element numpy arrays and pandas Series.
    has_group_distribution = (mean_DVARS_distribution is not None
                              and len(mean_DVARS_distribution) > 0)

    fig = Figure(figsize=figsize)
    FigureCanvas(fig)

    grid = GridSpec(2 if has_group_distribution else 1, 4)

    ax = fig.add_subplot(grid[0,:-1])
    ax.plot(DVARS)
    ax.set_xlim((0, len(DVARS)))
    ax.set_ylabel("DVARS")
    ax.set_xlabel("Frame number")
    ylim = ax.get_ylim()

    ax = fig.add_subplot(grid[0,-1])
    sns.distplot(DVARS, vertical=True, ax=ax)
    # Align the marginal distribution with the trace next to it.
    ax.set_ylim(ylim)

    if has_group_distribution:
        ax = fig.add_subplot(grid[1,:])
        sns.distplot(mean_DVARS_distribution, ax=ax)
        ax.set_xlabel("Mean DVARS (over all subjects) [std]")
        MeanFD = DVARS.mean()
        label = "Mean DVARS = %g"%MeanFD
        plot_vline(MeanFD, label, ax=ax)

    fig.suptitle(title, fontsize='14')

    return fig
Esempio n. 26
0
def plotResults(tr, resultKey='resultInputPsf', doRates=False, title='', asHist=False, doPrint=True, actuallyPlot=True):
    """Summarise and optionally plot TP/FP/FN counts of four detection methods.

    Parameters
    ----------
    tr : list
        Test results.  Entries that are ``None`` or whose ``resultKey`` value
        is falsy are ignored.  ``t[resultKey][method]`` must provide the
        keys ``'TP'``, ``'FP'`` and ``'FN'`` for each of the methods
        ``ALstack``, ``ZOGY``, ``SZOGY`` and ``ALstack_decorr``.
    resultKey : str
        Which result set of each entry to read.
    doRates : bool
        Divide all three tables by ``FN + TP`` (computed from the raw counts).
    title : str
        Title of both panels.
    asHist : bool
        Draw distribution plots instead of violin/box plots.
    doPrint : bool
        Print the per-method means of the three tables.
    actuallyPlot : bool
        When false, return the tables without importing or configuring any
        plotting library.

    Returns
    -------
    tuple of DataFrame
        ``(TP, FP, FN)``, one column per method and one row per used entry.
    """
    methods = ['ALstack', 'ZOGY', 'SZOGY', 'ALstack_decorr']
    tr = [t for t in tr if t is not None and t[resultKey]]
    FN = pd.DataFrame({key: np.array([t[resultKey][key]['FN'] for t in tr]) for key in methods})
    FP = pd.DataFrame({key: np.array([t[resultKey][key]['FP'] for t in tr]) for key in methods})
    TP = pd.DataFrame({key: np.array([t[resultKey][key]['TP'] for t in tr]) for key in methods})
    title_suffix = 's'
    if doRates:
        # Take the denominator from the raw counts first: normalising FN in
        # place and then reusing it would divide FP and TP by a wrong total.
        positives = FN + TP
        FN = FN / positives
        FP = FP / positives
        TP = TP / positives
        title_suffix = ' rate'
    if doPrint:
        for table_name, table in (('FN', FN), ('FP', FP), ('TP', TP)):
            print('{}:\n{}'.format(table_name, table.mean()))

    if not actuallyPlot:
        return TP, FP, FN

    # Plotting libraries are imported and styled only when a plot is drawn,
    # so a data-only call does not alter global plotting settings.
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.style.use('ggplot')

    import seaborn as sns
    sns.set(style="whitegrid", palette="pastel", color_codes=True)

    matplotlib.rcParams['figure.figsize'] = (18.0, 6.0)
    fig, axes = plt.subplots(nrows=1, ncols=2)

    if not asHist:
        sns.violinplot(data=TP, cut=True, linewidth=0.3, bw=0.25, scale='width', alpha=0.5, ax=axes[0])
        # Swarm plots are only drawn for fewer than 500 rows.
        if TP.shape[0] < 500:
            sns.swarmplot(data=TP, color='black', size=3, alpha=0.3, ax=axes[0])
        sns.boxplot(data=TP, saturation=0.5, boxprops={'facecolor': 'None'},
                    whiskerprops={'linewidth': 0}, showfliers=False, ax=axes[0])
        plt.setp(axes[0], alpha=0.3)
        axes[0].set_ylabel('True positive' + title_suffix)
        axes[0].set_title(title)
        sns.violinplot(data=FP, cut=True, linewidth=0.3, bw=0.5, scale='width', ax=axes[1])
        if FP.shape[0] < 500:
            sns.swarmplot(data=FP, color='black', size=3, alpha=0.3, ax=axes[1])
        sns.boxplot(data=FP, saturation=0.5, boxprops={'facecolor': 'None'},
                    whiskerprops={'linewidth': 0}, showfliers=False, ax=axes[1])
        plt.setp(axes[1], alpha=0.3)
        axes[1].set_ylabel('False positive' + title_suffix)
        axes[1].set_title(title)
    else:
        for t in TP:
            sns.distplot(TP[t], label=t, norm_hist=False, ax=axes[0])
        axes[0].set_xlabel('True positive' + title_suffix)
        axes[0].set_title(title)
        axes[0].legend(loc='upper left', shadow=True)
        for t in FP:
            sns.distplot(FP[t], label=t, norm_hist=False, ax=axes[1])
        axes[1].set_xlabel('False positive' + title_suffix)
        axes[1].set_title(title)
        axes[1].legend(loc='upper left', shadow=True)

    return TP, FP, FN
Esempio n. 27
0
def Array_Grapher(array):
    """
    Draw the distribution of ``array`` with seaborn, for use from a shell.

    :param array: the values to plot, e.g. an array from a numpy function
    :return: None; the plot is drawn on the current axes
    """
    seaborn.distplot(a=array)
Esempio n. 28
0
def plot_volume_per_day_hist(transactions, ax=None, **kwargs):
    """Draw the distribution of ``transactions.txn_volume``.

    Parameters
    ----------
    transactions : pd.DataFrame
        Daily transaction volume and dollar amount; must expose a
        ``txn_volume`` attribute.
         - See full explanation in tears.create_full_tear_sheet.
    ax : matplotlib.Axes, optional
        Axes to draw on. The current pyplot axes are used when omitted.
    **kwargs, optional
        Forwarded unchanged to ``sns.distplot``.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were drawn on.

    """

    axes = ax
    if axes is None:
        axes = plt.gca()

    volume = transactions.txn_volume
    sns.distplot(volume, ax=axes, **kwargs)
    axes.set_title('Distribution of Daily Trading Volume')
    axes.set_xlabel('Volume')
    return axes
Esempio n. 29
0
def rolling_success_diff(answers, last_count=4, filters=None, only_last=True):
    """Compare each user's rolling success rate with their overall success rate.

    For every value in ``filters`` the answers are first reduced with
    ``filter_users(answers, min_answer_count=value)``. Users with fewer than
    ``last_count`` answers are skipped. For the remaining users the rolling
    mean of ``correct`` over ``last_count`` answers is compared with the
    user's overall mean of ``correct``.

    * ``only_last=True``: one row per user, holding the difference for the
      final window; the per-filter distributions are drawn with
      ``sns.distplot``.
    * ``only_last=False``: one row per complete window, with the difference
      rounded to one decimal; the row of each user's final window gets
      ``leave = 1`` and all others ``leave = 0``; a point plot is drawn.

    Returns the DataFrame with columns ``rolling_success_diff``,
    ``min_answers`` and ``leave``.
    """
    thresholds = [None] if filters is None else filters

    rows = []
    for threshold in thresholds:
        eligible = filter_users(answers, min_answer_count=threshold)
        for _, user_answers in eligible.groupby('user'):
            overall = user_answers['correct'].mean()
            if len(user_answers) < last_count:
                continue
            windows = user_answers['correct'].rolling(last_count, last_count).mean()
            for x in windows:
                # incomplete windows yield NaN and are never recorded
                if not np.isnan(x) and not only_last:
                    rows.append([np.round(x - overall, 1), threshold, 0])
            if only_last:
                # x still holds the value of the user's final window
                rows.append([x - overall, threshold, 1])
            else:
                # flag the most recently recorded window as the "leave" row
                rows[-1][-1] = 1

    result = pd.DataFrame(rows, columns=['rolling_success_diff', 'min_answers', 'leave'])
    if only_last:
        for threshold in thresholds:
            selected = result.loc[result['min_answers'] == threshold, 'rolling_success_diff']
            sns.distplot(selected, label=str(threshold))
        plt.legend(loc=1)
    else:
        sns.pointplot(data=result, x='rolling_success_diff', y='leave', hue='min_answers').set(ylim=(0, 0.2))
    return result
Esempio n. 30
0
    def plot_vlast_density(self, nsim=100, nobs=100, param=None):
        """Plot the marginal density of ARG process.

        Draws a KDE with rug marks (no histogram) of the values returned by
        ``self.vsim_last`` on a fresh 8x4 figure and shows it. ``nsim`` and
        ``nobs`` are converted to ``int`` before being forwarded.
        """
        plt.figure(figsize=(8, 4))
        n_sim, n_obs = int(nsim), int(nobs)
        simulated = self.vsim_last(nsim=n_sim, nobs=n_obs, param=param)
        sns.distplot(simulated, hist=False, rug=True)
        plt.show()
Esempio n. 31
0
# Visualize frequency distribution of income variable
# NOTE: the column name ' income' really does start with a space.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# explode has two entries, so this presumably expects exactly two income
# classes -- TODO confirm against the data.
ax[0] = dataset[' income'].value_counts().plot.pie(explode=[0, 0],
                                                   autopct='%1.1f%%',
                                                   ax=ax[0],
                                                   shadow=True)
ax[0].set_title('Income Share')
#f, ax = plt.subplots(figsize=(6, 8))
# No ax= is passed here, so seaborn draws on matplotlib's current axes.
ax[1] = sns.countplot(x=" income", data=dataset, palette="Set1")
ax[1].set_title("Frequency distribution of income variable")
plt.show()

# Distribution of age variable
f, ax = plt.subplots(figsize=(10, 8))
x = dataset['age']
# ax is rebound to the Axes returned by seaborn.
ax = sns.distplot(x, bins=10, color='blue')
ax.set_title("Distribution of age variable")
plt.show()

# Detect outliers in age variable with boxplot
f, ax = plt.subplots(figsize=(10, 8))
x = dataset['age']
ax = sns.boxplot(x)
ax.set_title("Visualize outliers in age variable")
plt.show()

# Visualize income with respect to age variable
f, ax = plt.subplots(figsize=(10, 8))
ax = sns.boxplot(x=" income", y="age", data=dataset)
ax.set_title("Visualize income with respect to age variable")
plt.show()
Esempio n. 32
0
# Load the weather history CSV (absolute, machine-specific path).
dataset = pd.read_csv(
    '/Users/kalharaperera/Desktop/Projects/Data Sets/us-weather-history/KCLT.csv'
)

# print(dataset.shape)
print(dataset.describe())

# Scatter of daily minimum against daily maximum temperature.
dataset.plot(x='actual_min_temp', y='actual_max_temp', style='o')
plt.title('Min temp Vs Max temp')
plt.xlabel('Min temp')
plt.ylabel('Max temp')
plt.show()

# Distribution of the regression target (maximum temperature).
plt.figure(figsize=(15, 10))
# plt.tight_layout()
seaborninstance.distplot(dataset['actual_max_temp'])
plt.show()

# Build the feature (X) and target (Y) arrays; reshape(-1, 1) turns each
# column into a 2-D array with a single column.
X = dataset['actual_min_temp'].values.reshape(-1, 1)
Y = dataset['actual_max_temp'].values.reshape(-1, 1)

# Hold out 20% of the data for testing and train on the rest;
# random_state=0 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

regressor = LinearRegression()
# Train the algorithm on the training split.
regressor.fit(X_train, Y_train)
Esempio n. 33
0
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

# In[5]:

ames_train = pd.read_csv('train.csv')
ames_train.columns

# In[15]:

sns.set(style="white", palette="muted", color_codes=True)

#histogram plot
sns.distplot(ames_train['SalePrice'], color='r')

# In[16]:

##rug plot
sns.distplot(ames_train['SalePrice'], hist=False, rug=True, color='m')

# In[19]:

print("Std: %f" % ames_train['SalePrice'].std())
print("Skewness: %f" % ames_train['SalePrice'].skew())
print("Kurtosis: %f" % ames_train['SalePrice'].kurt())

# In[27]:

##build a correlation heatmap
@author: arunramji
"""
#Let's import required libraries
import pandas as pd
pd.options.display.max_rows=999
pd.options.display.max_columns =999
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# Load the Kaggle housing training data (absolute, machine-specific path).
df = pd.read_csv('/Users/arunramji/Downloads/Sourcefiles/Kaggle_Housing_Price/train.csv')

fig, ax = plt.subplots(figsize=(12, 6))

# Distribution of the target variable.
sns.distplot(df['SalePrice'], ax=ax)
plt.show()

# Make subsequent figures wide and short. Assign the new size instead of
# mutating the list returned by rcParams so the value is validated.
fig_size = [16.0, 4.0]
plt.rcParams["figure.figsize"] = fig_size

x = df['SalePrice']
# `normed` was removed from matplotlib; `density` is its replacement.
plt.hist(x, density=True, bins=400)
plt.ylabel('SalePrice')

# Subset of houses priced below 400000.
df_1 = df[df['SalePrice'] < 400000]

# Missing values: null count for every object-typed column ...
Null_Cols = pd.DataFrame(df.select_dtypes(include='object').isnull().sum(), columns=['Null_count'])
# ... and only the columns that actually contain nulls (notebook display).
Null_Cols[Null_Cols.Null_count > 0]
    AAPL[column_name] = pd.Series(AAPL['Adj Close']).rolling(window=ma).mean()
# %%
AAPL[['Adj Close', 'MA for 10 days', 'MA for 20 days', 'MA for 50 days']].plot(subplots=False, figsize=(14,7))
# %%
# If we get a moving average for more days at a time, we get a smoother line, and it's not gonna rely much on the daily
# fluctuation changes.
# %%
# Now retrieving the daily returns for Apple
# What that means is: for any given day, what is your percent return on your money?

AAPL['Daily Return'] = AAPL['Adj Close'].pct_change()
AAPL['Daily Return'].plot(figsize=(14,7), legend=True, linestyle='--', marker='o')
# %%
# This is a histogram of the daily returns for the past year.

sns.distplot(AAPL['Daily Return'].dropna(), bins=100, color='purple')
# %%
# It looks like the above histogram is skewed a little more negatively, but we need to do some more analysis to check
# that out.
# The following graph is just another way to see it.

AAPL['Daily Return'].hist(bins=50)
plt.gcf().set_size_inches(15, 8)
# %%
# Now building up another Dataframe with all the adjusted close columns for each of the stocks Dataframes in order to
# analyse the return of all the stocks in our data list.

closing_df = DataReader(tech_list, 'yahoo', start, end)['Adj Close']
# %%
closing_df.head()
# %%
Esempio n. 36
0
X_soph = np.concatenate([X_soph,encoded_conf_soph],axis=1)



smote =SMOTE(sampling_strategy='minority',k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train,y_train)

#%% Plot distributions
################################################
zeros = features.loc[features['eval'] == 0]
ones = features.loc[features['eval'] == 1]
numeric_cols = features.columns[1:-1]
nplot=1
for col in numeric_cols:
    plt.subplot(int(np.ceil(len(numeric_cols)/3)),3,nplot)
    sns.distplot(zeros[col],hist=False,label='Misses')
    sns.distplot(ones[col],hist=False,label='Hits')
    nplot+=1
    plt.legend()
plt.tight_layout()
plt.show()




# %% Modelling
###############################################

estimator = XGBClassifier()
# estimator = LogisticRegression()
param_grid={
Esempio n. 37
0
tips.head()
# Tip as a fraction of the bill excluding the tip itself.
tips['tip_pct'] = tips.tip/(tips.total_bill-tips.tip)
sns.barplot(x='tip_pct', y='day', data=tips, orient='h')
# length of the bar is the average tip_pct on each day
# black line is the 95% confidence interval
sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')
sns.set(style='whitegrid')
tips.tip_pct.plot.hist(bins=50)
tips.tip_pct.hist(bins=50)
tips.tip_pct.plot.density()
tips.tip_pct.plot.kde()
# Bimodal sample: two normal components centred at 0 and 10.
comp1 = np.random.normal(0, 1, size=200)
comp1
comp2 = np.random.normal(10, 2, size=200)
values = pd.Series(np.concatenate([comp1, comp2]))
values
sns.distplot(values, bins=100, color='k')
# distplot draws both a histogram and a continuous density estimate
macro = pd.read_csv('../examples/macrodata.csv')
macro.head()
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
data.head()
# Log-differences of each series; the first row is NaN after diff() and dropped.
trans_data = np.log(data).diff().dropna()
trans_data[-5:]
# x/y must be passed by keyword: current seaborn rejects positional data variables.
sns.regplot(x='m1', y='unemp', data=trans_data)
plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))
sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})
# catplot is the current name of the former factorplot; rows with tip_pct >= 1 are excluded.
sns.catplot(x='day', y='tip_pct', hue='time', col='smoker', kind='bar', data=tips[tips.tip_pct<1])
sns.catplot(x='day', y='tip_pct', row='time', col='smoker', kind='bar', data=tips[tips.tip_pct<1])
sns.catplot(x='tip_pct', y='day', kind='box', data=tips[tips.tip_pct<1])
Esempio n. 38
0
                                  'price', 'std_score', 'max_score',
                                  'min_score', 'avg_score', 'count'
                              ]).fillna(0)
    corr_p = stats.pearsonr(df_analyze['price'], df_analyze['avg_score'])
    print('STATS | PEARSON R')
    print(corr_p)

    corr_s = stats.spearmanr(df_analyze['price'], df_analyze['avg_score'])
    print('STATS | SPEARMAN')
    print(corr_s)

    df_analyze = SharedUtils.normalize(df_analyze)

    sns.distplot(df_analyze['avg_score'],
                 hist=True,
                 kde=True,
                 color='blue',
                 hist_kws={'edgecolor': 'black'})
    # Add labels
    plt.title('Histogram Sentiment Score (n=169)')
    plt.xlabel('Score')
    plt.ylabel('No. Weeks')
    plt.show()

    sns.distplot(df_analyze['price'],
                 hist=True,
                 kde=True,
                 color='blue',
                 hist_kws={'edgecolor': 'black'})
    # Add labels
    plt.title('Histogram Price (n=169)')
Esempio n. 39
0
    def visualiseMargins(self, df):
        """Plot margins of won and lost matches, split by margin unit.

        Rows with ``Result == 1`` are handled as won and rows with
        ``Result == 0`` as lost. Within each group the rows whose ``Margin``
        text contains "runs" or "wickets" are selected and the leading
        number of ``Margin`` is converted to ``int``.

        Figures are produced in this order, each followed by ``plt.show()``:
        won/runs scatter grid, won/runs distribution (figure 2), won/wickets
        distribution (figure 3), lost/runs scatter grid, lost/runs
        distribution (figure 5), lost/wickets distribution (figure 6).

        :param df: DataFrame with columns ``Result``, ``Margin``, ``Toss``,
            ``Bat``, ``Opposition`` and ``Ground``
        :return: None
        """
        df_won = df[df['Result'] == 1]
        df_w_run = self._numeric_margin_subset(df_won, "runs")
        self._plot_margin_scatter_grid(df_w_run)
        self._plot_margin_distribution(df_w_run, fig_num=2, bins=5)

        df_w_wickets = self._numeric_margin_subset(df_won, "wickets")
        self._plot_margin_distribution(df_w_wickets, fig_num=3, bins=10)

        df_lost = df[df['Result'] == 0]
        df_l_run = self._numeric_margin_subset(df_lost, "runs")
        self._plot_margin_scatter_grid(df_l_run)
        self._plot_margin_distribution(df_l_run, fig_num=5, bins=5)

        df_l_wicket = self._numeric_margin_subset(df_lost, "wickets")
        self._plot_margin_distribution(df_l_wicket, fig_num=6, bins=10)

    @staticmethod
    def _numeric_margin_subset(matches, unit):
        """Return the rows whose 'Margin' mentions ``unit``, with 'Margin' as int."""
        mentions_unit = matches['Margin'].astype(str).str.contains(unit)
        # Work on an explicit copy and assign the column back: an in-place
        # replace on a column of a filtered slice is not guaranteed to
        # modify the slice, which would make the int conversion fail.
        subset = matches[mentions_unit].copy()
        # Drop everything from the first whitespace on, keeping the number.
        subset['Margin'] = subset['Margin'].replace(to_replace=r'[\s]+.*',
                                                    value="",
                                                    regex=True)
        return subset.astype({"Margin": int})

    @staticmethod
    def _plot_margin_scatter_grid(subset):
        """Show a 2x2 grid of Result-vs-Margin scatter plots, one hue per category."""
        categorical = ['Toss', 'Bat', 'Opposition', 'Ground']
        _, axes = plt.subplots(2, 2, figsize=(10, 10))
        for variable, subplot in zip(categorical, axes.flatten()):
            sns.scatterplot(x="Result",
                            y="Margin",
                            hue=subset[variable],
                            data=subset,
                            ax=subplot)
        plt.show()

    @staticmethod
    def _plot_margin_distribution(subset, fig_num, bins):
        """Show histogram plus KDE of 'Margin' on the figure numbered ``fig_num``."""
        plt.figure(fig_num)
        sns.distplot(subset['Margin'],
                     hist=True,
                     kde=True,
                     bins=bins,
                     color='darkblue',
                     hist_kws={'edgecolor': 'black'},
                     kde_kws={'linewidth': 4})
        plt.show()
Esempio n. 40
0
b = closure(a)
b = multiplicative_replacement(a)
pd.DataFrame(b)
pd.DataFrame(b).sum(axis=1)

###########################################################
# PLOT histogram - investigate error in A549_D aitchison ##
###########################################################
import seaborn as sns

df_part = df_int.iloc[:, df_int.columns.get_level_values("batch") == 
                      ("_".join(["A549", "D", "Rep2"]))]
for i in range(np.shape(df_part)[1]):
    #plt.hist(np.log2((df_part.iloc[:,i]).replace(0,np.nan)), bins = 1000, 
    #         histtype="step", label = df_part.iloc[:,i].name[-1])
    sns.distplot(np.log2((df_part.iloc[:,i]).replace(0,np.nan)), bins = 1000,
                 label = df_part.iloc[:,i].name[-1], kde = True, hist = False)
plt.legend()

ax = np.log2(df_part.replace(0,np.nan)).plot.hist(bins=1000, alpha = 0.5)

df_part = df_part[(df_part.T != 0).any()]
df_ait_part = aitchison_transform(df_part)


for i in range(np.shape(df_ait_part)[1]):
#    plt.hist((df_ait_part.iloc[:,i]).replace(0,np.nan), bins = 1000, 
#             histtype="step", label = df_part.iloc[:,i].name[-1])
    sns.distplot(df_ait_part.iloc[:,i], bins = 1000,
             label = df_part.iloc[:,i].name[-1], kde = True, hist = False)
plt.legend()
    mean = accident_df[column_name].mean()
    std = accident_df[column_name].std()
    min_ = accident_df[column_name].min()
    max_ = accident_df[column_name].max()
    kurt = accident_df[column_name].kurt()
    skew = accident_df[column_name].skew()

    print(column_name,
          ',min =',
          min_,
          ',max =',
          max_,
          ',avg =',
          mean,
          ',std =',
          std,
          ',skewness =',
          skew,
          ',kurtosis =',
          kurt,
          end='\n')
    print()
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

sns.distplot(accident_df[accident_df['Precipitation(in)'].isnull() == False]
             ['Precipitation(in)'],
             ax=axs[0])
sns.distplot(accident_df[accident_df['Temperature(F)'].isnull() == False]
             ['Temperature(F)'],
             ax=axs[1])
def plot_distplot(df, cell_model, matching):
    """Overlay the ``syn_len`` distributions of two architectures and save a PDF.

    Rows with ``arch == "complex_core"`` are labelled "core" and rows with
    ``arch == "complex_derived"`` are labelled "der". The output path is
    built from the module-level ``RE`` prefix, ``cell_model`` and
    ``matching``.
    """
    series_by_label = (("complex_core", "core"), ("complex_derived", "der"))
    for arch_name, legend_label in series_by_label:
        lengths = df.loc[df.arch == arch_name, "syn_len"]
        sns.distplot(lengths, label=legend_label)
    plt.legend()
    plt.savefig(f"{RE}{cell_model}_ernst_active_bases_dist_{matching}.pdf",
                bbox_inches="tight")
Esempio n. 43
0
# fill HOLIDAY with normal
df['HOLIDAY'] = df['HOLIDAY'].fillna("NORMAL")

# drop features
drop_feats = [
    'WW_GRS', 'PERCENT', 'NM_0.5W_T', 'NM_0.5W_M24', 'NM_0.5W_M26',
    'NM_0.5W_F24', 'NM_0.5W_F26', 'GENRE2'
]

df.drop(drop_feats, axis=1, inplace=True)

# check OBO
df['OBO'].describe()

# original data, with a fitted normal curve overlaid
sns.distplot(df['OBO'], fit=norm)
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df['OBO'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Now label the plotted distribution.
# Raw string: "\m" and "\s" are invalid escape sequences in a normal string
# literal; the backslashes must reach matplotlib's mathtext unchanged.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequency')
plt.title('distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(df['OBO'], plot=plt)
plt.show()
Esempio n. 44
0
 def plotlabeldist(self):
     """Plot the distribution of the 'labels' entry of every item in ``self.masks``.

     Returns whatever ``sns.distplot`` returns.
     """
     collected = []
     for idx in range(len(self.masks)):
         collected.append(self.masks[idx]['labels'])
     return sns.distplot(collected)
# Load the asteroid data set (absolute, machine-specific path).
dataset = pd.read_csv(r'C:\Users\Hp\Desktop\Nasa\Dataset\Asteroid_data_final.csv')

dataset.shape

dataset.describe()

dataset.isnull().any()
# Forward-fill missing values; DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
dataset = dataset.ffill()

X = dataset[['As_diam_km', 'As_dist_km', 'As_velocity_kmh', 'As_velocity_angle', 'As_dist_relative_flag','Coordinate_flag','Sc_diam_m', 'Sc_velocity_kmh', 'Relative_velocity', 'Estimated_time', 'New_theta']].values
y = dataset['Final_Angle'].values

# Distribution of the regression target.
plt.figure(figsize=(15,10))
seabornInstance.distplot(dataset['Final_Angle'])
# Adjust the layout only after the plot exists, otherwise there is nothing to lay out.
plt.tight_layout()

# Hold out 20% of the rows for testing; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The original line carried a stray trailing "s", which is a syntax error.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

#coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
#coeff_df

y_pred = regressor.predict(X_test)

# Side-by-side view of true and predicted targets.
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

df1 = df.head(25)
df1
Esempio n. 46
0
def plot_dist(result, out_dir, name, model_flavors, metric, cumu):
    """Plot the credible-set size distribution of each model flavor and save it as SVG.

    One KDE curve (optionally cumulative) is drawn per requested flavor, in a
    fixed order, on the current figure. The figure is saved in ``out_dir``
    and then cleared.

    Args:
        result: mapping holding the plotted values under keys of the form
            ``"set_sizes_<suffix>"`` (metric "size") or
            ``"set_props_<suffix>"`` (metric "prop").
        out_dir: directory the SVG file is written to.
        name: text appended to the plot title.
        model_flavors: container of flavor names; the recognised ones are
            "full", "indep", "ase", "acav" and "eqtl".
        metric: "size" or "prop".
        cumu: when truthy, draw cumulative distributions.

    Raises:
        ValueError: if ``metric`` is neither "size" nor "prop".
        KeyError: if ``result`` lacks the key of a requested flavor.
    """
    import warnings

    # (flavor name, result-key suffix, legend label), in drawing order.
    flavors = (
        ("full", "full", "PLASMA-JC"),
        ("indep", "indep", "PLASMA-JI"),
        ("ase", "ase", "PLASMA-AS"),
        ("acav", "caviar_ase", "CAVIAR-ASE"),
        ("eqtl", "eqtl", "QTL-Only"),
    )
    # metric -> (result-key prefix, x-axis limits, x-axis label, file stem)
    metric_settings = {
        "size": ("set_sizes", (0, 1000), "Set Size", "set_size_distribution"),
        "prop": ("set_props", (0, 1),
                 "Set Size (Proportion of Total Markers)",
                 "set_prop_distribution"),
    }
    if metric not in metric_settings:
        # Previously an unknown metric left the key prefix unbound and
        # surfaced later as an unrelated NameError.
        raise ValueError(
            "metric must be 'size' or 'prop', got {0!r}".format(metric))
    kwd, x_limits, x_label, file_stem = metric_settings[metric]

    sns.set(style="whitegrid", font="Roboto")

    for flavor, suffix, label in flavors:
        if flavor not in model_flavors:
            continue
        values = result["{0}_{1}".format(kwd, suffix)]
        try:
            sns.distplot(
                values,
                hist=False,
                kde=True,
                kde_kws={"linewidth": 2, "shade": False, "cumulative": cumu},
                label=label
            )
        except Exception as err:
            # Best effort: one flavor that cannot be plotted must not
            # prevent the others, but the failure is no longer silent.
            warnings.warn(
                "could not plot {0}: {1}".format(label, err), RuntimeWarning)

    plt.xlim(*x_limits)
    plt.legend(title="Model")

    if cumu:
        cumu_kwd = "Cumulative "
        cumu_fname = "_cumu"
        yax = "Proportion of Markers"
    else:
        cumu_kwd = ""
        cumu_fname = ""
        yax = "Density"

    plt.xlabel(x_label)
    plt.ylabel(yax)
    plt.title("{0}Distribution of Credible Set Sizes: {1}".format(cumu_kwd, name))
    plt.savefig(os.path.join(out_dir, "{0}{1}.svg".format(file_stem, cumu_fname)))
    plt.clf()
Esempio n. 47
0
# Scatter of column 1 against column 3 of the raw feature array; the labels
# below name them sepal width and petal width.
plt.scatter(dat[:, 1], dat[:, 3])
plt.grid()
plt.title('Petal Width by Sepal Width')
plt.ylabel('Petal width(cm)')
plt.xlabel('Sepal width(cm)')

# Build a labelled DataFrame and replace the numeric target codes 0/1/2
# with the species names from iris["target_names"].
dat1 = pd.DataFrame(data=dat, columns=iris["feature_names"])
dat1["target"] = iris["target"]
dat1['target'] = dat1['target'].replace([0, 1, 2], iris["target_names"])

# Number of rows per species.
sns.set(style="whitegrid")
sns.countplot(x="target", data=dat1)
plt.title("Number of Examples per Species")
plt.xlabel("species")

# Plain histogram (kde=False disables the density curve).
sns.distplot(dat1['sepal width (cm)'], hist=True, kde=False)
plt.title("Histogram of Sepal Width")

# Mean sepal length per species.
sns.barplot(x='target', y='sepal length (cm)', data=dat1, estimator=np.mean)
plt.title("Avg. Sepal Length by Species")
plt.ylabel("mean(sepal length (cm))")
plt.xlabel("species")

# Sepal width per species as a box plot ...
sns.boxplot(x='target', y='sepal width (cm)', data=dat1)
plt.title("Boxplot of Sepal Width by Species")
plt.xlabel("species")

# ... and as a violin plot.
sns.violinplot(x='target', y='sepal width (cm)', data=dat1)
plt.title("Violinplot of Sepal Width by Species")
plt.xlabel("species")
# Now drop the 'Id' column since it's unnecessary for the prediction process.
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# These shapes are printed after the drop above, so the messages say "after".
print("Train data size after dropping Id feature is : {}".format(train.shape))
print("Test data size after dropping Id feature is : {}".format(test.shape))

print(train.head())
print(test.head())

# Getting Description
print(train['SalePrice'].describe())

# Plot Histogram with a fitted normal curve overlaid
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: "\m" and "\s" are invalid escape sequences in a normal string
# literal; the backslashes must reach matplotlib's mathtext unchanged.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# QQ-plot of SalePrice
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

print('Skewness: %f' % train['SalePrice'].skew())
from genetic_algorithm import generate_population

sns.set_style("whitegrid")

def lcs(X, Y):
    """Return the length of the longest common subsequence of X and Y.

    Bottom-up dynamic programme that keeps only two rows of the table:
    after processing the first r items of X, ``previous[c]`` is the LCS
    length of X[0..r-1] and Y[0..c-1].
    """
    rows, cols = len(X), len(Y)

    # Row for the empty prefix of X: nothing in common with any prefix of Y.
    previous = [0] * (cols + 1)
    for r in range(1, rows + 1):
        current = [0] * (cols + 1)
        for c in range(1, cols + 1):
            if X[r - 1] == Y[c - 1]:
                # matching items extend the LCS of both shorter prefixes
                current[c] = previous[c - 1] + 1
            else:
                # otherwise drop the last item of either X or Y
                current[c] = max(previous[c], current[c - 1])
        previous = current

    return previous[cols]

population = generate_population(10000)
lcs_vec = [lcs(individual[1], ['H','E','L','L','O','W','O','R','L','D']) for individual in population]
gene_dist = sns.distplot(lcs_vec, kde=False, norm_hist=False, color='black', bins=10)
plt.show()
Esempio n. 50
0
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

train = pd.read_csv('Bike_Train.csv')
test = pd.read_csv('Bike_Test.csv')

print("original train data:", train.shape)
print("original test data:", test.shape)

# Figure 1: distribution of count
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
fig.set_size_inches(6, 5)
sns.distplot(train['count'], ax=ax)
ax.set(xlabel='count', title='distribution')
fig.savefig('001 count distribution', dpi=200)

# Keep only rows whose count lies within three standard deviations of the
# mean. Series.abs() is used because numpy is not imported in this script.
train_drop_tail = train[(train['count'] - train['count'].mean()).abs() <= (
    3 * train['count'].std())]
print('----------------------')
print("train_drop_tail:", train_drop_tail.shape)

# Figure 2: the two distributions side by side
fig = plt.figure()
fig.set_size_inches(12, 5)
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
sns.distplot(train['count'], ax=ax1)
sns.distplot(train_drop_tail['count'], ax=ax2)
Esempio n. 51
0
    return reg


# def q_walk_uni_line(state, size, steps):
# 	H = np.array([[1,1], [1, -1]]) / np.sqrt(2)
# 	plus = H * np.array([1,0])

# 	states = np.append(np.array()
# 	walker = np.kron(state, plus)

samples = 100

r_dist = np.array([disc_line_random_walk(51, 101, 40) for i in range(samples)])

fig, axis = plt.subplots()
sbn.distplot(r_dist, ax=axis, kde=True, hist=True)
plt.title(
    "Distribuição empírica para passeio aleatório com {} amostras.".format(
        samples))
plt.xlabel("Nó de parada")
plt.ylabel("Frequência relativa")
plt.grid(True)  # coller
plt.savefig("rd_walk{}.png".format(samples))
plt.clf()

n_qbits = 8
graph_size = 101
steps = 40

eng = MainEngine()
data = np.zeros(samples, dtype=int)
Esempio n. 52
0
ax2.hist(data.Amount[data.Class == 0], bins = 30)
ax2.set_title('Normal')
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
plt.yscale('log')
plt.show()


# In[ ]:


# One stacked panel per feature V1..V28, overlaying the distribution of the
# rows with Class == 1 and the rows with Class == 0.
plt.figure(figsize=(12,28*4))
gs = gridspec.GridSpec(28, 1)
for i in range(1, 29):
    ax = plt.subplot(gs[i-1])
    sns.distplot(data['V'+str(i)][data.Class == 1], bins=50, ax=ax)
    sns.distplot(data['V'+str(i)][data.Class == 0], bins=50, ax=ax)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + 'V'+str(i))
# Adjust the layout before showing: once the figure has been shown,
# tight_layout() no longer affects what was displayed.
plt.tight_layout()
plt.show()


# In[ ]:


# Based on observation of data overlap above, try out a second dataset with redunancies removed
clean_data = data.drop(['V28','V27','V23','V8'], axis =1)
# Later - can re run everything after running the following line
#data = clean_data
train_df["coverage"] = train_df.masks.map(np.sum) / pow(img_size_ori, 2)

def cov_to_class(val):
    """Map a coverage value to an integer class between 0 and 10.

    Returns the smallest integer i in 0..10 with ``val * 10 <= i``, or
    ``None`` when no such integer exists.
    """
    return next((bucket for bucket in range(11) if val * 10 <= bucket), None)
        
train_df["coverage_class"] = train_df.coverage.map(cov_to_class)


# In[8]:


fig, axs = plt.subplots(1, 2, figsize=(15,5))
sns.distplot(train_df.coverage, kde=False, ax=axs[0])
sns.distplot(train_df.coverage_class, bins=10, kde=False, ax=axs[1])
plt.suptitle("Salt coverage")
axs[0].set_xlabel("Coverage")
axs[1].set_xlabel("Coverage class")


# In[9]:


# Plotting the depth distributions

sns.distplot(train_df.z, label="Train")
sns.distplot(test_df.z, label="Test")
plt.legend()
plt.title("Depth distribution")
# Load data into a pandas DataFrame. Note: 1st column is ID
home_data = pd.read_csv(file_path, index_col=0)
home_data.tail()
# home_data.head()
home_data.shape
# List of numerical attributes
home_data.select_dtypes(exclude=['object']).columns
len(home_data.select_dtypes(exclude='object').columns)
home_data.select_dtypes(exclude=['object']).describe().round(decimals=2)
# List of object-typed (non-numerical) attributes
home_data.select_dtypes(include=['object']).columns
len(home_data.select_dtypes(include='object').columns)
home_data.select_dtypes(include=['object']).describe()
# Distribution of the target, raw and log-transformed
target = home_data.SalePrice
plt.figure()
sns.distplot(target)
plt.title('Distribution of SalePrice')
plt.show()
sns.distplot(np.log(target))
plt.title('Distribution of Log-transformed SalePrice')
plt.xlabel('log(SalePrice)')
plt.show()
print('SalePrice has a skew of ' + str(target.skew().round(decimals=2)) + 
      ' while the log-transformed SalePrice improves the skew to ' + 
      str(np.log(target).skew().round(decimals=2)))
# All non-object columns except the target itself
num_attributes = home_data.select_dtypes(exclude='object').drop('SalePrice', axis=1).copy()

# One distribution plot per numerical attribute, NaNs dropped per column.
# NOTE(review): the 9x4 grid has 36 slots -- confirm there are no more
# numerical attributes than that.
fig = plt.figure(figsize=(12,18))
for i in range(len(num_attributes.columns)):
    fig.add_subplot(9,4,i+1)
    sns.distplot(num_attributes.iloc[:,i].dropna())
Esempio n. 55
0
# Total number of missing cells in the whole frame
traindata.isnull().sum().sum()

sns.boxplot(traindata['y'])

traindata.shape
# fill missing values of y with the column mean
traindata['y'].fillna(traindata['y'].mean(), inplace=True)


# Relationship between x and y: scatter plots and a regression fit
sns.relplot(x="x", y="y", data=traindata)

sns.relplot(x=traindata['x'], y=traindata['y'], data=traindata)

sns.regplot(x=traindata['x'], y=traindata['y'], data=traindata)

sns.distplot(traindata['x'], bins=10)

sns.boxplot(traindata['x'], orient='v')

sns.boxplot(traindata['y'], orient='v')

# Outlier fences: 1.5 * IQR below the first and above the third quartile
Q1=traindata.quantile(.25)
Q3=traindata.quantile(.75)
IQR = traindata.apply(stats.iqr)
upper = (Q3 + 1.5 * IQR)
lower = (Q1 - 1.5 * IQR)
# No of outliers for each column

# To see no of outliers for each variable
# (values above the upper fence, then values below the lower fence)
(traindata > (Q3 + 1.5 * IQR)).sum()
(traindata < (Q1 - 1.5 * IQR)).sum()
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

# Get summary statistics of rating
ratings['rating'].describe()

# Import seaborn library
import seaborn as sns
sns.set_style('whitegrid')
sns.set(font_scale=1.5)
%matplotlib inline

# Display distribution of rating
sns.distplot(ratings['rating'].fillna(ratings['rating'].median()))

# Join all 3 files into one dataframe
dataset = pd.merge(pd.merge(movies, ratings),users)
# Display 20 movies with highest ratings
dataset[['title','genres','rating']].sort_values('rating', ascending=False).head(20)

# Make a census of the genre keywords
genre_labels = set()
for s in movies['genres'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))

# Function that counts the number of times each of the genre keywords appear
def count_word(dataset, ref_col, census):
    keyword_count = dict()
    for s in census: 
Esempio n. 57
0
def distribution_compare_pretty(_df1,
                                _df2,
                                col,
                                figsize=None,
                                date_flag=False):
    """
    Draw pretty distribution graph for data compare

    When neither table has more than 10 distinct non-null values in ``col``
    the two tables are compared with a grouped count plot. Otherwise both
    distributions are drawn as overlaid curves (``sns.distplot`` with
    ``hist=False``) and annotated through ``_draw_texts``. The figure is
    displayed with ``plt.show()``; nothing is returned. The input frames
    are copied and never modified.

    Parameters
    ----------
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    col: string
        name of column to check
    figsize: tuple, default=None
        figure size; (10, 5) is used when None
    date_flag: bool, default=False
        whether it is checking date features
    """

    # color values for graph
    TABLE1_DARK = "#4BACC6"
    TABLE2_DARK = "#F79646"

    df1, df2 = _df1.copy(), _df2.copy()

    if date_flag:
        # dates are compared through a numeric helper column holding the
        # distance between today and the date ('timedelta64[M]')
        numeric_col = '%s_numeric' % (col)
        for frame in (df1, df2):
            if numeric_col not in frame.columns.values:
                snapshot_date_now = str(datetime.datetime.now().date())
                frame[numeric_col] = (
                    pd.to_datetime(snapshot_date_now) -
                    pd.to_datetime(frame[col], errors='coerce')).astype(
                        'timedelta64[M]', errors='ignore')
    else:
        numeric_col = col

    # summary statistics, index 0 = table1 and index 1 = table2
    value_mins = [df1[numeric_col].min(), df2[numeric_col].min()]
    value_means = [df1[numeric_col].mean(), df2[numeric_col].mean()]
    value_medians = [df1[numeric_col].median(), df2[numeric_col].median()]
    value_maxs = [df1[numeric_col].max(), df2[numeric_col].max()]

    if date_flag:
        date_mins = [
            pd.to_datetime(df1[col], errors='coerce').min(),
            pd.to_datetime(df2[col], errors='coerce').min()
        ]
        date_maxs = [
            pd.to_datetime(df1[col], errors='coerce').max(),
            pd.to_datetime(df2[col], errors='coerce').max()
        ]

    # largest magnitude across both tables; decides whether to rescale
    both_value_max = np.max([abs(v) for v in value_maxs] +
                            [abs(v) for v in value_mins])

    # get clean values
    df1_sample_dropna_values = df1[numeric_col].dropna().values
    df2_sample_dropna_values = df2[numeric_col].dropna().values

    # get distribution
    scale_flg = 0
    df1_draw_values = df1_sample_dropna_values
    df1_draw_value_4 = [
        value_mins[0], value_means[0], value_medians[0], value_maxs[0]
    ]

    df2_draw_values = df2_sample_dropna_values
    df2_draw_value_4 = [
        value_mins[1], value_means[1], value_medians[1], value_maxs[1]
    ]

    if both_value_max >= pow(10, 6):
        # very large values are rescaled before drawing; the title below
        # labels this as a log10 scale (the rescaling itself lives in
        # _get_scale_draw_values)
        scale_flg = 1
        df1_draw_values, df1_draw_value_4 = _get_scale_draw_values(
            df1_draw_values, df1_draw_value_4)
        df2_draw_values, df2_draw_value_4 = _get_scale_draw_values(
            df2_draw_values, df2_draw_value_4)

    # draw the graph
    plt.clf()
    # figsize has to be passed by keyword: the first positional parameter
    # of plt.figure is the figure number/label, not its size
    if figsize is not None:
        plt.figure(figsize=figsize)
    else:
        plt.figure(figsize=(10, 5))

    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if unique level is less than 10, draw countplot instead
    both_num_uni = np.max(
        [df1[col].dropna().nunique(), df2[col].dropna().nunique()])
    if both_num_uni <= 10:
        df1_temp = pd.DataFrame(df1_sample_dropna_values, columns=['value'])
        df1_temp['type'] = 'table1'
        df2_temp = pd.DataFrame(df2_sample_dropna_values, columns=['value'])
        df2_temp['type'] = 'table2'
        full_temp = pd.concat([df1_temp, df2_temp], axis=0)
        sns.countplot(full_temp['value'],
                      hue=full_temp['type'],
                      palette=sns.color_palette([TABLE1_DARK, TABLE2_DARK]))
        if both_num_uni > 5:
            # many levels: rotate the tick labels so they stay readable
            plt.xticks(rotation=90)
        plt.legend(loc=1)
    else:
        ax1 = sns.distplot(df1_draw_values,
                           color=TABLE1_DARK,
                           hist=False,
                           label='table1')
        ax2 = sns.distplot(df2_draw_values,
                           color=TABLE2_DARK,
                           hist=False,
                           label='table2')
        # use one y-range that covers both curves
        y_low_1, y_up_1 = ax1.get_ylim()
        y_low_2, y_up_2 = ax2.get_ylim()
        y_low, y_up = np.min([y_low_1, y_low_2]), np.max([y_up_1, y_up_2])
        plt.ylim((y_low, y_up))

        if date_flag:
            # dates: annotate with the earliest and latest date only
            _draw_texts(text_values=[date_mins[0], date_maxs[0]],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
            _draw_texts(text_values=[date_mins[1], date_maxs[1]],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
        else:
            # numbers: annotate with min, mean, median and max
            _draw_texts(text_values=[
                value_mins[0], value_means[0], value_medians[0], value_maxs[0]
            ],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up)
            _draw_texts(text_values=[
                value_mins[1], value_means[1], value_medians[1], value_maxs[1]
            ],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up)

    plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

from pydataset import data


# Load the datasets used by the exercises below. Without these assignments
# every plotting call fails with NameError, because nothing else in the script
# defines `iris`, `anscombe` or `IS`.
# iris is taken from seaborn because the exercises use seaborn's snake_case
# column names (petal_length, petal_width, ...).
iris = sns.load_dataset('iris')
anscombe = sns.load_dataset('anscombe')
IS = data('InsectSprays')


# 1. What does the distribution of petal lengths look like?

sns.distplot(iris.petal_length)

# 2. Is there a correlation between petal length and petal width?
sns.relplot(data=iris, x='petal_length', y='petal_width')

# 3. Would it be reasonable to predict species based on sepal width and
#    sepal length?
# The question is about the sepal measurements, so plot those rather than
# repeating the petal columns from question 2.
sns.jointplot(data=iris, x='sepal_length', y='sepal_width')



# anscombe 1. Using the lesson as an example, use seaborn's load_dataset
# function to load the anscombe data set. Use pandas to group the data by the
# dataset column, and calculate summary statistics for each dataset.
# What do you notice?
anscombe.groupby('dataset').describe()

sns.relplot(x='x', y='y', data=anscombe)

# Load the InsectSprays dataset and read its documentation. Create a boxplot
# that shows the effectiveness of the different insect sprays.
sns.boxplot(data=IS, x='count', y='spray')

Esempio n. 59
0
# Load the raw and "scaled" tables for runs 26, 24 and 23.5.
# NOTE(review): prod_time1 / prod_time1_scaled (label '30') are used below but
# are not assigned in this part of the file -- they must already exist.
prod_time2 = np.genfromtxt('gauss_prod_time_50_0_1_26_.txt')
prod_time2_scaled = np.genfromtxt('gauss_prod_time_50_0_1_26_scaled.txt')

prod_time3 = np.genfromtxt('gauss_prod_time_50_0_1_24_.txt')
prod_time3_scaled = np.genfromtxt('gauss_prod_time_50_0_1_24_scaled.txt')

prod_time4 = np.genfromtxt('gauss_prod_time_50_0_1_23_5.txt')
prod_time4_scaled = np.genfromtxt('gauss_prod_time_50_0_1_23_5_scaled.txt')


nbins = 60

fig, ax = plt.subplots()

# One shaded KDE curve per table, drawn from log10 of the third column
# (index 2). Raw tables are plotted first, then the scaled ones, in the same
# order as the legend labels.
for table, tag in ((prod_time1, '30'),
                   (prod_time2, '26'),
                   (prod_time3, '24'),
                   (prod_time4, '23.5')):
    sns.distplot(np.log10(table[:, 2]),
                 hist=False,
                 kde=True,
                 bins=nbins,
                 kde_kws={'shade': True, 'linewidth': 3},
                 label=tag)

for table, tag in ((prod_time1_scaled, '30 scaled'),
                   (prod_time2_scaled, '26 scaled'),
                   (prod_time3_scaled, '24 scaled'),
                   (prod_time4_scaled, '23.5 scaled')):
    sns.distplot(np.log10(table[:, 2]),
                 hist=False,
                 kde=True,
                 bins=nbins,
                 kde_kws={'shade': True, 'linewidth': 3},
                 label=tag)
Esempio n. 60
0
def _compare_numeric(col, _df1, _df2, img_dir, date_flag=False):
    """
    Compare two numeric type values and save a distribution graph

    Parameters
    ----------
    col: string
        name of column to check
    _df1: pandas DataFrame
        slice of table1 containing enough information to check
    _df2: pandas DataFrame
        slice of table2 containing enough information to check
    img_dir: string
        root directory for the generated images; the graph is saved as
        ``<col with '/' removed>.png`` inside it
    date_flag: boolean
        Whether the column is date type. When True, both slices must also
        contain the column named ``col`` with '_numeric' removed, which is
        parsed with ``pd.to_datetime`` to report the min / max dates

    Returns
    -------
    Dictionary contains the output result
        * if the column is entirely NaN in at least one table:
          ``{'column': col, 'error_msg': <which table(s) are all NaN>}``
        * otherwise: ``{'column': col, 'result_df': DataFrame of
          feature/value rows, 'corr': {'column': col, 'corr': rounded corr}}``
    """

    def _rounded_lines(values):
        # one line per table, every value rounded to 3 decimals
        return '\n'.join([str(round(v, 3)) for v in values])

    # work on copies of the incoming slices
    df1_sample = _df1.copy()
    df2_sample = _df2.copy()

    stat_output = _simple_stats(col, df1_sample, df2_sample, 'numeric')

    # nothing to compare when one side has no values at all
    nan_rate1, nan_rate2 = stat_output['nan_rate']
    if (nan_rate1 == 1) or (nan_rate2 == 1):
        if (nan_rate1 == 1) and (nan_rate2 == 1):
            error_msg = 'all nan in both table'
        elif nan_rate1 == 1:
            error_msg = 'all nan in table1'
        else:
            error_msg = 'all nan in table2'
        return {'column': col, 'error_msg': error_msg}

    # generate the output
    output = [{
        'feature': 'column',
        'value': col,
        'graph': 'Distribution'
    }, {
        'feature': 'sample_value',
        'value': '\n'.join([str(v) for v in stat_output['sample_value']])
    }, {
        'feature': 'nan_rate',
        'value': _rounded_lines(stat_output['nan_rate'])
    }, {
        'feature':
        'num_uni',
        'value':
        '%s/%s\n%s/%s' %
        (str(stat_output['num_uni'][0]), str(df1_sample.dropna().shape[0]),
         str(stat_output['num_uni'][1]), str(df2_sample.dropna().shape[0]))
    }]
    for stat_name in ('value_min', 'value_mean', 'value_median', 'value_max'):
        output.append({
            'feature': stat_name,
            'value': _rounded_lines(stat_output[stat_name])
        })

    # largest magnitude among both tables' min / max, used to decide scaling
    both_value_max = np.max([abs(v) for v in stat_output['value_max']] +
                            [abs(v) for v in stat_output['value_min']])

    # get clean values
    df1_sample_dropna_values = df1_sample[col].dropna().values
    df2_sample_dropna_values = df2_sample[col].dropna().values

    if date_flag:
        # the human-readable dates live in the column without '_numeric'
        date_col = col.replace('_numeric', '')
        dt1 = pd.to_datetime(df1_sample[date_col], errors='coerce')
        dt2 = pd.to_datetime(df2_sample[date_col], errors='coerce')
        date_min1, date_max1 = dt1.min(), dt1.max()
        date_min2, date_max2 = dt2.min(), dt2.max()

    # get distribution
    scale_flg = 0
    df1_draw_values = df1_sample_dropna_values
    df1_draw_value_4 = [
        stat_output['value_min'][0], stat_output['value_mean'][0],
        stat_output['value_median'][0], stat_output['value_max'][0]
    ]

    df2_draw_values = df2_sample_dropna_values
    df2_draw_value_4 = [
        stat_output['value_min'][1], stat_output['value_mean'][1],
        stat_output['value_median'][1], stat_output['value_max'][1]
    ]

    # very large magnitudes are rescaled for drawing (the title then reads
    # "log10 scale"); the transform itself is done by _get_scale_draw_values
    if both_value_max >= pow(10, 6):
        scale_flg = 1
        df1_draw_values, df1_draw_value_4 = _get_scale_draw_values(
            df1_draw_values, df1_draw_value_4)
        df2_draw_values, df2_draw_value_4 = _get_scale_draw_values(
            df2_draw_values, df2_draw_value_4)

    # calculate correlation between two distributions
    if np.max(stat_output['num_uni']) <= 100:
        # few distinct values: compare the relative frequency of each value
        vc1 = _value_counts_df(df1_draw_values)
        vc2 = _value_counts_df(df2_draw_values)
        vc = vc1.merge(vc2, on='value', how='outer').fillna(0)
        obs1 = vc['count_x'].values * 1.0 / vc['count_x'].sum()
        obs2 = vc['count_y'].values * 1.0 / vc['count_y'].sum()
    else:
        # many distinct values: compare 100-bin histograms over a shared range
        both_min = np.min([np.min(df1_draw_values), np.min(df2_draw_values)])
        both_max = np.max([np.max(df1_draw_values), np.max(df2_draw_values)])
        hist_range = (both_min, both_max)
        # `normed` is intentionally not passed: it was removed from
        # np.histogram in NumPy 1.24 and raw counts are the default anyway
        counts1, _ = np.histogram(df1_draw_values, bins=100, range=hist_range)
        counts2, _ = np.histogram(df2_draw_values, bins=100, range=hist_range)
        obs1 = counts1 / (np.sum(counts1) * 1.0)
        obs2 = counts2 / (np.sum(counts2) * 1.0)

    if len(obs1) == 1:
        # a single shared value: fall back to the ratio of the non-NaN rates
        fill_rates = [1. - nan_rate1, 1. - nan_rate2]
        corr = np.min(fill_rates) * 1.0 / np.max(fill_rates)
    elif list(obs1) == list(obs2):
        corr = 1.0
    else:
        corr = spearmanr(obs1, obs2)[0]

    # draw and save distribution graph
    dpi = 72
    if date_flag:
        plt.figure(figsize=(635. / dpi, 635. / (9. / 8.) / dpi), dpi=dpi)
    else:
        plt.figure(figsize=(635. / dpi, 635. / (9. / 6.) / dpi), dpi=dpi)
    if scale_flg:
        plt.title('%s (log10 scale)' % (col))
    else:
        plt.title('%s' % (col))

    # if unique level is less than 10, draw countplot instead
    both_num_uni = np.max(stat_output['num_uni'])
    if both_num_uni <= 10:
        df1_temp = pd.DataFrame(df1_sample_dropna_values, columns=['value'])
        df1_temp['type'] = 'table1'
        df2_temp = pd.DataFrame(df2_sample_dropna_values, columns=['value'])
        df2_temp['type'] = 'table2'
        full_temp = pd.concat([df1_temp, df2_temp], axis=0, ignore_index=True)
        # keyword form: newer seaborn releases no longer treat the first
        # positional argument as the x vector
        sns.countplot(x='value',
                      hue='type',
                      data=full_temp,
                      palette=sns.color_palette([TABLE1_DARK, TABLE2_DARK]))
        if both_num_uni > 5:
            plt.xticks(rotation=90)
        plt.legend(loc=1)
    else:
        ax1 = sns.distplot(df1_draw_values,
                           color=TABLE1_DARK,
                           hist=False,
                           label='table1')
        ax2 = sns.distplot(df2_draw_values,
                           color=TABLE2_DARK,
                           hist=False,
                           label='table2')
        # use one y-range covering both curves
        y_low_1, y_up_1 = ax1.get_ylim()
        y_low_2, y_up_2 = ax2.get_ylim()
        y_low, y_up = np.min([y_low_1, y_low_2]), np.max([y_up_1, y_up_2])
        plt.ylim((y_low, y_up))

        if date_flag:
            _draw_texts(text_values=[date_min1, date_max1],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
            _draw_texts(text_values=[date_min2, date_max2],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up,
                        date_flag=True)
        else:
            # labels show the unscaled statistics; positions use draw values
            _draw_texts(text_values=[
                stat_output['value_min'][0], stat_output['value_mean'][0],
                stat_output['value_median'][0], stat_output['value_max'][0]
            ],
                        draw_value_4=df1_draw_value_4,
                        mark=1,
                        y_low=y_low,
                        y_up=y_up)
            _draw_texts(text_values=[
                stat_output['value_min'][1], stat_output['value_mean'][1],
                stat_output['value_median'][1], stat_output['value_max'][1]
            ],
                        draw_value_4=df2_draw_value_4,
                        mark=2,
                        y_low=y_low,
                        y_up=y_up)

    # save the graphs
    # adjust graph name ('/' would be read as a path separator)
    graph_name = col.replace('/', '')
    plt.savefig(os.path.join(img_dir, graph_name + '.png'),
                transparent=True,
                dpi=dpi)
    # NOTE(review): the figure is left open here -- confirm the caller closes
    # it, otherwise figures accumulate across columns.

    if date_flag:
        output.append({
            'feature': 'date_min',
            'value': '%s\n%s' % (date_min1, date_min2)
        })
        output.append({
            'feature': 'date_max',
            'value': '%s\n%s' % (date_max1, date_max2)
        })
    output.append({'feature': 'corr', 'value': round(corr, 3)})

    return {
        'column': col,
        'result_df': pd.DataFrame(output),
        'corr': {
            'column': col,
            'corr': round(corr, 3)
        }
    }