Python barplot Exemples, seaborn.barplot Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : plotters.py Projet : MrChristophRivera/InsightProject

def plot_accuracy_with_random_by_category(true, predicted, sort=True):
    """Draw per-category model accuracy with the random baseline overlaid.

    Parameters:
        true (array): the observed values.
        predicted (array): the predicted values.
        sort (bool): forwarded unchanged to
            calcuate_accuracy_above_random_chance.
    Returns:
        ax, res: the axes both bar layers were drawn on, followed by the
            table they were drawn from (its 'Model', 'Random' and
            'Category' columns are plotted).
    """
    accuracy_table = calcuate_accuracy_above_random_chance(true, predicted, sort)

    sns.set_style('white')
    sns.set_context('talk')

    # First layer: model accuracy. Second layer, drawn over it on the same
    # axes: the random baseline.
    axes = sns.barplot(x='Model', y='Category', data=accuracy_table,
                       color='#c0cdf3')
    sns.barplot(x='Random', y='Category', data=accuracy_table,
                color='#4f73dd', ax=axes)

    # Accuracy is shown on a fixed percentage scale.
    axes.set_xlim(0, 100)
    axes.set_xlabel('Accuracy (%)', size=24)
    axes.set_ylabel('')
    axes.tick_params(axis='both', labelsize=22)
    sns.despine()

    return axes, accuracy_table

Exemple #2

0

Afficher le fichier

Fichier : dist_plotting.py Projet : sallamander/dsfuncs

def _plot_categorical_var_dist(var_data, ax, show): 
    """Plot a bar chart of the relative frequency of each category.

    This is a helper function called from plot_var_dist. It'll
    be used in the case that categorical data is passed in.

    Args:
        var_data: object exposing ``value_counts()`` (presumably a
            pandas.Series - confirm against plot_var_dist).
        ax: matplotlib.pyplot.Axes object
            This may or may not be None, depending on what
            was passed from plot_var_dist. When None, seaborn draws on
            the current axes.
        show: bool
            If true, ``plt.show()`` is called after drawing.
    """

    var_data_counts = var_data.value_counts()
    var_data_percs = var_data_counts / var_data_counts.sum()

    # barplot falls back to the current axes when ax is None and returns the
    # axes it drew on, so one call covers both cases. x/y are passed by
    # keyword because recent seaborn releases no longer accept them
    # positionally.
    ax = sns.barplot(x=var_data_percs.index, y=var_data_percs.values,
                     palette="BuGn_d", ax=ax)
    _add_bar_text(ax, ax.patches, var_data_percs.values)

    if show:
        plt.show()

Exemple #3

0

Afficher le fichier

Fichier : plot.py Projet : DBlackKat/Sentiment-Data-Analysis

def sentimentAccuracy(tickeName):
    """Show three stacked bar plots of log return against sentiment.

    Loads the frame for ``tickeName`` from the 'resultsMKII' results via
    ``call_data`` and plots its '1 day', '2 day' and '3 day' columns against
    its 'Sentiment' column, one subplot per column, without error bars.

    Args:
        tickeName: identifier handed to ``call_data``.
    """
    path2 = 'resultsMKII'
    # The parameter itself is used here; the body previously referenced an
    # undefined name (``tick_Name``).
    frame2 = call_data(tickeName, path2)

    n_days = 3
    index = list(range(len(frame2)))
    result = {'Sentiment': pd.Series(list(frame2['Sentiment'].values),
                                     index=index)}
    for day in range(1, n_days + 1):
        result['logReturn%d' % day] = pd.Series(
            list(frame2['%d day' % day].values), index=index)

    # NOTE(review): the ``sns.plt`` alias does not exist in recent seaborn
    # releases - confirm the pinned version or switch to matplotlib.pyplot.
    for day in range(1, n_days + 1):
        sns.plt.subplot(n_days, 1, day)
        axes = sns.barplot(x="Sentiment", y='logReturn%d' % day, ci=None,
                           data=result)
        axes.set(xlabel='Sentiment', ylabel='Day %d' % day)
    sns.plt.show()

Exemple #4

0

Afficher le fichier

Fichier : plotting.py Projet : Hushpar/titanic_ml

def p6(data):
    """Show Age-by-Title box plots above Survived-by-Title bars.

    The two panels share the x axis; the box plot is fed the rows of
    ``data`` sorted by "Age", the bar plot the rows as given.
    """
    figure, panels = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    upper_panel, lower_panel = panels

    rows_by_age = data.sort_values("Age")
    sns.boxplot(x="Title", y="Age", data=rows_by_age, ax=upper_panel)
    sns.barplot(x="Title", y="Survived", data=data, ax=lower_panel)

    plt.show()

Exemple #5

0

Afficher le fichier

Fichier : plot.py Projet : DBlackKat/Sentiment-Data-Analysis

def historyEffectOnSentiment(tickeName):
    """Show five stacked bar plots of past log returns against sentiment.

    Loads the frame for ``tickeName`` from the 'resultsMKII' results via
    ``call_data`` and plots its '-1 day' ... '-5 day' columns against its
    'Sentiment' column, one subplot per column.

    Args:
        tickeName: identifier handed to ``call_data``.
    """
    path2 = 'resultsMKII'
    # The parameter itself is used here; the body previously referenced an
    # undefined name (``tick_Name``).
    frame2 = call_data(tickeName, path2)

    n_days = 5
    index = list(range(len(frame2)))
    result = {'Sentiment': pd.Series(list(frame2['Sentiment'].values),
                                     index=index)}
    for day in range(1, n_days + 1):
        result['logReturn%d' % day] = pd.Series(
            list(frame2['-%d day' % day].values), index=index)

    # NOTE(review): the ``sns.plt`` alias does not exist in recent seaborn
    # releases - confirm the pinned version or switch to matplotlib.pyplot.
    for day in range(1, n_days + 1):
        sns.plt.subplot(n_days, 1, day)
        axes = sns.barplot(x="Sentiment", y='logReturn%d' % day, data=result)
        axes.set(xlabel='Sentiment', ylabel='Day -%d' % day)
    sns.plt.show()

Exemple #6

0

Afficher le fichier

Fichier : run_xgb_param_search.py Projet : Quasi-quant2010/Stacking

def get_xgb_feature_importance_plot(best_param_, experiment_, 
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):
    """Fit an XGBClassifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: dict of parameters for ``XGBClassifier.set_params``.
            A 'model_type' entry, if present, is ignored. The caller's dict
            is not modified.
        experiment_: object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: currently unused; the output directory is built from
            ``Config.get_string('data.path')`` plus 'graph'.
        png_fname: file name of the saved figure.
        score_threshold: quantile of the positive scores used as the cut-off
            when there are more than 20 features; features scoring below it
            are dropped from the plot.

    Returns:
        True, once the figure has been written and closed.
    """
    # 1. fit the model and collect the per-feature scores
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    # Filter on a copy so that neither a missing key nor the caller's dict
    # is a concern ('model_type' is presumably bookkeeping rather than an
    # XGBoost parameter - confirm with the caller).
    params = {k: v for k, v in best_param_.items() if k != 'model_type'}
    clf.set_params(**params)
    clf.fit(train_X, train_y)
    # NOTE(review): newer xgboost releases expose this as get_booster() -
    # confirm against the pinned version.
    index2feature = clf.booster().get_fscore()
    # list(...) keeps the construction working when keys()/values() are views.
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # A boolean mask compares against the exact cut-off; formatting it
        # into a query string with %f would round it to six decimals.
        fis = fis[fis['score'] >= cutoff]

    # 2. plot the feature importances
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    # 3. save and release the figure
    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True

Exemple #7

0

Afficher le fichier

Fichier : cluster_1.py Projet : iandriver/RNA-sequence-tools

def clust_stability(log2_expdf_gene, iterations=16):
    """Measure how stable the clustered column order is as more genes are used.

    For gene counts from 100 up to (but excluding) 1000, in steps of
    ``round(1000 / iterations)``, the data is run through ``plot_PCA`` and
    ``clust_heatmap`` and the resulting column order is compared, with
    ``difflib.SequenceMatcher``, against the previous order and against the
    order obtained at 100 genes. Both ratio series are drawn as bar plots,
    saved as 'clustering_stability.pdf' and shown.

    Args:
        log2_expdf_gene: table handed to ``plot_PCA`` and indexed with its
            result (presumably a DataFrame with genes as columns - confirm).
        iterations: controls the step between successive gene counts.

    Returns:
        List of ``(ratio_to_previous, ratio_to_first)`` tuples, one per gene
        count after the first.
    """
    sns.set(context='poster', font_scale = 1)
    sns.set_palette("RdBu_r")
    stability_ratio = []
    end_num = 1000
    iter_list = range(100,int(round(end_num)),int(round(end_num/iterations)))
    for gene_number in iter_list:
        title= str(gene_number)+' genes plot.'
        top_pca = plot_PCA(log2_expdf_gene, num_genes=gene_number, title=title)
        top_pca_by_gene = log2_expdf_gene[top_pca]
        cell_linkage, plotted_df_by_gene, col_order = clust_heatmap(top_pca, top_pca_by_gene, num_to_plot=gene_number, title=title)
        if gene_number == 100:
            # First pass: this order is both the running and the fixed reference.
            s1 = col_order
            s0 = col_order
        else:
            s2= col_order
            sm_running = difflib.SequenceMatcher(None,s1,s2)
            sm_first = difflib.SequenceMatcher(None,s0,s2)
            stability_ratio.append((sm_running.ratio(), sm_first.ratio()))
            s1=col_order
        plt.close()
    # The first gene count has nothing to be compared against.
    x = list(iter_list)[1:]
    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    y1= [m[0] for m in stability_ratio]
    y2= [m[1] for m in stability_ratio]
    # x/y are passed by keyword: recent seaborn releases reject positional x/y.
    sns.barplot(x=x, y=y1, palette="RdBu_r", ax=ax1)
    ax1.set_ylabel('Running ratio (new/last)')
    sns.barplot(x=x, y=y2, palette="RdBu_r", ax=ax2)
    ax2.set_ylabel('Ratio to 100')
    # NOTE(review): ``filename`` is not defined in this function; it must be
    # provided by the enclosing module - confirm.
    plt.savefig(os.path.join(filename,'clustering_stability.pdf'), bbox_inches='tight')
    plt.show()
    plt.close()
    return stability_ratio

Exemple #8

0

Afficher le fichier

Fichier : frame.py Projet : TAKSIM/camp

    def createSubOverviewPage(self):
        """Build the layout holding the 'Liability Overview' bar chart.

        Queries LIABILITY rows expiring on or after ``self.sysdate`` and
        draws, per expiry date, the summed amount over the summed
        amount-plus-return on a figure canvas parented to a new widget.

        Returns:
            The QGridLayout containing that widget.
        """
        layout = QtGui.QGridLayout()
        w = QtGui.QWidget()
        sns.set(style="whitegrid")
        f, ax = plt.subplots(figsize=(20, 12))
        canvas = figureCanvas(f)
        canvas.setParent(w)
        q = QtSql.QSqlQuery("""SELECT EXP_DATE, SUM(AMOUNT), SUM(AMOUNT*(1+EXP_RETURN*(datediff(EXP_DATE, SETTLE_DATE)+1)/36500.0)) FROM LIABILITY WHERE EXP_DATE>='%s' GROUP BY EXP_DATE ORDER BY EXP_DATE"""%self.sysdate.date().toPyDate())
        dates, vals = [], []
        while q.next():
            dates.append(q.value(0).toDate().toPyDate().isoformat())
            vals.append((q.value(1).toDouble()[0], q.value(2).toDouble()[0]))
        data = pd.DataFrame(vals, index=dates, columns=['Amount', 'Total Return'])

        # Back layer (pastel): the 'Total Return' column, labelled 'Interest'.
        sns.set_color_codes("pastel")
        sns.barplot(x='Total Return', y=dates, data=data,
                    label='Interest', color="b", ax=ax)

        # Front layer (muted): the 'Amount' column, labelled 'Principal',
        # drawn over the back layer.
        sns.set_color_codes("muted")
        sns.barplot(x='Amount', y=dates, data=data,
                    label="Principal", color="b", ax=ax)

        # Add a legend and informative axis label
        ax.legend(ncol=2, loc="upper right", frameon=True)
        ax.set(ylabel="Maturity Date", title='Liability Overview')
        sns.despine(left=True, bottom=True)

        layout.addWidget(w, 0, 0, 1, 1)
        return layout

Exemple #9

0

Afficher le fichier

Fichier : visr.py Projet : CoAxLab/radd

def plot_simdf_summary(simdf):
    """Draw a 2x2 summary of a simulation table.

    Top row: bar plots of 'rt' and 'switch' per 'choice' (targets A-D).
    Bottom row: time-series plots over 'trial' (unit 'agent') of the
    'vd<target>' and 'vi<target>' columns, one line per target.

    Args:
        simdf: table with the columns named above (presumably a pandas
            DataFrame - it is grouped with ``groupby``).
    """
    f, axes = plt.subplots(2, 2, figsize=(12,8))
    a1, a2, a3, a4 = axes.flatten()
    targets=['A', 'B', 'C', 'D']
    clrs = ['#3572C6',  '#c44e52', '#8172b2', '#83a83b']
    targetColors = dict(zip(targets,clrs))
    sns.barplot(x='choice', y='rt', data=simdf, ax=a1, order=targets, palette=targetColors)
    sns.barplot(x='choice', y='switch', data=simdf, ax=a2, order=targets, palette=targetColors)
    a1.set_ylabel('Response Time (ms)', fontsize=13)
    a2.set_ylabel('P(Switch)', fontsize=13)

    # Zoom the y ranges around the per-choice means; the group means are
    # computed once and shared by both panels.
    choice_means = simdf.groupby('choice').mean()
    rts = choice_means.rt.values
    sw = choice_means.switch.values
    a1.set_ylim(rts.min()*.85, rts.max()*1.15)
    a2.set_ylim(sw.min()*.50, sw.max()*1.20)

    for target in targets:
        tcolor=targetColors[target]
        sns.timeseries.tsplot(data=simdf, time='trial', unit='agent', value='vd'+target, ax=a3, color=tcolor)
        sns.timeseries.tsplot(data=simdf, time='trial', unit='agent', value='vi'+target, ax=a4, color=tcolor)
    a3.legend(loc=0)
    f.subplots_adjust(hspace=.35, wspace=.4)
    a3.set_ylabel('$v^G_t$', fontsize=16)
    a4.set_ylabel('$v^N_t$', fontsize=16)
    a3.set_xlabel('Trial ( $t$ )', fontsize=13)
    a4.set_xlabel('Trial ( $t$ )', fontsize=13)
    plt.subplots_adjust(wspace=.4)
    sns.despine()

Exemple #10

0

Afficher le fichier

Fichier : profile.py Projet : Stefannn/PyUtil

def barplot_top_n_functions(df, n, sort_criterium='tot_time', show_std=True):
    '''
    Barplot of the n most time consuming functions (sorted by sort_criterium)
    df: panda dataframe (e.g. via get_df_from_stats()); its
        (sort_criterium, 'mean'), (sort_criterium, 'std'),
        ('tot_time', 'mean') and 'flf' columns are read
    n: number of functions to show
    sort_criterium: first-level column name to sort by (its 'mean' is used)
    show_std: if True, pass the 'std' column to the plot as xerr
    returns: the matplotlib axes the bars were drawn on
    '''
    tt = ('tot_time', 'mean')
    s_c = (sort_criterium, 'mean') # sort criterium including mean
    total_time = df[tt].sum()
    # sort_values replaces DataFrame.sort(columns=...), which current pandas
    # no longer provides.
    data = df.sort_values(by=[s_c], ascending=False).iloc[0:n]
    topn_time = data[tt].sum()
    frac_time = topn_time / total_time
    errs = data[(sort_criterium, 'std')] if show_std else None

    f, ax = plt.subplots(figsize=(10,5))
    sns.barplot(data=data, x=s_c, y='flf', color='b', xerr=errs)
    sns.despine(left=True, bottom=True)
    ax.set(ylabel="", xlabel=sort_criterium + " [s]")
    # write the fraction of total time spent in these n functions,
    # positioned relative to the first bar patch
    rect = ax.patches[0]
    txt = str(int(100*frac_time)) + "% of total runtime"
    ax.text(rect.get_width()*0.7, rect.get_height()*1.5, txt,
            ha="center", va="center")

    plt.tight_layout()
    return ax

Exemple #11

0

Afficher le fichier

Fichier : stats.py Projet : fxfactorial/macholibre

def gen_abnormalities_bar(good, bad):
    """Bar-plot the most common abnormalities of a good and a bad sample set.

    Both inputs are parsed with ``gen_abnormalities_data``; the results are
    used through ``most_common``/``keys``/``del`` (presumably
    collections.Counter objects - confirm). Each is reduced to the union of
    the two top-25 lists, then combined into one table that is printed and
    plotted with one bar group per abnormality, split by alignment.

    Args:
        good: input handed to ``gen_abnormalities_data`` for the good set.
        bad: input handed to ``gen_abnormalities_data`` for the bad set.
    """
    # Each print call takes one pre-formatted string so that the output is
    # the same under Python 2 and Python 3.
    print('Parsing good json.')
    g = gen_abnormalities_data(good)
    print('Total Good: %d' % len(g))
    print('Parsing bad json.')
    b = gen_abnormalities_data(bad)
    print('Total Bad: %d' % len(b))

    most_common = {name for name, _ in g.most_common(25)}
    most_common.update(name for name, _ in b.most_common(25))

    # Iterate over a snapshot of the keys: deleting while iterating a live
    # key view raises RuntimeError.
    for k in list(g.keys()):
        if k not in most_common:
            del g[k]
    print('Filtered Good: %d' % len(g))

    for k in list(b.keys()):
        if k not in most_common:
            del b[k]
    print('Filtered Bad: %d' % len(b))

    gabnormalities, gcounts = zip(*g.most_common())
    babnormalities, bcounts = zip(*b.most_common())
    gdata = pd.DataFrame({'alignment': 'good', 'abnormality': gabnormalities,
                          'count': gcounts})
    bdata = pd.DataFrame({'alignment': 'bad', 'abnormality': babnormalities,
                          'count': bcounts})

    # pd.concat stacks the two tables the way DataFrame.append did.
    data = pd.concat([gdata, bdata]).sort_values('count', ascending=False)
    print(data)

    sns.barplot(x='abnormality', y='count', hue='alignment', data=data)

Exemple #12

0

Afficher le fichier

Fichier : frame_features_rank.py Projet : vincent2610/cancer-assessment

    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
        """Create the frame and attach a feature-score ranking bar plot.

        The train/test splits, evaluator, data frame and console are stored
        on the instance. Feature scores come from ``SelectKBest(f_classif)``
        fitted on the training split; the feature names are the columns of
        ``df`` other than "NSP".
        """
        Tk.Frame.__init__(self, master)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.evaluator = evaluator
        self.df = df
        self.console = console

        frame_train = Tk.Frame(self)
        frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)

        # Only one figure is created: the one that is attached below.
        plt.figure(figsize=(12, 8))
        plt.subplot(111)

        # Score every feature against the training labels.
        selection = SelectKBest(f_classif, k=3)
        selection.fit(self.x_train, self.y_train)
        feature_scores = selection.scores_
        feature_names = df.columns.values
        feature_names = feature_names[feature_names != "NSP"]

        # Store as a DataFrame; list(...) materialises the pairs so that a
        # lazy zip iterator is never handed to pandas.
        rec = list(zip(feature_scores, feature_names))
        data = pd.DataFrame(rec, columns=["Score", "Feature"])

        sns.barplot(x="Feature", y="Score", data=data)
        plt.xticks(rotation=-90)
        plt.title("Cardiotocography Feature Scores Ranking")
        self.attach_figure(plt.gcf(), frame_train)

Exemple #13

0

Afficher le fichier

Fichier : ratings.py Projet : papousek/analysis

def plot_number_of_user_ratings_per_context():
    """Plot how many (user, context, term type) groups have each rating count.

    Ratings are grouped by user, context name and term type and each group's
    size is taken ('num'); the sizes are then grouped and counted ('count').
    The first 20 rows of that table are drawn as bars and the figure is
    saved as 'number_of_ratings'.
    """
    ratings = load_ratings_with_contexts()

    # Size of every (user, context, term type) group.
    group_sizes = (
        ratings
        .groupby(['user', 'context_name', 'term_type'])
        .apply(len)
        .reset_index()
        .rename(columns={0: 'num'})
    )

    # How many groups share each size.
    size_counts = (
        group_sizes
        .groupby('num')
        .apply(len)
        .reset_index()
        .rename(columns={0: 'count'})
    )

    shown = size_counts.head(n=20)
    bar_color = output.palette()[0]
    sns.barplot(x='num', y='count', data=shown, color=bar_color)
    plt.ylabel('Number of users')
    plt.xlabel('Number of ratings per context')
    output.savefig('number_of_ratings')

Exemple #14

0

Afficher le fichier

Fichier : graphs.py Projet : annamarie-g/capstone_project

def age_histogram(df_age):
    """Show a bar chart of the number of postings per age.

    Args:
        df_age: table with an 'age' column (grouped with ``groupby``).

    The tick labels and the in-plot annotations assume that bar ``i``
    corresponds to age ``i + 18``.
    """
    age_counts = df_age.groupby('age').age.count()

    y = age_counts.values
    x = [int(age) for age in age_counts.index]

    f, ax = plt.subplots(1,1, figsize=(12,8))
    # x/y are passed by keyword (recent seaborn releases reject positional
    # x/y) and the target axes are named explicitly.
    sns.barplot(x=x, y=y, palette=sns.dark_palette('#008080', reverse=True, n_colors=60), linewidth=0, ax=ax)
    ax.set_ylabel('Postings')
    ax.set_xlabel('')
    ax.set_title('Histogram of Postings by Age')
    x_ticks = [0]
    x_ticks.extend(range(2,95, 5))
    x_ticklabels = ['']
    x_ticklabels.extend(range(20,95,5))
    # There are more tick positions than labels; pad with blanks so both
    # lists have the same length (a mismatch is an error in recent
    # matplotlib). The surplus ticks stay unlabelled.
    x_ticklabels.extend([''] * (len(x_ticks) - len(x_ticklabels)))
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels)
    sns.despine(bottom=True, right=True)
    # plt is what the rest of this function uses; the sns.plt alias is not
    # available in recent seaborn releases.
    plt.xlim(-1, 90)
    for i,p  in enumerate(ax.patches):
        height = p.get_height()
        # Annotate every fifth age below 70.
        if ((i+18) % 5 == 0) and (i+18 < 70): 
            ax.text(p.get_x()-1, height + 4, i+18, fontsize=18)

    plt.show()

Exemple #15

0

Afficher le fichier

Fichier : bayesian_neural_network.py Projet : asudomoeva/probability

def plot_heldout_prediction(input_vals, probs,
                            fname, n=10, title=""):
  """Save a PNG plot visualizing posterior uncertainty on heldout data.

  Each of the `n` rows shows the input image, the overlaid per-sample class
  probabilities, and the mean class probabilities.

  Args:
    input_vals: A `float`-like Numpy `array` of shape
      `[num_heldout] + IMAGE_SHAPE`, containing heldout input images.
    probs: A `float`-like Numpy array of shape `[num_monte_carlo,
      num_heldout, num_classes]` containing Monte Carlo samples of
      class probabilities for each heldout sample.
    fname: Python `str` filename to save the plot to.
    n: Python `int` number of datapoints to vizualize.
    title: Python `str` title for the plot.
  """
  # Take the class axis from the data instead of assuming ten classes.
  class_ids = np.arange(probs.shape[-1])

  fig = figure.Figure(figsize=(9, 3*n))
  canvas = backend_agg.FigureCanvasAgg(fig)
  for i in range(n):
    ax = fig.add_subplot(n, 3, 3*i + 1)
    ax.imshow(input_vals[i, :].reshape(IMAGE_SHAPE[:-1]), interpolation="None")

    # One translucent bar layer per Monte Carlo sample. x/y are passed by
    # keyword: recent seaborn releases reject positional x/y.
    ax = fig.add_subplot(n, 3, 3*i + 2)
    for prob_sample in probs:
      sns.barplot(x=class_ids, y=prob_sample[i, :], alpha=0.1, ax=ax)
    ax.set_ylim([0, 1])
    ax.set_title("posterior samples")

    ax = fig.add_subplot(n, 3, 3*i + 3)
    sns.barplot(x=class_ids, y=np.mean(probs[:, i, :], axis=0), ax=ax)
    ax.set_ylim([0, 1])
    ax.set_title("predictive probs")
  fig.suptitle(title)
  fig.tight_layout()

  canvas.print_figure(fname, format="png")
  print("saved {}".format(fname))

Exemple #16

0

Afficher le fichier

Fichier : plot.py Projet : clarkfitzg/ballistics

def make_plots(groups):
    """Write four summary figures for the shot-group table.

    Saves 'points.png' (strip plot of moa per ammo), 'boxplot.png' (box plot
    of the same), 'avg_moa.png' (bar plot of the 'mean' column per ammo) and
    'qqplot.png' (distribution and probability plot of the non-null
    'standard' values).

    Args:
        groups: table with 'ammo', 'moa', 'mean' and 'standard' columns.
    """
    # x/y are passed by keyword throughout: recent seaborn releases no
    # longer accept them positionally.
    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")

    plt.clf()
    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")

    plt.clf()
    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")

    plt.clf()
    std = groups["standard"]
    std = std[std.notnull()]

    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")

Exemple #17

0

Afficher le fichier

Fichier : exploreData.py Projet : wrodezno/Bimbo

 def hbars(colrow,colcol,groupedData,tempcolors,title,ylab,xlab):
     #Input: colrow: Values along the x axis
     #       colcol: Values along the y-axis
     #  groupedData: Pandas DataFrame
     #   tempcolors: Bar colors in plot
     #        title: Plot title
     #        ylab:  Y-label
     #        xlab:  X-label
     #Output: Figure holding a horizontal bar plot with a value label at the
     #        end of each bar
     fig, ax = plt.subplots()                                                       #Plot Figure and axes handles 
     fig.set_size_inches(14, 14)
     sns.despine()
     ax = sns.barplot(x = colrow,y = colcol,data = groupedData,order = groupedData[colcol], color = tempcolors, ax = ax)
     plt.setp(ax.patches, linewidth=0)   
     ax.set_title(title,fontsize = 16)
     ax.set_ylabel(ylab,fontsize = 15)
     ax.set_xlabel(xlab,fontsize = 15)
     # The label geometry is read from the bars just drawn (width = value,
     # y = bar position), so no separate measuring plot is drawn on
     # whatever figure happened to be current.
     for p in ax.patches:
         xpos = p.get_width()
         ypos = p.get_y()
         # Relative gap between the bar end and its label.
         # NOTE(review): an "xpos > 15" case used to follow the "xpos > 8"
         # test and could never be reached; confirm the intended thresholds.
         if xpos > 50:
             t = .01
         elif xpos > 8:
             t = .008
         else:
             t = .1
         ax.text(xpos + t*xpos, ypos + .5, '%1.1f'%(xpos))
     return fig

Exemple #18

0

Afficher le fichier

Fichier : analysis_helpers.py Projet : judithfan/graphcomm

def plot_avg_rank_all_models(P,split_type='balancedavg1',saveout=True):
    '''
    Bar plot of 'target_rank' per 'adaptor' for one split, with error bars
    set to the standard deviation (ci='sd').
    The eight per-model tables come from get_avg_rank_all_models and are
    stacked before plotting; when saveout is true the figure is written to
    ./plots/avg_rank_all_models_<split_type>.pdf.
    '''
    HU,H0U,H1U,MU,M0U,M1U,M2U,M3U = get_avg_rank_all_models(P,split_type=split_type)
    stacked = pd.concat([HU,H0U,H1U,MU,M0U,M1U,M2U,M3U],axis=0)

    # Bar order on the x axis and the display label used for each bar.
    adaptor_order = [
        'human_combined_cost',
        'human_S0_cost',
        'human_combined_nocost',
        'multimodal_fc6_combined_cost',
        'multimodal_fc6_S0_cost',
        'multimodal_fc6_combined_nocost',
        'multimodal_conv42_combined_cost',
        'multimodal_pool1_combined_cost',
    ]
    display_labels = [
        'Context Cost Human',
        'NoContext Cost Human',
        'Context NoCost Human',
        'Context Cost HighAdaptor',
        'NoContext Cost HighAdaptor',
        'Context NoCost HighAdaptor',
        'Context Cost MidAdaptor',
        'Context Cost LowAdaptor',
    ]

    sns.set_context('talk')
    sns.set_style("ticks")
    fig = plt.figure(figsize=(4,8))
    ax = fig.add_subplot(111)
    sns.barplot(data=stacked, x='adaptor', y='target_rank', ci='sd',
                order=adaptor_order)
    plt.ylabel('mean rank of congruent sketch')
    plt.ylim([1,32])
    plt.xlabel('')
    ax.set_xticklabels(display_labels, rotation = 90, ha="left")
    plt.tight_layout()
    if not saveout:
        return
    plt.savefig('./plots/avg_rank_all_models_{}.pdf'.format(split_type))

Exemple #19

0

Afficher le fichier

Fichier : analysis_helpers.py Projet : judithfan/graphcomm

def plot_prop_congruent_all_models(P,split_type='balancedavg1',saveout=True):
    '''
    Bar plot of 'sign_diff_rank' per 'adaptor' for one split, with error
    bars set to the standard deviation (ci='sd') and a dashed reference
    line at 0.5.
    The eight per-model tables come from get_prop_congruent_all_models and
    are stacked before plotting; when saveout is true the figure is written
    to ./plots/prop_congruent_all_models_<split_type>.pdf.
    '''
    HU,H0U,H1U,MU,M0U,M1U,M2U,M3U = get_prop_congruent_all_models(P,split_type=split_type)
    stacked = pd.concat([HU,H0U,H1U,MU,M0U,M1U,M2U,M3U],axis=0)

    # Display label applied to each bar, in drawing order.
    display_labels = [
        'Context Cost Human',
        'NoContext Cost Human',
        'Context NoCost Human',
        'Context Cost HighAdaptor',
        'NoContext Cost HighAdaptor',
        'Context NoCost HighAdaptor',
        'Context Cost MidAdaptor',
        'Context Cost LowAdaptor',
    ]

    sns.set_context('talk')
    fig = plt.figure(figsize=(4,8))
    ax = fig.add_subplot(111)
    sns.barplot(data=stacked, x='adaptor', y='sign_diff_rank', ci='sd')
    plt.axhline(y=0.5,linestyle='dashed',color='k')
    plt.ylim([0,1])
    plt.ylabel('proportion context-congruent sketch preferred')
    plt.xlabel('')
    ax.set_xticklabels(display_labels, rotation = 90, ha="left")
    plt.tight_layout()
    if not saveout:
        return
    plt.savefig('./plots/prop_congruent_all_models_{}.pdf'.format(split_type))

Exemple #20

0

Afficher le fichier

Fichier : meme_processory.py Projet : saketkc/bio-tricks

def main(argv):
    """Compare a MEME motif's per-position counts with PhyloP scores.

    Parses the command line in ``argv``, reads motif number ``--motif`` from
    the MEME file and the third tab-separated column of the PhyloP file,
    prints the Pearson correlation between the two series and writes
    '<phylo>_motif<no>_scatter.png' and '<phylo>_motif<no>_trend.png'.

    Args:
        argv: list of command-line arguments (without the program name).
    """
    parser = argparse.ArgumentParser(description='Process meme files')
    parser.add_argument('-i', '--meme', metavar='<meme_out>', help='Meme input file', required=True)
    parser.add_argument('-m', '--motif', metavar='<motif_no>', help='Motif number', required=True, type=int)
    parser.add_argument('-c', '--phylo', metavar='<phylo_out>', help='PhyloP conservation scores', required=True)
    parsed = parser.parse_args(argv)

    # Both inputs are read inside ``with`` so the handles are always closed.
    # NOTE(review): assumes motifs.parse consumes the handle before
    # returning - confirm for the Biopython version in use.
    with open(parsed.meme) as handle:
        records = motifs.parse(handle, 'meme')
    record = records[parsed.motif-1]
    with open(parsed.phylo, 'r') as phylo_handle:
        phylo_scores = [float(line[2])
                        for line in csv.reader(phylo_handle, delimiter='\t')]

    # Single pre-formatted strings print the same under Python 2 and 3.
    print("Motif length {}".format(record.length))
    print("phylo length {}".format(len(phylo_scores)))
    profile = position_wise_profile(record.counts, record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    motif_scores = [position[0][1] for position in max_occur]
    pr = pearsonr(np.array(motif_scores), np.array(phylo_scores))
    print('Pearson correlation: {}'.format(pr))

    fig, ax = plt.subplots()
    ax = sns.regplot(y=np.array(motif_scores), x=np.array(phylo_scores), scatter=True, ax=ax)
    ax.set(ylabel="Count of most freq nucleotide", xlabel="PhyloP scores", title='CTCF | pearsonr = {}, p-val={}'.format(pr[0],pr[1]))
    fig.savefig('{}_motif{}_scatter.png'.format(parsed.phylo, parsed.motif))

    x = np.linspace(1,len(phylo_scores)+1,num=len(phylo_scores), endpoint=False)
    f, (ax1, ax2) = plt.subplots(2, 1)
    # x is passed by keyword: recent seaborn releases reject positional x/y.
    x1 = sns.barplot(x=x, y=np.array(motif_scores), ax=ax1)
    x2 = sns.barplot(x=x, y=np.array(phylo_scores), ax=ax2)
    x1.set(ylabel='Counts of most freq nucleotide', xlabel='Position in motif')
    x2.set(ylabel='Phylop Score', xlabel='Position in motif')
    f.tight_layout()
    f.savefig('{}_motif{}_trend.png'.format(parsed.phylo, parsed.motif))

Exemple #21

0

Afficher le fichier

Fichier : basicTests.py Projet : 5agado/conversation-analyzer

    def animate(i):
        """Redraw the enclosing scope's axes for the i-th group.

        The group is looked up as ``keys[i]`` in ``grouped``, ordered by
        "hour", previewed on stdout, and drawn as "lenMsgs" bars per hour
        split by "sender". The axes title is set to ``i``.
        """
        group_rows = grouped.get_group(keys[i])
        hourly = group_rows.sort_values("hour")

        print(hourly.head())
        ax.clear()
        sns.barplot(x="hour", y="lenMsgs", hue="sender", data=hourly, ax=ax)
        ax.set_title(i)

Exemple #22

0

Afficher le fichier

Fichier : models_impact.py Projet : thran/experiments2.0

def compare_more_models(experiments):
    """Plot pairwise model comparisons and the RMSE of every model.

    Args:
        experiments: dict mapping a label to a pair of callables; each is
            called with the label and the results are handed to
            ``compare_models`` and ``Evaluator`` (presumably a data factory
            and a model factory - confirm with the caller).

    Draws a 2x2 grid: heatmaps of the three values returned by
    ``compare_models`` for every ordered pair of labels (titled as
    correlations of difficulties, skills and predictions) and a bar plot of
    the 'rmse' entry of each model's evaluation report.
    """
    labels = sorted(experiments.keys())

    results_d = pd.DataFrame(index=labels, columns=labels, dtype=float)
    results_s = pd.DataFrame(index=labels, columns=labels, dtype=float)
    results_p = pd.DataFrame(index=labels, columns=labels, dtype=float)
    for label1 in labels:
        for label2 in labels:
            d, s, p = compare_models(experiments[label1][0](label1), experiments[label2][0](label2),
                                     experiments[label1][1](label1), experiments[label2][1](label2), plot=False)
            # Single .loc assignment (row label2, column label1) instead of
            # chained indexing, which may write to a temporary copy.
            results_d.loc[label2, label1] = d
            results_s.loc[label2, label1] = s
            results_p.loc[label2, label1] = p

    df = pd.DataFrame(columns=["labels", "rmse"])
    for label in labels:
        r = Evaluator(experiments[label][0](label), experiments[label][1](label)).get_report()
        df.loc[len(df)] = (label, r["rmse"])

    plt.subplot(221)
    plt.title("Correlations of difficulties")
    sns.heatmap(results_d)
    plt.subplot(222)
    plt.title("Correlations of skills")
    sns.heatmap(results_s)
    plt.subplot(223)
    plt.title("Correlations of predictions")
    sns.heatmap(results_p)
    plt.subplot(224)
    sns.barplot(x="labels", y="rmse", data=df)

Exemple #23

0

Afficher le fichier

Fichier : wrong_answers.py Projet : papousek/analysis

def plot_answer_frequency_all(wrong_only=True, contexts=20, show_names=False, normalize=True, top=5):
    """Plot, per context, layered bars of the top answer frequencies per term.

    Args:
        wrong_only: keep only rows where the answered term differs from the
            asked term.
        contexts: number of contexts to plot, taken in decreasing order of
            their 'count' from ``get_context_answers``.
        show_names: currently unused.
        normalize: divide each answer frequency by the total within its
            (context, asked term) group.
        top: number of answers stacked per asked term.

    The figure is saved as 'answer_frequencies_all'.
    """
    plot_cols = 4 if contexts >= 20 else 2
    plot_rows = math.ceil(contexts / plot_cols)
    context_answers = get_context_answers()['count'].to_dict()
    data_all = prepare_answer_frequency_all()
    plot_contexts = sorted(data_all['group_name'].unique(), key=lambda c: -context_answers[c])[:contexts]
    data_all = data_all[data_all['group_name'].isin(plot_contexts)]
    if wrong_only:
        data_all = data_all[data_all['term_name_asked'] != data_all['term_name_answered']]
    if normalize:
        def _normalize(group):
            group['answer_frequency'] = group['answer_frequency'] / group['answer_frequency'].sum()
            return group
        data_all = data_all.groupby(['group_name', 'term_name_asked']).apply(_normalize)
    rcParams['figure.figsize'] = 7.5 * plot_cols, 5 * plot_rows
    for i, (group_name, data) in enumerate(data_all.groupby('group_name')):
        plt.subplot(plot_rows, plot_cols, i + 1)
        # Per asked term: cumulative sums of its first `top` frequencies,
        # largest first.
        to_plot = {}
        for term, term_data in data.groupby('term_name_asked'):
            to_plot[term] = list(term_data['answer_frequency'].head(top).cumsum().sort_values(ascending=False, inplace=False))
        terms, terms_data = zip(*sorted(to_plot.items(), key=lambda x: x[1][-1], reverse=True))
        plt.title(group_name[:30])
        positions = list(range(len(terms)))
        # Draw the layers from largest to smallest so each later layer sits
        # on top; short lists are left-padded with zeros. The layer index
        # has its own name so it does not shadow the subplot counter, and
        # x/y are passed by keyword because recent seaborn releases reject
        # positional x/y.
        for layer in range(top):
            heights = [([0] * (top - len(x)) + x)[layer] for x in terms_data]
            sns.barplot(x=positions, y=heights, color=output.palette()[layer])
        plt.xticks(plt.xticks()[0], terms, rotation=90)
    output.savefig(filename='answer_frequencies_all')

Exemple #24

0

Afficher le fichier

Fichier : sb_models.py Projet : SWFarag/NRP-structure-classifier

 def ModelsSummary(self, df1, df2, scoring_metric):
     """Save a side-by-side bar chart of AUC and one other metric per model.

     df1 and df2 are reshaped with self.reshapeDf (for "AUC" and for the
     metric) and sorted in descending order of their value column. The
     metric name "balanced_accuracy" is displayed and looked up as "CCR".
     Bars are grouped by "Model" and split by "Descriptor". The figure is
     written to self.out_df_path + "SB_models_performance_summary_both.png"
     and then cleared.

     Returns the two sorted tables (AUC first).
     """
     metric = "CCR" if scoring_metric == "balanced_accuracy" else scoring_metric

     auc_table = self.reshapeDf(df1, "AUC")
     metric_table = self.reshapeDf(df2, metric)
     auc_table = auc_table.sort_values(by=['AUC'], ascending=False)
     metric_table = metric_table.sort_values(by=[metric], ascending=False)

     sns.set(style="whitegrid")
     fig, grid = plt.subplots(nrows=1, ncols=2, squeeze=False,
                              sharex=False, sharey=True)
     auc_axes, metric_axes = grid[0]

     # Figure-level layout and titles.
     fig.suptitle("Models Performance", fontsize=20)
     fig.tight_layout()
     fig.subplots_adjust(top=0.85)
     fig.set_figheight(6)
     fig.set_figwidth(14)
     auc_axes.set_title("AUC", fontsize=15)
     metric_axes.set_title(metric, fontsize=15)

     sns.set_context("paper", font_scale=1.6)
     sns.barplot(x="Model", y="AUC", hue="Descriptor", data=auc_table,
                 ax=auc_axes)
     sns.barplot(x="Model", y=metric, hue="Descriptor", data=metric_table,
                 ax=metric_axes)

     target_path = self.out_df_path + "SB_models_performance_summary_both.png"
     fig.savefig(fname=target_path, dpi=400, format="png")
     fig.clf()
     return auc_table, metric_table

Exemple #25

0

Afficher le fichier

Fichier : hw2.py Projet : grin3s/technosfera_dm_2

def plot_bar_counts(data):
    """Draw one bar per entry of ``data`` on a new 20x10 figure.

    ``data.keys().values`` supplies the bar positions and ``data.values``
    the bar heights; x tick labels are rotated by 90 degrees.
    """
    plt.figure(figsize=(20,10))
    plt.yticks(fontsize=8)
    plt.xticks(rotation=90)

    bar_names = data.keys().values
    bar_heights = data.values
    sns.barplot(x=bar_names, y=bar_heights)

    plt.xlabel('')
    plt.ylabel('Number of jobs',fontsize=10)

Exemple #26

0

Afficher le fichier

Fichier : storage_dashboard.py Projet : rolandet/tools

def backline_esc_by_region(data, component_chart_dir, component):
  """Save a three-panel bar chart of ``bl_esc_count`` per month, one panel per regional COE.

  Parameters:
    data: frame with at least 'region', 'month' and 'bl_esc_count' columns.
    component_chart_dir (str): directory that receives 'total_regional_bl_esc.png'.
    component (str): component name inserted into the chart title.

  Side effects: clears the current pyplot figure, sets the global
  ``figure.figsize`` rcParam to (25, 15), writes the PNG and closes the figure.
  """
  textcolor = 'black'
  palette = ['#aad962', '#fbbf45', '#ef6a32']
  # Values of data['region'] that roll up into each panel, in panel order.
  coe_regions = (
    ("EMEA", ['Africa', 'Europe', 'GMT', '']),
    ("Americas", ['America', 'Atlantic']),
    ("APJ", ['Australia', 'Asia', 'Pacific']),
  )

  plt.clf()
  plt.rcParams['figure.figsize'] = (25, 15)
  f, ax = plt.subplots(3)
  ax[0].set_title("Total " + component + " BL Escalations by Regional COEs", fontsize=40)
  f.subplots_adjust(hspace=0.4)

  for idx, (comp, regions) in enumerate(coe_regions):
    panel = ax[idx]
    in_region = data['region'].isin(regions)
    regional_data = data[in_region].groupby(['month']).sum().reset_index()
    # An empty selection leaves the panel blank but still labelled.
    if not regional_data.empty:
      sns.barplot(x="month", y="bl_esc_count", data=regional_data, color=palette[idx], ax=panel, errcolor='None')

    panel.set_ylabel(comp, fontsize=25, color=textcolor)
    panel.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    panel.set_xlabel("")
    for tick_label in panel.get_xticklabels():
      tick_label.set_rotation(15)
      tick_label.set_color(textcolor)
    for tick_label in panel.get_yticklabels():
      tick_label.set_color(textcolor)

  plt.savefig(component_chart_dir + '/total_regional_bl_esc.png')
  plt.close()

Exemple #27

0

Afficher le fichier

Fichier : xgbparser.py Projet : GuillaumeMohr/modelint

 def plot_parts(self, x, groups=None):
     """Plot each feature's (or feature group's) participation as a bar chart.

     Parameters
     ----------
     x: (p,) array
         input variables (p features)
     groups: dict
         maps a feature name to the group name it is summed under;
         features missing from the dict keep their own name
     """
     group_of = {} if groups is None else groups
     # Only the second and third values returned by predict() are used here.
     _, fp, b = self.predict(x)

     # Place each feature name at the position feat_map assigns it; the one
     # extra trailing slot is labelled 'bias'.
     features = [''] * (len(self.feat_map) + 1)
     for name, position in self.feat_map.items():
         features[position] = name
     features[-1] = 'bias'

     df = pd.DataFrame({'participation': np.r_[fp, b],
                        'feature': features})
     df['group'] = df.feature.apply(lambda f: group_of.get(f, f))
     df = df.groupby('group', as_index=False).sum()
     df['abs_participation'] = df['participation'].abs()

     # Largest absolute participation first.
     ranked = df.sort_values('abs_participation', ascending=False)
     sns.barplot(x='participation', y='group', data=ranked)

Exemple #28

0

Afficher le fichier

Fichier : plot_stats.py Projet : FAB4D/agrigater

def simple_barplot(xlabel, ylabels, stype, df, filename, exts):
    """Draw a bar plot of one or several columns against ``xlabel`` and save it.

    Parameters:
        xlabel (str): column used for the x axis.
        ylabels (list): column(s) to plot; with more than one, ``df`` is
            passed through ``tidy_df`` and bars are grouped by ``stype``.
        stype (str): statistic type; "coverage" and "nas" fix the y range to
            0-100, and "coverage" with a single column draws green bars.
        df: input frame; first passed through ``complete_df``.
        filename (str): base file name handed to ``save_plot``.
        exts (iterable): extensions (without the dot), one output per entry.

    The output directory is read from the module-level name ``outpath``.
    """
    with sns.axes_style('ticks'):
        fig = plt.figure()
        df = complete_df(df, xlabel)

        # Work out the y label and the barplot arguments that differ between
        # the multi-column and single-column cases, then plot once.
        if len(ylabels) > 1:
            df = tidy_df(df, xlabel, ylabels, stype)
            ylabel = 'missing percentage'
            barplot_kwargs = {'y': 'value', 'hue': stype}
        else:
            ylabel = ylabels[0]
            barplot_kwargs = {'y': ylabel}
            if stype == "coverage":
                barplot_kwargs['color'] = "green"
        plot = sns.barplot(x=xlabel, data=df, **barplot_kwargs)

        if stype in ("coverage", "nas"):
            plot.set_ylim([0, 100])
        sns.despine()
        plt.ylabel(ylabel)
        if xlabel == "year":
            plt.setp(plot.get_xticklabels(), rotation=45)
        plt.xlabel(xlabel)
        fig.add_axes(plot)
        fig.tight_layout()

        for ext in exts:
            save_plot(fig, outpath, filename, '.' + ext)
        plt.close()

Exemple #29

0

Afficher le fichier

Fichier : storage_dashboard.py Projet : rolandet/tools

def owner_barcharts(data, component_chart_dir, multi_month=False):
  """Show a three-panel bar chart of ``total_count`` per ``name``, one panel per regional COE.

  Parameters:
    data: frame with at least 'region', 'name' and 'total_count' columns
      (and 'month' when ``multi_month`` is true).
    component_chart_dir: accepted but not used by this function.
    multi_month (bool): when true, bars are additionally split by 'month',
      with the months in sorted order.

  Side effects: clears the current pyplot figure, sets the global
  ``figure.figsize`` rcParam to (25, 15), shows the figure and closes it.
  """
  textcolor = 'black'
  palette = ['#aad962', '#fbbf45', '#ef6a32']
  # Values of data['region'] that roll up into each panel, in panel order.
  coe_regions = (
    ("EMEA", ['Africa', 'Europe', 'GMT', '']),
    ("Americas", ['America', 'Atlantic']),
    ("APJ", ['Australia', 'Asia', 'Pacific']),
  )

  plt.clf()
  plt.rcParams['figure.figsize'] = (25, 15)
  f, ax = plt.subplots(3)

  for idx, (comp, regions) in enumerate(coe_regions):
    panel = ax[idx]
    regional_data = data[data['region'].isin(regions)]

    # Only the hue arguments differ between the two modes.
    hue_kwargs = {}
    if multi_month:
      sorted_hue = sorted(regional_data.month.unique())
      hue_kwargs = {"hue": "month", "hue_order": sorted_hue}
    sns.barplot(x="name", y="total_count", data=regional_data, color=palette[idx], ax=panel, **hue_kwargs)

    panel.set_ylabel(comp, fontsize=25, color=textcolor)
    panel.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    panel.set_xlabel("")
    for tick_label in panel.get_xticklabels():
      tick_label.set_rotation(15)
      tick_label.set_color(textcolor)
    for tick_label in panel.get_yticklabels():
      tick_label.set_color(textcolor)

  plt.show()
  plt.close()

Exemple #30

0

Afficher le fichier

Fichier : MICAnalysis.py Projet : Ernestyj/PyProj

def plotMICHist():
    """Show a bar chart of the module-level ``vs`` values against ``ks``.

    Both ``ks`` (x positions / category names) and ``vs`` (bar heights) are
    read from module scope. The function takes no arguments, returns
    nothing, and blocks on ``plt.show()``.
    """
    f, ax = plt.subplots(figsize=(12, 6))
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally, while the keyword form works on every release.
    sns.barplot(x=ks, y=vs, palette="BuGn_d", ax=ax)
    ax.set_ylabel("MIC")
    plt.xticks(rotation=90)
    # Extra bottom margin so the vertical tick labels are not clipped.
    f.subplots_adjust(bottom=0.2)
    plt.show()

Exemple #31

0

Afficher le fichier

Fichier : comapre_experiments.py Projet : CuriosityLabTAU/robots_vids

        axis=0)
    df_anova['choice'] = 1 - df_anova['choice']
    df_anova['qn'] = qn

    df_anova['choice'] = pd.to_numeric(df_anova['choice'])
    print('####### question: ' + q1 + ' #######')
    F, p = stats.f_oneway(df_anova['choice'][df_anova['experiment'] == 0],
                          df_anova['choice'][df_anova['experiment'] == 1],
                          df_anova['choice'][df_anova['experiment'] == 2])
    print('ANOVA: %.2f, %.4f' % (F, p))
    if p < .05:
        res = pairwise_tukeyhsd(df_anova['choice'], df_anova['experiment'])
        print(res)
    df_anova_qn = pd.concat((df_anova_qn, df_anova), axis=0)

    sns.barplot(data=df_anova_qn, x='qn', y='choice', hue='experiment')

# Reshape the wide table to long form: one row per (q, experiment) pair, with
# the cell value stored under 'percentage', then persist it as CSV.
# NOTE(review): value_vars takes every column except the last one - presumably
# 'q' is the last column of questions_df; confirm.
questions_df_multilinear = pd.melt(questions_df,
                                   id_vars=['q'],
                                   value_vars=questions_df.columns[:-1],
                                   var_name='experiment',
                                   value_name='percentage')
questions_df_multilinear.to_csv('data/paper/00questions_df_multilinear.csv')

# Plot one regression line per non-'q' column against 'q', all on a single
# shared axes; each line is labelled with its column name for the legend.
figure, ax = plt.subplots(1, 1)
for x in questions_df.columns:
    if x != 'q':
        sns.regplot(x='q', y=x, data=questions_df, label=x, ax=ax)
ax.set_ylabel('Prefer towards rational')
plt.legend()

Exemple #32

0

Afficher le fichier

Fichier : main.py Projet : padhidebasish5/CitiHackathon

# extracting hashtags from tweets labelled 0
HT_regular = hashtag_extract(train['tweet'][train['label'] == 0])

# extracting hashtags from racist/sexist tweets
HT_negative = hashtag_extract(train['tweet'][train['label'] == 1])

# unnesting list: sum(..., []) concatenates the per-tweet lists into one flat list
HT_regular = sum(HT_regular, [])
HT_negative = sum(HT_negative, [])

# Frequency table of the hashtags from label-0 tweets
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})

# selecting top 20 most frequent hashtags
d = d.nlargest(columns="Count", n=20)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()

# Same table and plot for the label-1 hashtags; `a`, `d` and `ax` are reused
a = nltk.FreqDist(HT_negative)
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})

# selecting top 20 most frequent hashtags
d = d.nlargest(columns="Count", n=20)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()

# tokenizing the words present in the training set (whitespace split per tweet)
tokenized_tweet = train['tweet'].apply(lambda x: x.split())

Exemple #33

0

Afficher le fichier

    label[label.isin(['WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])], 'subject'
])['duration'].count() * 1.28)
# Make sure the aggregated durations are held in a DataFrame
duration_df = pd.DataFrame(duration_df)

# Move the index levels back into columns and order rows by descending duration
plot_data = duration_df.reset_index().sort_values('duration', ascending=False)
# Replace the raw activity codes with short display names for the legend
plot_data['Activity'] = plot_data['Activity'].map({
    'WALKING_UPSTAIRS':
    'Upstairs',
    'WALKING_DOWNSTAIRS':
    'Downstairs'
})

# Plot the durations for staircase use: one bar group per subject, split by activity
plt.figure(figsize=(15, 5))
sns.barplot(data=plot_data, x='subject', y='duration', hue='Activity')
plt.title('Participants Compared By Their Staircase Walking Duration')
plt.xlabel('Participants')
plt.ylabel('Total Duration [s]')
plt.show()

# --------------
# Exclude the last two columns
# (presumably Activity and subject - confirm against the column order of `data`)
feature_cols = data.columns[:-2]

# Calculate the pairwise correlation matrix of the feature columns
correlated_values = data[feature_cols].corr()
#stack the data and convert to a dataframe

correlated_values = (correlated_values.stack().to_frame().reset_index().rename(
    columns={

Exemple #34

0

Afficher le fichier

# Quick look at the data: first rows, column info, and per-column null counts.
# The isnull().sum() result is not stored or printed here.
print(diabetes_df.head())

diabetes_df.info()
diabetes_df.isnull().sum()

# Correlation matrix of the columns, printed and drawn as a heatmap
corr = diabetes_df.corr()
print(corr)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

# New 18x15 figure; two cells of a 4x3 grid are used:
# cell 1 = number of rows per Outcome, cell 2 = Age per Outcome
plt.subplots(figsize=(18, 15))
plt.subplot(4, 3, 1)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
sns.countplot(x='Outcome', data=diabetes_df)
plt.subplot(4, 3, 2)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
sns.barplot(x='Outcome', y='Age', data=diabetes_df)
plt.show()

# Data analysis: one 20-bin histogram per column, for the first 8 columns
columns = diabetes_df.columns[:8]
# print(columns)
plt.subplots(figsize=(18, 15))
length = len(columns)
# enumerate yields the same (position, column) pairing as zipping with range(length)
for j, i in enumerate(columns):
    # The grid dimensions must be integers: `length / 2` is a float under
    # Python 3 and matplotlib rejects it, so use floor division.
    plt.subplot(length // 2, 3, j + 1)
    plt.subplots_adjust(wspace=0.2, hspace=0.5)
    diabetes_df[i].hist(bins=20, edgecolor='black')
    plt.title(i)
plt.show()

#analysis of diabetic classes

Exemple #35

0

Afficher le fichier

# drop Parch & SibSp from both the training and the test frame
data_train = data_train.drop(['SibSp', 'Parch'], axis=1)
data_test = data_test.drop(['SibSp', 'Parch'], axis=1)

import seaborn as sns
sns.set_style('whitegrid')
# Notebook-only: render figures inline (requires an IPython session)
get_ipython().magic(u'matplotlib inline')
# plot: two panels side by side sharing the x axis
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

# Left panel: number of rows per Family value, with 1 drawn before 0
sns.countplot(x='Family', data=data_train, order=[1, 0], ax=axis1)

# Right panel: mean of Survived per Family value, same ordering
family_perc = data_train[["Family",
                          "Survived"]].groupby(['Family'],
                                               as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1, 0], ax=axis2)

# Replace the 1/0 tick labels with readable names (matches order=[1, 0])
axis1.set_xticklabels(["With Family", "Alone"], rotation=0)

# In[ ]:

# cabin
# See how the presence or absence of a Cabin value relates to the distribution of Survived
fig = plt.figure(figsize=(13, 7))
fig.set(alpha=0.5)  # set the alpha parameter on the figure

# Survived value counts for rows with a Cabin value and for rows without one
Survived_cabin = data_train.Survived[pd.notnull(
    data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(
    data_train.Cabin)].value_counts()
df = pd.DataFrame({

Exemple #36

0

Afficher le fichier

Fichier : Code.py Projet : zubairahmed-ai/PimaIndianDiabetes

        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc
        # print '{0}: {1}'.format(name, acc * 100)

# Turn the accumulated accuracies into averages and collect them in `log`.
# NOTE(review): the divisor 10.0 is presumably the number of splits the
# accuracies were summed over - confirm against the loop that fills acc_dict.
for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / 10.0
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result and exists in every pandas version.
    log = pd.concat([log, log_entry])

plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
# plt.show()
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
from operator import itemgetter

# Classifiers ordered from best to worst average accuracy
sorted_dict = sorted(acc_dict.items(), key=itemgetter(1), reverse=True)

for k, v in sorted_dict:
    # Function-call form prints the same text on Python 2 and is valid on Python 3
    print("{0}-{1:.2%}".format(k, v))

ntrain = X_train.shape[0]
ntest = y_test.shape[0]
SEED = 0  # for reproducibility
NFOLDS = 5  # set folds for out-of-fold prediction
# NOTE(review): KFold(n, n_folds=...) is the legacy scikit-learn signature;
# left unchanged because the KFold import is outside this snippet.
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)

class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):

Exemple #37

0

Afficher le fichier

# ax=sns.barplot(x="smoothing_window(hour)", y="RMSE", hue="weather_data_used", data=tmp, ax=ax )
# ax.set_title("PPA data")
#
# ax = fig.add_subplot(122)
# tmp=df[df["test_set"]=="EPA"]
# ax=sns.barplot(x="smoothing_window(hour)", y="RMSE", hue="weather_data_used", data=tmp, ax=ax )
# ax.set_title("EPA data")
#
# plt.show()

# One figure with two side-by-side panels: RMSE per smoothing window, split by
# test set, for rows with and without weather data.
fig = plt.figure(figsize=(20, 10))

# Left panel: rows where weather_data_used is True
ax = fig.add_subplot(121)
tmp = df[df["weather_data_used"] == True]
ax = sns.barplot(x="smoothing_window(hour)",
                 y="RMSE",
                 hue="test_set",
                 data=tmp,
                 ax=ax)
ax.set_title("with weather data")

# Right panel: rows where weather_data_used is False (`ax` and `tmp` are reused)
ax = fig.add_subplot(122)
tmp = df[df["weather_data_used"] == False]
ax = sns.barplot(x="smoothing_window(hour)",
                 y="RMSE",
                 hue="test_set",
                 data=tmp,
                 ax=ax)
ax.set_title("no weather data")

plt.show()

Exemple #38

0

Afficher le fichier

df.isnull().sum()

# # Visualization

# In[91]:

import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# In[92]:

# Loan_Status aggregated per Gender (barplot's default estimator), without
# confidence-interval bars.
sns.barplot(x=df['Gender'],
            y=df['Loan_Status'],
            data=df,
            label="Relationship among Gender and Loan Approval Status",
            ci=None)

# In[93]:

# Same plot per Married category. The label used to repeat the Gender text
# (copy-paste slip); it now names the variable that is actually plotted.
sns.barplot(x=df["Married"],
            y=df['Loan_Status'],
            data=df,
            label="Relationship among Marital Status and Loan Approval Status",
            ci=None)

# In[94]:

sns.catplot(x="Married",
            y="ApplicantIncome",

Exemple #39

0

Afficher le fichier

Fichier : decision_tree.py Projet : ds-wook/Python-Machine-Learning

# %%
import graphviz

# Read the DOT description from tree.dot and build a graphviz Source from it.
# The Source object is only displayed when this is the last expression of a
# notebook cell.
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
# %%
import seaborn as sns
import numpy as np

# Print the fitted tree's feature importances, first as a rounded array and
# then one "name : value" line per feature.
print("Feature importances:\n{0}".format(
    np.round(dt_clf.feature_importances_, 3)))
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print("{0} : {1:.3f}".format(name, value))

# Horizontal bar chart: importance on x, feature name on y
sns.barplot(x=dt_clf.feature_importances_, y=iris_data.feature_names)
sns.despine()

# %%
import pandas as pd
import matplotlib.pyplot as plt

# Load the feature list as two whitespace-separated columns with no header row.
# The separator is a regex, so it is written as a raw string: '\s' inside a
# normal string literal is an invalid escape sequence and triggers a warning
# on modern Python. The runtime value is unchanged.
feature_name_df = pd.read_csv('./human_activity/features.txt',
                              sep=r'\s+',
                              header=None,
                              names=['column_index', 'column_name'])


def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(
        data=old_feature_name_df.groupby('column_name').cumcount(),

Exemple #40

0

Afficher le fichier

Fichier : kernel_Titanic.py Projet : nbrrawal/Kaggle

    train_predict = cl.predict(a_test)
    acct = accuracy_score(b_test, train_predict)
    if name in acct_dict: 
        acct_dict[name] += acct 
    else : 
        acct_dict[name] = acct 
    
# Turn the accumulated accuracies into averages and collect them in `log`.
# NOTE(review): the divisor 10.0 is presumably the number of splits the
# accuracies were summed over - confirm against the loop that fills acct_dict.
for cl in acct_dict:
    acct_dict[cl] = acct_dict[cl] / 10.0
    log_entry = pd.DataFrame([[cl, acct_dict[cl]]], columns=["Classifier", "Accuracy"])
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result and exists in every pandas version.
    log = pd.concat([log, log_entry])

plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
sns.set_color_codes('muted')
sns.barplot(x="Accuracy", y="Classifier", data=log, color="b")

Exemple #41

0

Afficher le fichier

Fichier : train_v003_001.py Projet : matsuken92/molecular

def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None,
                               plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    A function to train a variety of classification models with cross-validation.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - keyword parameters forwarded to the lgb / xgb / catboost model
    :params: folds - folds to split data
    :params: model_type - type of model to use: 'lgb', 'xgb', 'sklearn' or 'cat'
    :params: eval_metric - metric to use (only 'auc' is defined)
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - verbosity forwarded to lgb / xgb training
    :params: early_stopping_rounds - early stopping forwarded to lgb / xgb training
    :params: n_estimators - number of estimators / boosting rounds / iterations

    :returns: dict with keys 'oof', 'prediction', 'scores' and, for lgb with
              plot_feature_importance, 'feature_importance'
    """
    # Identity check: `columns == None` is evaluated element-wise for array-like
    # inputs (e.g. a pandas Index) and then fails when used as a condition.
    # NOTE(review): the default reads X.columns, so X must be a DataFrame
    # whenever `columns` is omitted.
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                            'catboost_metric_name': 'AUC',
                            'sklearn_scoring_function': metrics.roc_auc_score},
                    }

    result_dict = {}

    # out-of-fold predictions on train data, one column per distinct target value
    oof = np.zeros((len(X), len(set(y.values))))

    # averaged predictions on test data
    prediction = np.zeros((len(X_test), oof.shape[1]))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if isinstance(X, np.ndarray):
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            # NOTE(review): the code below needs per-class columns; whether
            # Booster.predict returns them depends on the objective in
            # `params` - confirm.
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)

            # Class probabilities are required here: `oof` holds one column per
            # class and the fold score below reads column 1. The flattened
            # predict() output used previously could neither fill `oof` nor be
            # indexed with [:, 1].
            y_pred_valid = model.predict_proba(X_valid)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid[:, 1])
            print(f'Fold {fold_n + 1}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators,
                                       eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                       loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True,
                      verbose=False)

            # NOTE(review): predict() is used here rather than predict_proba();
            # the code below expects per-class columns - confirm this branch works.
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid
        # The fold score uses the column at index 1 of the validation predictions
        scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid[:, 1]))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    # Test predictions were summed over the folds; turn them into an average
    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            # Names of the 50 features with the highest mean importance
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance

    return result_dict

Exemple #42

0

Afficher le fichier

Fichier : FullDatasetModel.py Projet : chenrq2005/cse6242_team76_project

   
#Algorithm Random Forest
#Visualize important features
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
pd.set_option('display.max_rows', 350)
pd.set_option('display.max_columns', 350)
plt.style.use('ggplot')

# Importances of the fitted classifier, indexed by column name, largest first
feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)

# Creating a bar plot, displaying only the top k features
k=20
# Slice values and names with the same k: the values used to be cut with a
# hard-coded 20, so changing k made the x and y lengths disagree.
sns.barplot(x=feature_imp[:k], y=feature_imp.index[:k])
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# NOTE(review): no artist in this plot is given a label, so this legend has
# nothing to show - consider removing it.
plt.legend()
plt.show()

# List top k important features (feature_imp is already sorted descending)
k=20
feature_imp.sort_values(ascending=False)[:k]

#Algorithm Random Forest
#Select the top important features, set the threshold
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.03

Exemple #43

0

Afficher le fichier

# The previous operation moved date and channel into the index
dateChannel_data = dateChannel_data.reset_index(
)  # recreate the date and channel columns
dateChannel_data['Année'] = dateChannel_data['date'].astype(
    str).str[:4]  # add the year (first four characters of the date string)
dateChannel_data.info()
# NOTE(review): the sorted result is not assigned, so dateChannel_data itself
# stays unsorted; the head(20) result is likewise not stored.
dateChannel_data.sort_values(by=['date', 'channel'])
dateChannel_data.head(20)
##########################################################################
# Overall bar chart: distribution of traffic by channel.
##########################################################################
sns.set()  # default aesthetic settings, similar in look to ggplot
fig, ax = plt.subplots()  # a single plot
# Bars show the sum of pageviews per channel (estimator=sum).
# NOTE(review): the bar order is taken from dfPVChannel, not from
# dateChannel_data - confirm both hold the same channel values.
sns.barplot(x='channel',
            y='pageviews',
            data=dateChannel_data,
            estimator=sum,
            order=sorted(dfPVChannel['channel'].unique()))
fig.suptitle(
    "Le canal 'search' est le premier contributeur en termes de trafic.",
    fontsize=14,
    fontweight='bold')
ax.set(
    xlabel="Canal",
    ylabel="Pages vues",
    title="Le canal 'direct' (fourre tout) est malheureusement important aussi."
)
# Caption placed below the axes (negative y in figure coordinates)
fig.text(.35,
         -.03,
         "Trafic Global - Pages vues selon les canaux depuis 2011",
         fontsize=9)

Exemple #44

0

Afficher le fichier

Fichier : plotting.py Projet : lytb123/fitbit-analyzer

def plot(data, columns, measureName, nrows, ncols, order=None):
    """Show one bar plot per entry of ``columns`` on an nrows x ncols grid.

    Parameters:
        data: frame containing ``measureName`` and every entry of ``columns``.
        columns: names of the columns plotted on the y axis, one subplot each,
            filled in row-major order.
        measureName: name of the column plotted on the x axis.
        nrows, ncols (int): grid shape; must provide at least
            ``len(columns)`` subplots, otherwise an IndexError is raised.
        order: optional category order forwarded to ``sns.barplot``.
    """
    # squeeze=False always returns a 2-D array of Axes, so flattening also
    # works for a 1x1 grid (where plt.subplots would otherwise return a bare
    # Axes object with no reshape method).
    f, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
    axes = axes.ravel()
    for i, c in enumerate(columns):
        sns.barplot(x=measureName, y=c, data=data, order=order, ax=axes[i])
    # Use pyplot directly: the `sns.plt` alias no longer exists in seaborn.
    plt.show()

Exemple #45

0

Afficher le fichier

Fichier : GlobalFootprintAPI.py Projet : ecarlosfonseca/Data-Analysis

print(countries_df.head())
# Per-column summary: name, non-null count, dtype, null count, distinct values
analysis_dict = {'variables': list(countries_df.columns.values),
                 'count': list(countries_df.count().values),
                 'v_types': list(countries_df.dtypes.values),
                 'n_null': list(countries_df.isnull().sum().values),
                 'n_uniques': list(countries_df.nunique().values)}

analysis = pd.DataFrame(analysis_dict)
print(analysis)

# Brazil rows only: the three land-type columns indexed by year, one line per column
brazil_land_type_df = countries_df.loc[countries_df['countryName'] == 'Brazil', ['year', 'cropLand', 'grazingLand', 'forestLand']].set_index('year')
ax1 = sns.lineplot(data=brazil_land_type_df, ci=None, legend='brief')
ax1.set_title('Brazil ground types areas evolution over the years')
ax1.set_ylabel('Areas (gha)')
ax1.set_xlabel('Years')
plt.show()

# The 40 largest fishingGround values among 2016 rows whose record is 'AreaTotHA'
fishingGround_df = countries_df.loc[(countries_df['year'] == 2016) & (countries_df['record'] == 'AreaTotHA'), ['countryName', 'fishingGround']].nlargest(40, 'fishingGround')
ax2 = sns.barplot(x='fishingGround', y='countryName', data=fishingGround_df)
ax2.set_title('2016 world top40 fishing ground')
ax2.set_xlabel('Fishing Ground (gha)')
plt.show()

# Portugal, 2016, 'AreaTotHA': transposed so the land types become the row labels
portugal_area_df = countries_df.loc[(countries_df['countryName'] == 'Portugal') & (countries_df['year'] == 2016) & (countries_df['record'] == 'AreaTotHA'), ['cropLand', 'grazingLand', 'forestLand']].T
# NOTE(review): after .T the column labels are the original row labels, so
# y=160552 is presumably the index label of the selected row - this breaks if
# the data is reloaded with different row numbering.
ax3 = portugal_area_df.plot.pie(y=160552, title='Portugal 2016 ground types area', legend=False)
ax3.set_ylabel('')
plt.show()

Exemple #46

0

Afficher le fichier

# In[8]:

# Find the columns that contain any null values and build a frame of their null counts
data_na = train.isnull().sum()
data_na = data_na[data_na > 0]
data_na = data_na.to_frame()
data_na.columns = ['count']
# The column names are exposed twice: as the index (named 'name') and as a 'name' column
data_na.index.names = ['name']
data_na['name'] = data_na.index

# In[9]:

# Bar plot of the number of null values in each of those columns
plt.figure(figsize=(25, 8))
sns.set(style='ticks')
sns.barplot(x='name', y='count', data=data_na)
plt.show()

# In[10]:

# The numerical features of the data. We will use the median of each and group movies based on production companies.
train.select_dtypes(include=[np.number]).columns

# In[11]:

# Keep only the columns with at least 80% non-null values: dropna(thresh=N)
# keeps a column when it has at least N non-null entries, so this drops every
# column that is more than 20% null.
train = train.dropna(thresh=0.80 * len(train), axis=1)

# In[12]:

#Check now to see which columns still have null values

Exemple #47

0

Afficher le fichier

Fichier : main.py Projet : shaswatsunny1998/accident-predictor

# Confusion matrix and accuracy of `model` on the test set; the accuracy is
# stored under the "XG Boost" key. predict() is run once for each metric.
cm_6=confusion_matrix(y_true=y_test,y_pred=model.predict(X_test))
acc_6=accuracy_score(y_test,model.predict(X_test))
dic["XG Boost"]=acc_6

# Collect every estimator name in `dic` with its accuracy as a percentage
Estimators=[]
Accuracy=[]
for i in dic:
    Estimators.append(i)
    Accuracy.append(dic[i]*100)
d={'Estimators':Estimators,"Accuracy":Accuracy}
df=pd.DataFrame(data=d)
# Bar chart of accuracy per estimator on a fixed 0-100 scale
plt.figure(num=3)
plt.ylim(0,100)
plt.title("All classification estimators with accuracy score")
sns.barplot(x='Estimators',y='Accuracy',data=df)
plt.show()


# Finalizing the model: from here on `model` refers to tree_final
model=tree_final
# Read the two acceleration values from stdin; int() rejects non-integer input
print("Enter the Value of X acceleration: ")
x=int(input())
print("Enter the Value of Y acceleration: ")
y=int(input())
# Scale the pair with the scaler `sc` before predicting; x and y are then
# overwritten with their scaled values
f=sc.transform(np.array([[x,y]]))
x=f[0][0]
y=f[0][1]
t=model.predict(np.array([[x,y]]))
# A predicted class of 0 is reported as a fall
if(t==0):
    print("The phone has fallen")

Exemple #48

0

Afficher le fichier

Fichier : company_dataset.py Projet : monicamurugesan/RF-Example

# Target: the column at position 10 of c1
Y=c1.iloc[:,10]
from sklearn.model_selection import train_test_split
# 50/50 split with a fixed seed
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5,random_state=0)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
# Reuse the statistics learned from the training set. Re-fitting the scaler on
# the test set scales the two sets differently and leaks test-set statistics
# into the evaluation.
X_test=sc.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
# Random forest of 200 entropy-criterion trees with a fixed seed
classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', random_state = 0)
classifier.fit(X_train,Y_train)
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
# Evaluate on the test set: accuracy (printed as a percentage), confusion
# matrix and classification report
pred=classifier.predict(X_test)
accuracy=accuracy_score(Y_test,pred)
print("Accuracy:",accuracy*100)
cm=confusion_matrix(Y_test,pred)
# Bare expressions such as `cm` below are only displayed in an interactive/notebook session
cm
cls1=classification_report(Y_test,pred)
print(cls1)
sns.pairplot(c1)
classifier.estimators_
sns.heatmap(cm, annot=True)
import graphviz 
# NOTE(review): `tree` and `clf` are not defined anywhere in this snippet -
# confirm they exist upstream (the forest fitted here is named `classifier`).
dot_data = tree.export_graphviz(clf, out_file='tree.dot')
classifier.feature_importances_
# Importances as a Series with a default integer index, sorted ascending
featureimp=pd.Series(classifier.feature_importances_).sort_values(ascending=True)
print(featureimp)
# NOTE(review): both axes receive the importance values (x is merely rounded);
# y was presumably meant to carry the feature names - confirm.
sns.barplot(x=round(featureimp,4),y=featureimp)
plt.xlabel("Feature importance")
plt.show()
i

Exemple #49

0

Afficher le fichier

Fichier : rdm_compare.py Projet : Brinkmak/URIAL

def rdm_compare(rdms, models, comp=None, plot=None):
    """Compare participant (target) RDMs with model RDMs.

    Parameters:
        rdms: either the path of a pickle file holding a dict with the keys
            'rdm' (list of participant RDMs) and 'id' (list of participant
            ids), such a dict itself, or a plain sequence of RDMs.  For a
            plain sequence the ids default to 'participant_0',
            'participant_1', ...
        models: same three forms as ``rdms``, for the model RDMs (default ids
            'model_0', ...).  'id' is assumed to be parallel to 'rdm'.
            A column called 'Unnamed: 0' is dropped from model data frames.
        comp: 'spearman' (default when None), 'kendalltaua' or 'pearson'.
            For 'spearman' the participant RDMs are rank-transformed and for
            'pearson' z-scored before averaging; for 'kendalltaua' they are
            used untransformed.
        plot: None (no plot, a message is printed), 'bar' or 'violin'.  Any
            other value silently produces no plot.

    Returns:
        (rdm_avg, model_comp, rdms_dist):
        rdm_avg -- DataFrame with the mean of the (transformed) participant
            RDMs;
        model_comp -- DataFrame with the columns 'participant', 'models' and
            'cor', one row per (model, participant) pair;
        rdms_dist -- square DataFrame built from the pairwise correlations
            between the group average and every model RDM.

    Raises:
        ValueError: if ``comp`` is not a supported method or no participant
            RDM is given.
    """
    import copy
    import pickle
    from itertools import combinations

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from nilearn.connectome import sym_matrix_to_vec, vec_to_sym_matrix
    from scipy.spatial import distance
    from scipy.stats import kendalltau, mstats, pearsonr, rankdata, spearmanr

    def _unpack(source, label):
        # Return (list of RDMs, list of ids) for a pickle path, dict or sequence.
        if isinstance(source, str):
            # NOTE: pickle.load can execute arbitrary code -- only open
            # files from a trusted origin.
            with open(source, 'rb') as handle:
                source = pickle.load(handle)
        if isinstance(source, dict):
            # Deep copy so the caller's objects are never modified.
            return list(copy.deepcopy(source['rdm'])), list(source['id'])
        collection = list(source)
        ids = ['{}_{}'.format(label, pos) for pos in range(len(collection))]
        return collection, ids

    def _rank_transform(matrix):
        return vec_to_sym_matrix(rankdata(sym_matrix_to_vec(matrix)))

    def _zscore_transform(matrix):
        return vec_to_sym_matrix(mstats.zscore(sym_matrix_to_vec(matrix)))

    def _identity(matrix):
        return matrix

    # comp -> (correlation function, transform applied to participant RDMs)
    # NOTE(review): scipy.stats.kendalltau computes tau-b by default, although
    # the option and the axis label say "tau a" -- confirm which is intended.
    methods = {
        'spearman': (spearmanr, _rank_transform),
        'kendalltaua': (kendalltau, _identity),
        'pearson': (pearsonr, _zscore_transform),
    }
    method = 'spearman' if comp is None else comp
    if method not in methods:
        raise ValueError(
            "comp must be None, 'spearman', 'kendalltaua' or 'pearson', "
            "got {!r}".format(comp))
    corr_fn, transform = methods[method]

    def _corr(first, second):
        # All three scipy functions return a (statistic, p-value) pair-like
        # result; indexing works across scipy versions.
        return corr_fn(np.ravel(first), np.ravel(second))[0]

    target_rdms, participant_ids = _unpack(rdms, 'participant')
    model_rdms, model_ids = _unpack(models, 'model')
    if not target_rdms:
        raise ValueError('at least one participant RDM is required')

    # Condition labels come from the first participant RDM when it is a
    # DataFrame; otherwise pandas falls back to a default integer index.
    target_conds = getattr(target_rdms[0], 'columns', None)

    # np.asarray replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    target_mats = [np.asarray(rdm) for rdm in target_rdms]
    model_mats = []
    for model_rdm in model_rdms:
        if isinstance(model_rdm, pd.DataFrame) and 'Unnamed: 0' in model_rdm:
            model_rdm = model_rdm.drop(columns='Unnamed: 0')
        model_mats.append(np.asarray(model_rdm))

    # Group average, computed once after every RDM has been transformed.
    subject_mats = [transform(mat) for mat in target_mats]
    group_mean = np.mean(subject_mats, axis=0)
    rdm_avg = pd.DataFrame(group_mean, columns=target_conds)

    # Upper noise ceiling: each participant against the full group average.
    upper_noise_ceiling = np.mean([_corr(mat, group_mean) for mat in subject_mats])

    # Lower noise ceiling: each participant against the average of the others.
    if len(subject_mats) > 1:
        leave_one_out = []
        for index, mat in enumerate(subject_mats):
            others = subject_mats[:index] + subject_mats[index + 1:]
            leave_one_out.append(_corr(mat, np.mean(others, axis=0)))
        lower_noise_ceiling = np.mean(leave_one_out)
    else:
        # No "other" participants to average.
        lower_noise_ceiling = np.nan

    # One correlation per (model, participant) pair, model-major order.  The
    # label columns are built in the same order so rows stay aligned even
    # when the model ids are not alphabetically sorted.
    list_cor_models = [_corr(sub, mod) for mod in model_mats for sub in subject_mats]
    model_comp = pd.DataFrame(
        {
            'participant': participant_ids * len(model_ids),
            'models': [mid for mid in model_ids for _ in participant_ids],
            'cor': list_cor_models,
        },
        columns=['participant', 'models', 'cor'],
    )

    # Second-order comparison: group average and all models against each other.
    snd_rdms = [group_mean] + model_mats
    ids_rdms = ['group average'] + model_ids
    pairwise = [_corr(first, second) for first, second in combinations(snd_rdms, 2)]
    rdms_dist = pd.DataFrame(distance.squareform(pairwise), columns=ids_rdms)

    ylabels = {
        'spearman': 'spearman correlation with target RDM',
        'pearson': 'pearson correlation with target RDM',
        'kendalltaua': 'kendall tau a correlation with target RDM',
    }
    plotters = {'bar': sns.barplot, 'violin': sns.violinplot}

    if plot is None:
        print('results will not be plotted')
    elif plot in plotters:
        ax = plotters[plot](x='models', y='cor', data=model_comp)
        # Faint lines at both noise ceilings plus a band between them.
        span = np.linspace(-20, 120, 1000)
        plt.plot(span, [upper_noise_ceiling] * 1000, 'r', alpha=0.1)
        plt.plot(span, [lower_noise_ceiling] * 1000, 'r', alpha=0.1)
        rect = plt.Rectangle((-20, lower_noise_ceiling), 10000,
                             (upper_noise_ceiling - lower_noise_ceiling),
                             color='r', alpha=0.5)
        ax.set_xticklabels(labels=list(model_ids))
        ax.set(ylabel=ylabels[method])
        ax.add_patch(rect)
        plt.tight_layout()

    return rdm_avg, model_comp, rdms_dist

Exemple #50

0

Afficher le fichier

Fichier : all_model_evaluation.py Projet : uic-cs418/cs418-project-RNAge

 # Two-sided Mann-Whitney U test for every pair of groups in `grps`; the rows
 # of `df` (one per group) supply the samples.  `keys` and `res_df` are
 # created before this excerpt.
 from itertools import combinations

 rows = []
 for g1, g2 in combinations(grps, 2):
     if g1 != g2:
         keys.append(str(g1 + '_' + g2))
         res = mannwhitneyu(list(df.loc[g1, :]),
                            list(df.loc[g2, :]),
                            alternative='two-sided')
         rows.append({'statistic': res[0], 'p-value': res[1]})
 # DataFrame.append was removed in pandas 2.0; add all rows in one concat.
 res_df = pd.concat(
     [res_df, pd.DataFrame(rows, columns=['statistic', 'p-value'])],
     ignore_index=True)
 # Fix: the index name must be passed by keyword -- the second positional
 # argument of pd.Index is the dtype.
 res_df = res_df.set_index(pd.Index(keys, name='Tissue'))
 corrected_p_values = multipletests(res_df['p-value'].astype(float))[1]
 res_df['cor_p-value'] = corrected_p_values
 res_df = res_df.sort_values(by='cor_p-value')

 # Fix: compute mean and standard deviation from the accuracy columns only,
 # before either summary column is added (previously the standard deviation
 # also included the freshly added 'Average Accuracy' column).
 accuracy_mean = df.mean(axis=1)
 accuracy_sdev = df.std(axis=1)
 df['Average Accuracy'] = accuracy_mean
 df['sdev'] = accuracy_sdev

 plt.figure(figsize=(10, 6))
 ax = sns.barplot(x=df.index.values,
                  y=df['Average Accuracy'],
                  yerr=df['sdev'],
                  capsize=.2)
 ax.set_title("Average Model Accuracies")
 ax.set_xlabel("Tissues")
 ax.set_ylabel("Average Accuracy - 1 SD")
 ax.set_xticklabels(labels=df.index.values, rotation=38)
 fig = ax.get_figure()
 fig.savefig("plots/all_model_accuracy.png", dpi=100, bbox_inches="tight")
 from IPython.display import display, HTML
 # Show the ten pairs with the smallest corrected p-values.
 display(res_df.head(10))

Exemple #51

0

Afficher le fichier

# In[ ]:

# Distribution of the "Fare" column; the legend shows its skewness.
m = sns.distplot(dataset["Fare"],
                 color="r",
                 label="Skewness : %.2f" % (dataset["Fare"].skew()))
m = m.legend(loc="best")

# Skewness is reduced (presumably by a transformation applied before this
# excerpt -- not visible here).

# ### 3.2 Categorical values

# #### Sex

# In[ ]:

# Bar plot of "Survived" for each value of "Sex" in the training data.
g = sns.barplot(x="Sex", y="Survived", data=train)
g = g.set_ylabel("Survival Probability")

# Females have a high survival rate.

# In[ ]:

# Mean of "Survived" for each sex.
train[["Sex", "Survived"]].groupby('Sex').mean()

# This clearly shows that females were more likely to survive than males,
# so Sex will play an important role in predicting survival.

# #### Pclass

# In[ ]:

Exemple #52

0

Afficher le fichier

Fichier : explanatory_analysis.py Projet : yolonda520/diamonds_python

####################################
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))

# One scatter panel per attribute (cut, color, clarity), each against price.
for panel, (attribute, shade) in enumerate([('cut', 'r'), ('color', 'y'), ('clarity', 'g')]):
    sns.scatterplot(x=diamond[attribute], y=diamond['price'], color=shade, ax=ax[panel])
plt.tight_layout()
plt.show()
# %% [markdown]
# ### 3.2.3 Bar plot
# %%
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))

# Same three attributes, this time as bar plots of price.
for panel, (attribute, shade) in enumerate([('cut', 'r'), ('color', 'y'), ('clarity', 'g')]):
    sns.barplot(x=diamond[attribute], y=diamond['price'], color=shade, ax=ax[panel])

# %% [markdown]
# Insight:<br>
# 1. cut at 4 level is best deal price;<br>
# 2. color at 1 is best average price, which is weird, because the worst color has the best price.<br>
#    Probably most customers are hard to tell which is the better color;<br>
# 3. clarity 2 is the best price, which is weird too, as the clarity 2 is not a good level.

# %%
# Mean price and mean carat of the diamonds whose color equals 5.
color_is_5 = diamond['color'] == 5
print(diamond['price'][color_is_5].mean())
print(diamond['carat'][color_is_5].mean())
# %%

Exemple #53

0

Afficher le fichier

# Add a grid to the current axes for better readability.
plt.grid()

#%%
#Visualization - 2

sns.set(style="white", context="talk")

# Set up the matplotlib figure: three stacked panels sharing the x axis.
f2, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(7, 5), sharex=True)

# Top panel: y_past_1 plotted against the x values 60, 90 and 120.
# (y_past_1 is defined outside this excerpt.)

x = [60, 90, 120]
y1 = y_past_1
sns.barplot(x=x, y=y1, palette="Reds", ax=ax1).set_title('Corridor 1')
ax1.axhline(0, color="k", clip_on=False)  # black baseline at y = 0
ax1.set_ylabel("Pasture")

# Middle panel: y_agri_1 against the same x values.

x = [60, 90, 120]
y1 = y_agri_1
sns.barplot(x=x, y=y1, palette="Reds", ax=ax2)
ax2.axhline(0, color="k", clip_on=False)  # black baseline at y = 0
ax2.set_ylabel("Agriculture")

# Data for the next panel (y_exp_s_1); the plotting call itself is not part
# of this excerpt.

x = [60, 90, 120]
y1 = y_exp_s_1

Exemple #54

0

Afficher le fichier

Fichier : analy.py Projet : heavy-snowy/python

zf = pd.DataFrame(zf, columns=columns)

# Take another look at the data set.
display(zf.head(n=2))

# Per region: the number of listings (count of 'price') and the mean of
# 'perPrice', each sorted in descending order and turned back into a frame.
listings_per_region = zf.groupby('Region')['price'].count()
df_house_count = listings_per_region.sort_values(ascending=False).to_frame().reset_index()
mean_per_price = zf.groupby('Region')['perPrice'].mean()
df_house_mean = mean_per_price.sort_values(ascending=False).to_frame().reset_index()

f, [ax3, ax1, ax2] = plt.subplots(3, 1, figsize=(20, 15))

# Mean 'perPrice' per region.
sns.barplot(x='Region', y='perPrice', palette="Blues_d", data=df_house_mean, ax=ax1)
ax1.set_title('深圳各个区域的每平方米的租金对比', fontsize=15)
ax1.set_xlabel('region', rotation=80, fontsize=1)
ax1.set_ylabel('unit price')

# Number of listings per region.
sns.barplot(x='Region', y='price', palette="Greens_d", data=df_house_count, ax=ax2)
ax2.set_title('深圳各个区域的出租房数量对比', fontsize=15)
ax2.set_xlabel('region')
ax2.set_ylabel('quantity')

Exemple #55

0

Afficher le fichier

# Line plot (Fig3): one curve per job series, drawn in a fixed order.
for job_series, job_label in (
        (job_admin, "admin"),
        (job_technician, "technician"),
        (job_blue, "blue-collar"),
        (job_entrepreneur, "entrepreneur"),
        (job_management, "management"),
        (job_retired, "retired"),
):
    plt.plot(job_series.keys(), job_series.values, label=job_label)
plt.ylabel("No of employees")
plt.xlabel("Age range")
plt.title("Jobs VS age (Fig3)")
plt.legend()
plt.show()

# Seaborn bar plot (Fig4): salary per job, split by marital status.
sns.barplot(x="job", y="salary", hue="marital", data=data)
plt.xticks(rotation=45)
plt.title("Jobs and salaries (Fig4)")
plt.show()

# Pie chart (Fig 5): share of each job value.
job_counts = data["job"].value_counts()
plt.pie(job_counts.values, autopct='%1.2f%%', labels=job_counts.keys())
plt.title("Pie chart of Job (Fig 5)")
plt.show()

# Scatter plot with salary on the x axis and age on the y axis.
plt.scatter(data["salary"], data["age"], color="red", alpha=0.5)
plt.xlabel("salary")
plt.ylabel("age")

Exemple #56

0

Afficher le fichier

Fichier : Kernel.py Projet : chfenix/kaggle

# Fill missing "Embarked" values with the most frequent value.
# Fix: assign the result back instead of calling fillna(..., inplace=True) on
# the column selection -- the in-place form operates on an intermediate
# object and is not guaranteed to change data_train (pandas warns about it
# and it has no effect under copy-on-write).
data_train["Embarked"] = data_train["Embarked"].fillna(
    data_train["Embarked"].mode().iloc[0])

print("==================Feture Fill End===================")
print(data_train.info())

# Check whether the survival probability depends on each feature.
plt.figure(figsize=(15, 10))
view_feature = ["Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked"]
for i, feature_name in enumerate(view_feature):
    plt.subplot(2, 3, (i + 1))
    # Group by the feature and average "Survived"; because Survived is 0/1,
    # the mean is the survival probability.
    sns.barplot(
        x=feature_name,
        y="Survived",
        data=data_train[[feature_name,
                         "Survived"]].groupby([feature_name],
                                              as_index=False).mean())
    # Alternative (judged less readable by the author): grouped counts of
    # survivors / non-survivors per feature value.
    # sns.countplot(x=feature_name, hue="Survived", data=data_train)

# Relationship between age (truncated to an integer) and survival rate.
data_train["Age_int"] = data_train["Age"].astype("int")
plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x="Age_int",
            y="Survived",
            data=data_train[["Age_int",
                             "Survived"]].groupby(["Age_int"],
                                                  as_index=False).mean())
# plt.show()
# plt.show()

Exemple #57

0

Afficher le fichier

# Keep only the rows whose 'dateRep' lies between start_date and end_date
# (both inclusive).
mask = (DatosOrdenadosPorFecha_df['dateRep'] >=
        start_date) & (DatosOrdenadosPorFecha_df['dateRep'] <= end_date)
fechasfiltradas_df = DatosOrdenadosPorFecha_df.loc[mask]

# Build the data frame with the columns used in the report.
# Fix: take an explicit copy so the column assignment below modifies an
# independent frame instead of a slice of fechasfiltradas_df (which triggers
# pandas' SettingWithCopyWarning).
grafico_df = fechasfiltradas_df[['dateRep', 'cases', 'moving14',
                                 'moving7']].copy()

# Convert the date back to a string so it is displayed correctly on the plot.
grafico_df['dateRep'] = grafico_df['dateRep'].astype(str)

# Draw the plot.
fig, ax = plt.subplots(1, 1)

grafico = sns.barplot(ax=ax,
                      x="dateRep",
                      y="cases",
                      data=grafico_df,
                      label="Nuevos Casos Diarios")

###Esto es para los índices de un gráfico catplot
#grafico.set_titles("Nuevos Casos en España", fontsize=30)
#grafico.set_xlabels("Fecha",fontsize=20)
#grafico.set_ylabels("España",fontsize=20)
#grafico.set_yticklabels(fontsize=10)
#grafico.set_xticklabels(fontsize=5)

#Gráfico de líneas media móvil de los últimos 14 días
graficomv14 = sns.lineplot(ax=ax,
                           x="dateRep",
                           y="moving14",
                           data=grafico_df,

Exemple #58

0

Afficher le fichier

Fichier : seaborn_data_disp.py Projet : miroslavgasparek/python_intro

####

# Make a bar graph of the mean impact force per ID with matplotlib;
# sem_impf supplies the error bars.
fig1 = plt.figure(1)
plt.bar(np.arange(4),
        mean_impf,
        yerr=sem_impf,
        ecolor='black',
        tick_label=['I', 'II', 'III', 'IV'],
        align='center')
# Fix: unit label was 'nM'; the other two plots of the same quantity use 'mN'.
plt.ylabel('impact force (mN)')
fig1.show()

# Easier plot with Seaborn
fig2 = plt.figure(2)
sns.barplot(data=df, x='ID', y='impf')
plt.xlabel('')
plt.ylabel('impact force (mN)')
fig2.show()

###
# Message: do not make bar graphs.
###

# Bee swarm plot: shows every measurement instead of a summary bar.
fig3 = plt.figure(3)
sns.swarmplot(data=df, x='ID', y='impf')
plt.margins(0.02)
plt.xlabel('')
plt.ylabel('impact force (mN)')
fig3.show()

Exemple #59

0

Afficher le fichier

Fichier : temp.py Projet : chandan1234-c/Shortlister

    # Accuracy (in percent) of the model fitted just before this excerpt.
    svcscore = model.score(X_test_array, y_test1) * 100

    # Random forest
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train_array, y_train)
    clf_pred = clf.predict(X_test_array)
    clfscore = clf.score(X_test_array, y_test1) * 100

    # k-nearest neighbours (k=11, Minkowski metric with p=2)
    knn = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2).fit(X_train_array, y_train)
    knnscore = knn.score(X_test_array, y_test1) * 100

    # Compare all accuracy scores in one bar chart.
    scores = [gnbscore, naivescore, svcscore, knnscore, dtscore, clfscore]
    algorithms = ["Gaussian naive bayes", "Bernoulli naive bayes", "Support Vector Machine", "K-Nearest Neighbors", "Decision Tree", "Random Forest"]
    sns.set(rc={'figure.figsize': (15, 8)})
    # Fix: pass x and y by keyword -- recent seaborn releases no longer accept
    # them positionally.
    sns.barplot(x=algorithms, y=scores)
    # Axis labels are set after plotting so the plot call cannot replace them.
    plt.xlabel("Algorithms")
    plt.ylabel("Accuracy score")

    final_model = naive_bayes

    # Save the model to disk; the context manager closes the file handle,
    # which the previous bare open() call never did.
    with open(save_model, 'wb') as model_file:
        pickle.dump(final_model, model_file)
    
    
def make_prediction(resumeNo):
    """Predict the label of one resume PDF with the saved model.

    The resume file is 'c<resumeNo + 1>.pdf' in a hard-coded directory.  The
    model and the vectorizer are unpickled from the module-level paths
    ``save_model`` and ``save_vector``; only the first element returned by
    ``slate.PDF`` for the file is used.

    Parameters:
        resumeNo: zero-based resume number (1 is added to build the file name).

    Returns:
        The first element of the model's prediction for that resume.
    """
    resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo+1) + '.pdf'
    # NOTE: pickle.load can execute arbitrary code -- only load files that
    # this project wrote itself.
    # Each file is opened in a context manager so its handle is closed
    # (the original left all three open).
    with open(save_model, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open(save_vector, 'rb') as vector_file:
        loaded_vector = pickle.load(vector_file)
    with open(resume, 'rb') as resume_file:
        first_page = slate.PDF(resume_file)[0]
    features = loaded_vector.transform([first_page])
    return loaded_model.predict(features)[0]

Exemple #60

0

Afficher le fichier

Fichier : Seaborn_visualizing_world_cup_data_project.py Projet : saadhahmed00/Learning-Seaborn

import codeacademylib3_seaborn
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Load the match data and add the total number of goals per match.
df = pd.read_csv('WorldCupMatches.csv')
print(df.head())

# Fix: the column name was misspelled as 'Home Team Gaols'.
df['Total Goals'] = df['Home Team Goals'] + df['Away Team Goals']

print(df.head())

sns.set_style('whitegrid')
# Fix: set_context has no `font_size` keyword (it raised TypeError); the
# base font size is set through the rc override instead.
sns.set_context('poster', rc={'font.size': 10})
f, ax = plt.subplots(figsize=(10, 25))
# Fix: use the column created above ('Total Goals', not 'total goals').
# NOTE(review): the year column is assumed to be named 'Year', matching the
# capitalisation of the other columns of this file -- confirm against the CSV.
ax = sns.barplot(data=df, x='Year', y='Total Goals')
ax.set_title('Year vs. Av Goals')

# Box plot of the goals per year from the second data set.
df_goals = pd.read_csv('goals.csv')
#print(df_goals.head())
# Fix: subplots lives in matplotlib.pyplot, not in seaborn.
f, ax2 = plt.subplots(figsize=(12, 7))
# set_context returns None, so its result is no longer stored in ax2.
sns.set_context('notebook', font_scale=1.25)
# Fix: the frame is called df_goals (was the undefined name d_goals).
ax2 = sns.boxplot(data=df_goals, x='year', y='goals', palette='Spectral')
ax2.set_title('Boxplot')

plt.show()