Code example #1
def plot_accuracy_with_random_by_category(true, predicted, sort=True):
    """plots the random
    Parameters:
        true (array): the observed values
        predicted (array): the observed values.
        sort (bool): if true sort by the
    Returns:
        res, ax
        """
    # compute per-category accuracy relative to random chance
    res = calcuate_accuracy_above_random_chance(true, predicted, sort)

    # set styles
    sns.set_style('white')
    sns.set_context('talk')

    # axes
    ax = sns.barplot(x='Model', y='Category', data=res, color='#c0cdf3')
    sns.barplot(x='Random', y='Category', data=res, color='#4f73dd', ax=ax)

    # configure the plot details
    plt.xlim(0, 100)
    plt.xlabel('Accuracy (%)', size=24)
    plt.ylabel('')
    ax.tick_params(axis='both', labelsize=22)
    sns.despine()

    return ax, res
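
The overlay idiom above draws the chance-level bars on the same Axes as the model bars, so the shorter bar reads as an inset baseline. A minimal self-contained sketch of the same idiom; the `res` frame below is made up, standing in for the helper's output:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# hypothetical accuracy table: model accuracy vs. chance, per category
res = pd.DataFrame({
    'Category': ['animals', 'tools', 'vehicles'],
    'Model': [92.0, 78.5, 85.0],
    'Random': [25.0, 25.0, 25.0],
})

# draw the longer bars first, then overlay the chance bars on the same Axes
ax = sns.barplot(x='Model', y='Category', data=res, color='#c0cdf3')
sns.barplot(x='Random', y='Category', data=res, color='#4f73dd', ax=ax)
ax.set_xlim(0, 100)
ax.set_xlabel('Accuracy (%)')
plt.show()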
Code example #2
File: dist_plotting.py Project: sallamander/dsfuncs
def _plot_categorical_var_dist(var_data, ax, show): 
    """Plot a boxplot of the continuous variable data inputted. 
    
    This is a helper function called from plot_var_dist. It'll 
    be used in the case that categorical data is passed in. 

    Args: 
        var_data: pandas.Series of categorical values
        ax: matplotlib.pyplot.Axes object 
            This may or may not be None, depending on what 
            was passed from plot_var_dist. 
        show: bool 
    """

    var_data_counts = var_data.value_counts()
    var_data_percs = var_data_counts / var_data_counts.sum()

    if ax: 
        sns.barplot(var_data_percs.index, 
                var_data_percs.values, palette="BuGn_d", ax=ax)
    else: 
        ax = sns.barplot(var_data_percs.index, 
                var_data_percs.values, palette="BuGn_d") 
    bars = ax.patches
    labels = var_data_percs.values
    _add_bar_text(ax, bars, labels) 

    if show: 
        plt.show()
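
Note that the positional x/y arguments above only work on seaborn < 0.12; newer releases require keywords. A version-safe sketch of the same percentage barplot (the Series here is a stand-in for var_data):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

var_data = pd.Series(['a', 'b', 'a', 'c', 'a', 'b'])     # stand-in data
var_data_percs = var_data.value_counts(normalize=True)   # one-step percentages

# seaborn >= 0.12 requires x/y as keywords
# (palette without hue also warns on >= 0.13; a single color avoids that)
ax = sns.barplot(x=var_data_percs.index, y=var_data_percs.values, color='g')
plt.show()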
Code example #3
def sentimentAccuracy(tickerName):
    path2 = 'resultsMKII'
    # one consistent parameter name (the original mixed tickeName/tick_Name)
    frame2 = call_data(tickerName, path2)
    logReturn = [[],[],[]]
    sentiment = []
    index = []
    for i in range(len(frame2)):
        for x in range(3):
            logReturn[x].append(frame2[str(x+1)+' day'].values[i])
        sentiment.append(frame2['Sentiment'].values[i])
        index.append(i)
    result = {'logReturn1':pd.Series(logReturn[0],index = index),
              'logReturn2':pd.Series(logReturn[1],index = index),
              'logReturn3':pd.Series(logReturn[2],index = index),
              'Sentiment':pd.Series(sentiment,index = index)}

    # sns.plt was removed from seaborn; call matplotlib's pyplot directly
    plt.subplot(3, 1, 1)
    aw = sns.barplot(x="Sentiment", y="logReturn1", ci=None, data=result)
    aw.set(xlabel='Sentiment', ylabel='Day 1')
    plt.subplot(3, 1, 2)
    ax = sns.barplot(x="Sentiment", y="logReturn2", ci=None, data=result)
    ax.set(xlabel='Sentiment', ylabel='Day 2')
    plt.subplot(3, 1, 3)
    ay = sns.barplot(x="Sentiment", y="logReturn3", ci=None, data=result)
    ay.set(xlabel='Sentiment', ylabel='Day 3')
    plt.show()
Code example #4
File: plotting.py Project: Hushpar/titanic_ml
def p6(data):
    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

    sns.boxplot(x="Title", y="Age", data=data.sort_values("Age"), ax=ax1)
    sns.barplot(x="Title", y="Survived", data=data, ax=ax2)

    plt.show()
Code example #5
def historyEffectOnSentiment(tickerName):
    # from mpl_toolkits.mplot3d import Axes3D
    path2 = 'resultsMKII'
    # one consistent parameter name (the original mixed tickeName/tick_Name)
    frame2 = call_data(tickerName, path2)
    logReturn = [[],[],[],[],[]]
    sentiment = []
    index = []
    for i in range(len(frame2)):
        for x in range(5):
            logReturn[x].append(frame2['-'+str(x+1)+' day'].values[i])
        sentiment.append(frame2['Sentiment'].values[i])
        index.append(i)
    result = {'logReturn1':pd.Series(logReturn[0],index = index),
              'logReturn2':pd.Series(logReturn[1],index = index),
              'logReturn3':pd.Series(logReturn[2],index = index),
              'logReturn4':pd.Series(logReturn[3],index = index),
              'logReturn5':pd.Series(logReturn[4],index = index),
              'Sentiment':pd.Series(sentiment,index = index)}
    # sns.plt was removed from seaborn; call matplotlib's pyplot directly
    plt.subplot(5, 1, 1)
    aw = sns.barplot(x="Sentiment", y="logReturn1", data=result)
    aw.set(xlabel='Sentiment', ylabel='Day -1')
    plt.subplot(5, 1, 2)
    ax = sns.barplot(x="Sentiment", y="logReturn2", data=result)
    ax.set(xlabel='Sentiment', ylabel='Day -2')
    plt.subplot(5, 1, 3)
    ay = sns.barplot(x="Sentiment", y="logReturn3", data=result)
    ay.set(xlabel='Sentiment', ylabel='Day -3')
    plt.subplot(5, 1, 4)
    az = sns.barplot(x="Sentiment", y="logReturn4", data=result)
    az.set(xlabel='Sentiment', ylabel='Day -4')
    plt.subplot(5, 1, 5)
    bx = sns.barplot(x="Sentiment", y="logReturn5", data=result)
    bx.set(xlabel='Sentiment', ylabel='Day -5')
    plt.show()
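
Both sentiment functions repeat one subplot block per day. A self-contained sketch of the same layout with a loop; the frame below is synthetic, standing in for what call_data would return:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
result = pd.DataFrame({
    'Sentiment': rng.choice(['neg', 'neu', 'pos'], size=90),
    'logReturn1': rng.normal(size=90),
    'logReturn2': rng.normal(size=90),
    'logReturn3': rng.normal(size=90),
})

fig, axes = plt.subplots(3, 1, sharex=True, figsize=(6, 8))
for day, ax in enumerate(axes, start=1):
    # errorbar=None is the seaborn >= 0.12 spelling of ci=None
    sns.barplot(x='Sentiment', y='logReturn{}'.format(day), data=result,
                errorbar=None, ax=ax)
    ax.set_ylabel('Day {}'.format(day))
plt.tight_layout()
plt.show()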
Code example #6
def get_xgb_feature_importance_plot(best_param_, experiment_, 
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1. 
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except KeyError:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    # NOTE: clf.booster() was renamed clf.get_booster() in later xgboost releases
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    # DataFrame.sort was removed in pandas 0.20; sort_values is the replacement
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)#, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
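
clf.booster(), DataFrame.sort, and dict-view columns in this example are all long-removed APIs. Under current xgboost/pandas, the core of the plot might look like the following sketch (the toy dataset is made up just to have a fitted model):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
clf = XGBClassifier(n_estimators=50).fit(X, y)

scores = clf.get_booster().get_fscore()     # get_booster() replaced booster()
fis = (pd.DataFrame({'name': list(scores.keys()),
                     'score': list(scores.values())})
       .sort_values('score', ascending=False))  # sort_values replaced sort

sns.barplot(x='score', y='name', data=fis, color='blue')
plt.tight_layout()
plt.show()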
Code example #7
def clust_stability(log2_expdf_gene, iterations=16):
    sns.set(context='poster', font_scale = 1)
    sns.set_palette("RdBu_r")
    stability_ratio = []
    total_genes = len(log2_expdf_gene.columns.tolist())
    end_num = 1000
    iter_list = range(100,int(round(end_num)),int(round(end_num/iterations)))
    for gene_number in iter_list:
        title= str(gene_number)+' genes plot.'
        top_pca = plot_PCA(log2_expdf_gene, num_genes=gene_number, title=title)
        top_pca_by_gene = log2_expdf_gene[top_pca]
        top_pca_by_cell = top_pca_by_gene.transpose()
        cell_linkage, plotted_df_by_gene, col_order = clust_heatmap(top_pca, top_pca_by_gene, num_to_plot=gene_number, title=title)
        if gene_number == 100:
            s1 = col_order
            s0 = col_order
        else:
            s2= col_order
            sm_running = difflib.SequenceMatcher(None,s1,s2)
            sm_first = difflib.SequenceMatcher(None,s0,s2)
            stability_ratio.append((sm_running.ratio(), sm_first.ratio()))
            s1=col_order
        plt.close()
    x= iter_list[1:]
    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    y1= [m[0] for m in stability_ratio]
    y2= [m[1] for m in stability_ratio]
    sns.barplot(x=list(x), y=y1, palette="RdBu_r", ax=ax1)
    ax1.set_ylabel('Running ratio (new/last)')
    sns.barplot(x=list(x), y=y2, palette="RdBu_r", ax=ax2)
    ax2.set_ylabel('Ratio to 100')
    plt.savefig(os.path.join(filename,'clustering_stability.pdf'), bbox_inches='tight')
    plt.show()
    plt.close()
    return stability_ratio
Code example #8
File: frame.py Project: TAKSIM/camp
    def createSubOverviewPage(self):
        layout = QtGui.QGridLayout()
        w = QtGui.QWidget()
        sns.set(style="whitegrid")
        f, ax = plt.subplots(figsize=(20, 12))
        canvas = figureCanvas(f)
        canvas.setParent(w)
        sns.set(style="whitegrid")
        q = QtSql.QSqlQuery("""SELECT EXP_DATE, SUM(AMOUNT), SUM(AMOUNT*(1+EXP_RETURN*(datediff(EXP_DATE, SETTLE_DATE)+1)/36500.0)) FROM LIABILITY WHERE EXP_DATE>='%s' GROUP BY EXP_DATE ORDER BY EXP_DATE"""%self.sysdate.date().toPyDate())
        dates, vals = [], []
        x_amt = range(0,1000000000,100000000)
        while q.next():
            dates.append(q.value(0).toDate().toPyDate().isoformat())
            vals.append((q.value(1).toDouble()[0], q.value(2).toDouble()[0]))
        data = pd.DataFrame(vals, index=dates, columns=['Amount', 'Total Return'])
        # Plot the total return (principal plus interest)
        sns.set_color_codes("pastel")
        sns.barplot(x='Total Return', y=dates, data=data,
                    label='Interest', color="b")

        # Overlay the principal on the same axes
        sns.set_color_codes("muted")
        sns.barplot(x='Amount', y=dates, data=data,
                    label="Principal", color="b")

        # Add a legend and informative axis label
        ax.legend(ncol=2, loc="upper right", frameon=True)
        ax.set(ylabel="Maturity Date", title='Liability Overview')
        sns.despine(left=True, bottom=True)

        layout.addWidget(w, 0, 0, 1, 1)
        return layout
Code example #9
File: visr.py Project: CoAxLab/radd
def plot_simdf_summary(simdf):
    f, axes = plt.subplots(2, 2, figsize=(12,8))
    a1, a2, a3, a4 = axes.flatten()
    targets=['A', 'B', 'C', 'D']
    clrs = ['#3572C6',  '#c44e52', '#8172b2', '#83a83b']
    targetColors = dict(zip(targets,clrs))
    sns.barplot(x='choice', y='rt', data=simdf, ax=a1, order=targets, palette=targetColors)
    sns.barplot(x='choice', y='switch', data=simdf, ax=a2, order=targets, palette=targetColors)
    a1.set_ylabel('Response Time (ms)', fontsize=13)
    a2.set_ylabel('P(Switch)', fontsize=13)
    rts = simdf.groupby('choice').mean().rt.values
    sw = simdf.groupby('choice').mean().switch.values
    a1.set_ylim(rts.min()*.85, rts.max()*1.15)
    a2.set_ylim(sw.min()*.50, sw.max()*1.20)
    for i, target in enumerate(targets):
        tcolor = targetColors[target]
        tdf = simdf[simdf.choice == target].reset_index()  # compare to the variable, not the literal 'target'
        # NOTE: tsplot was deprecated in seaborn 0.9 and later removed (lineplot replaces it)
        sns.timeseries.tsplot(data=simdf, time='trial', unit='agent', value='vd'+target, ax=a3, color=tcolor)
        sns.timeseries.tsplot(data=simdf, time='trial', unit='agent', value='vi'+target, ax=a4, color=tcolor)
    a3.legend(loc=0)
    f.subplots_adjust(hspace=.35, wspace=.4)
    a3.set_ylabel('$v^G_t$', fontsize=16)
    a4.set_ylabel('$v^N_t$', fontsize=16)
    a3.set_xlabel('Trial ( $t$ )', fontsize=13)
    a4.set_xlabel('Trial ( $t$ )', fontsize=13)
    plt.subplots_adjust(wspace=.4)
    sns.despine()
Code example #10
File: profile.py Project: Stefannn/PyUtil
def barplot_top_n_functions(df, n, sort_criterium='tot_time', show_std=True):
    '''
    Barplot of the n most time-consuming functions (sorted by sort_criterium)
    df: pandas dataframe (e.g. via get_df_from_stats())
    show_std: if True, draw error bars from the standard deviation
    returns: the Axes of the plot
    '''
    tt = ('tot_time', 'mean')
    s_c = (sort_criterium, 'mean') # sort criterium including mean
    total_time = df[tt].sum()
    data = df.sort_values(by=[s_c], ascending=False).iloc[0:n]  # DataFrame.sort was removed in pandas 0.20
    topn_time = data[tt].sum()
    frac_time = topn_time / total_time
    if show_std:
        errs = data[(sort_criterium, 'std')]
    else:
        errs = None

    f, ax = plt.subplots(figsize=(10,5))
    sns.barplot(data=data, x=s_c, y='flf', color='b', xerr=errs)
    sns.despine(left=True, bottom=True)
    ax.set(ylabel="", xlabel=sort_criterium + " [s]")
    # write the fraction of total time spent in these n functions
    rect = ax.patches[0] # first (widest) bar, used to position the text
    txt = str(int(100*frac_time)) + "% of total runtime"
    ax.text(rect.get_width()*0.7, rect.get_height()*1.5, txt,
            ha="center", va="center")

    plt.tight_layout()
    return ax
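
Positioning the annotation by hand via ax.patches, as above, predates Axes.bar_label (matplotlib >= 3.4), which places per-bar labels in one call. A small sketch with made-up timings:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.DataFrame({'flf': ['parse', 'solve', 'render'],
                     'tot_time': [4.2, 2.7, 1.1]})          # made-up timings
ax = sns.barplot(x='tot_time', y='flf', data=data, color='b')

for container in ax.containers:          # requires matplotlib >= 3.4
    ax.bar_label(container, fmt='%.1f s', padding=3)
plt.tight_layout()
plt.show()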
Code example #11
File: stats.py Project: fxfactorial/macholibre
def gen_abnormalities_bar(good, bad):
    print('Parsing good json.')
    g = gen_abnormalities_data(good)
    print('Total Good:', len(g))
    print('Parsing bad json.')
    b = gen_abnormalities_data(bad)
    print('Total Bad:', len(b))

    mcg = [x[0] for x in g.most_common(25)]
    mcb = [x[0] for x in b.most_common(25)]

    most_common = set(mcg + mcb)
    # materialize the keys so entries can be deleted while iterating (Python 3)
    for k in list(g.keys()):
        if k not in most_common:
            del g[k]
    print('Filtered Good:', len(g))

    for k in list(b.keys()):
        if k not in most_common:
            del b[k]
    print('Filtered Bad:', len(b))

    gabnormalities, gcounts = zip(*g.most_common())
    babnormalities, bcounts = zip(*b.most_common())
    gdata = pd.DataFrame({'alignment': 'good', 'abnormality': gabnormalities,
                          'count': gcounts})
    bdata = pd.DataFrame({'alignment': 'bad', 'abnormality': babnormalities,
                          'count': bcounts})

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    data = pd.concat([gdata, bdata]).sort_values('count', ascending=False)
    print(data)

    sns.barplot(x='abnormality', y='count', hue='alignment', data=data)
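
A self-contained version of the good/bad comparison, with two toy Counters standing in for what gen_abnormalities_data would return:

from collections import Counter
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

g = Counter({'bad_magic': 5, 'odd_size': 3, 'dup_load': 1})   # "good" binaries
b = Counter({'bad_magic': 2, 'odd_size': 9, 'no_sig': 4})     # "bad" binaries

frames = [pd.DataFrame({'alignment': label,
                        'abnormality': list(c.keys()),
                        'count': list(c.values())})
          for label, c in [('good', g), ('bad', b)]]
data = pd.concat(frames).sort_values('count', ascending=False)

sns.barplot(x='abnormality', y='count', hue='alignment', data=data)
plt.xticks(rotation=45)
plt.show()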
Code example #12
    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
        Tk.Frame.__init__(self, master)
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.evaluator = evaluator
        self.df = df
        self.console = console

        frame_train = Tk.Frame(self)
        frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)
        # names of the k best features
        plt.figure(figsize=(12, 8))
        plt.subplot(111)
        selection = SelectKBest(f_classif, k=3)
        selection.fit(self.x_train, self.y_train)
        feature_scores = selection.scores_
        feature_names = df.columns.values
        feature_names = feature_names[feature_names != "NSP"]
        kbest_feature_indexes = selection.get_support()
        kbest_feature_names = feature_names[kbest_feature_indexes]

        # save the scores as a DataFrame
        rec = zip(feature_scores, feature_names)
        data = pd.DataFrame(rec, columns=["Score", "Feature"])

        sns.barplot(x="Feature", y="Score", data=data)
        plt.xticks(rotation=-90)
        plt.title("Cardiotocography Feature Scores Ranking")
        self.attach_figure(plt.gcf(), frame_train)
Code example #13
File: ratings.py Project: papousek/analysis
def plot_number_of_user_ratings_per_context():
    nums = (load_ratings_with_contexts()
            .groupby(['user', 'context_name', 'term_type']).apply(len)
            .reset_index().rename(columns={0: 'num'})
            .groupby('num').apply(len)
            .reset_index().rename(columns={0: 'count'}))
    nums = nums.head(n=20)
    sns.barplot(x='num', y='count', data=nums, color=output.palette()[0])
    plt.ylabel('Number of users')
    plt.xlabel('Number of ratings per context')
    output.savefig('number_of_ratings')
Code example #14
def age_histogram(df_age):
    age_counts = df_age.groupby('age').age.count()

    y = age_counts.values
    x = [int(age) for age in age_counts.index]

    f, ax = plt.subplots(1,1, figsize=(12,8))
    sns.barplot(x=x, y=y, palette=sns.dark_palette('#008080', reverse=True, n_colors=60), linewidth=0)
    ax.set_ylabel('Postings')
    ax.set_xlabel('')
    ax.set_title('Histogram of Postings by Age')
    x_ticks = [0]
    x_ticks.extend(range(2,95, 5))
    x_ticklabels = ['']
    x_ticklabels.extend(range(20,95,5))
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels)
    #need to fix xlabels
    sns.despine(bottom=True, right=True)
    plt.xlim(-1, 90)  # sns.plt was removed from seaborn; use pyplot directly
    for i,p  in enumerate(ax.patches):
        height = p.get_height()
        if ((i+18) % 5 == 0) and (i+18 < 70): 
            ax.text(p.get_x()-1, height + 4, i+18, fontsize=18)

    plt.show()
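
Building a histogram by hand from value counts, as above, forces the tick-label surgery at the end. For a plain age histogram, sns.histplot (seaborn >= 0.11) keeps the numeric axis; a sketch with synthetic ages:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
df_age = pd.DataFrame({'age': rng.integers(18, 90, size=500)})

fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(x=df_age['age'], binwidth=1, ax=ax)   # numeric x-axis, no tick hacks
ax.set_xlabel('Age')
ax.set_ylabel('Postings')
ax.set_title('Histogram of Postings by Age')
sns.despine()
plt.show()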
Code example #15
def plot_heldout_prediction(input_vals, probs,
                            fname, n=10, title=""):
  """Save a PNG plot visualizing posterior uncertainty on heldout data.

  Args:
    input_vals: A `float`-like Numpy `array` of shape
      `[num_heldout] + IMAGE_SHAPE`, containing heldout input images.
    probs: A `float`-like Numpy array of shape `[num_monte_carlo,
      num_heldout, num_classes]` containing Monte Carlo samples of
      class probabilities for each heldout sample.
    fname: Python `str` filename to save the plot to.
    n: Python `int` number of datapoints to visualize.
    title: Python `str` title for the plot.
  """
  fig = figure.Figure(figsize=(9, 3*n))
  canvas = backend_agg.FigureCanvasAgg(fig)
  for i in range(n):
    ax = fig.add_subplot(n, 3, 3*i + 1)
    ax.imshow(input_vals[i, :].reshape(IMAGE_SHAPE[:-1]), interpolation="None")

    ax = fig.add_subplot(n, 3, 3*i + 2)
    for prob_sample in probs:
      sns.barplot(x=np.arange(10), y=prob_sample[i, :], alpha=0.1, ax=ax)
      ax.set_ylim([0, 1])
    ax.set_title("posterior samples")

    ax = fig.add_subplot(n, 3, 3*i + 3)
    sns.barplot(x=np.arange(10), y=np.mean(probs[:, i, :], axis=0), ax=ax)
    ax.set_ylim([0, 1])
    ax.set_title("predictive probs")
  fig.suptitle(title)
  fig.tight_layout()

  canvas.print_figure(fname, format="png")
  print("saved {}".format(fname))
Code example #16
File: plot.py Project: clarkfitzg/ballistics
def make_plots(groups):

    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")

    plt.clf()
    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")

    plt.clf()
    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")

    plt.clf()
    std = groups["standard"]
    std = std[std.notnull()]

    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])  # distplot is deprecated; sns.histplot is the modern replacement
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
Code example #17
File: exploreData.py Project: wrodezno/Bimbo
 def hbars(colrow,colcol,groupedData,tempcolors,title,ylab,xlab):
     #Input: colrow: Values alone the x axis
     #       colcol: Values along the y-axis
     #  groupedData: Pandas DataFrame
     #   tempcolors: Bar colors in plot
     #        title: Plot title
     #        ylab:  Y-label
     #        xlab:  X-label
     #Output: Horizontal Bar Plot with value labels at the end of each bar
     fig, ax = plt.subplots()                                                       #Plot Figure and axes handles
     fig.set_size_inches(14, 14)
     sns.despine()
     #draw once, on the Axes we just created (the original drew a second,
     #transposed plot on a stray figure and annotated the wrong one)
     ax = sns.barplot(x = colrow,y = colcol,data = groupedData,order = groupedData[colcol], color = tempcolors, ax = ax)
     plt.setp(ax.patches, linewidth=0)
     ax.set_title(title,fontsize = 16)
     ax.set_ylabel(ylab,fontsize = 15)
     ax.set_xlabel(xlab,fontsize = 15)
     for p in ax.patches:
         xpos = p.get_width()       #for horizontal bars the value is the bar width
         ypos = p.get_y()
         #thresholds reordered largest to smallest so every branch is reachable
         if xpos > 50:
             t = .01
         elif xpos > 15:
             t = .001
         elif xpos > 8:
             t = .008
         else:
             t = .1
         ax.text(xpos + t*xpos, ypos + .5, '%1.1f'%(xpos))
     return fig
Code example #18
def plot_avg_rank_all_models(P,split_type='balancedavg1',saveout=True):
    '''
    Generate bar plot of average rank (out of 64) of correct sketch category, by model, for a particular split.
    Wrapper around get_avg_rank_all_models, which itself wraps around get_avg_rank_across_samples.
    '''
    HU,H0U,H1U,MU,M0U,M1U,M2U,M3U = get_avg_rank_all_models(P,split_type=split_type)
    sns.set_context('talk')
    sns.set_style("ticks")
    fig = plt.figure(figsize=(4,8))
    ax = fig.add_subplot(111)
    U = pd.concat([HU,H0U,H1U,MU,M0U,M1U,M2U,M3U],axis=0)
    sns.barplot(data=U,
                x='adaptor',
                y='target_rank',
                ci='sd',
                order = ['human_combined_cost','human_S0_cost','human_combined_nocost',\
                         'multimodal_fc6_combined_cost', \
                         'multimodal_fc6_S0_cost','multimodal_fc6_combined_nocost',
                         'multimodal_conv42_combined_cost',\
                         'multimodal_pool1_combined_cost'])
    plt.ylabel('mean rank of congruent sketch')
    plt.ylim([1,32])
    xticklabels=['Context Cost Human','NoContext Cost Human','Context NoCost Human','Context Cost HighAdaptor',
                 'NoContext Cost HighAdaptor','Context NoCost HighAdaptor', 'Context Cost MidAdaptor',\
                 'Context Cost LowAdaptor']
    plt.xlabel('')
    l = ax.set_xticklabels(xticklabels, rotation = 90, ha="left")
    plt.tight_layout()
    if saveout:
        plt.savefig('./plots/avg_rank_all_models_{}.pdf'.format(split_type))
Code example #19
def plot_prop_congruent_all_models(P,split_type='balancedavg1',saveout=True):
    '''
    Generate bar plot of proportion of trials for which context-congruent sketch preferred over incongruent sketch.
    Wrapper around get_prop_congruent_all_models, which itself wraps around get_prop_congruent.
    '''
    HU,H0U,H1U,MU,M0U,M1U,M2U,M3U = get_prop_congruent_all_models(P,split_type=split_type)
    sns.set_context('talk')
    fig = plt.figure(figsize=(4,8))
    ax = fig.add_subplot(111)     
    D = pd.concat([HU,H0U,H1U,MU,M0U,M1U,M2U,M3U],axis=0)    
    sns.barplot(data=D,
                x='adaptor',
                y='sign_diff_rank',ci='sd')
    plt.axhline(y=0.5,linestyle='dashed',color='k')
    plt.ylim([0,1])
    plt.ylabel('proportion context-congruent sketch preferred')

    xticklabels=['Context Cost Human','NoContext Cost Human','Context NoCost Human','Context Cost HighAdaptor',
                 'NoContext Cost HighAdaptor','Context NoCost HighAdaptor', 'Context Cost MidAdaptor',\
                 'Context Cost LowAdaptor']
    plt.xlabel('')
    l = ax.set_xticklabels(xticklabels, rotation = 90, ha="left")
    plt.tight_layout()
    if saveout:
        plt.savefig('./plots/prop_congruent_all_models_{}.pdf'.format(split_type))
Code example #20
File: meme_processory.py Project: saketkc/bio-tricks
def main(argv):
    parser = argparse.ArgumentParser(description='Process meme files')
    parser.add_argument('-i', '--meme', metavar='<meme_out>', help='Meme input file', required=True)
    parser.add_argument('-m', '--motif', metavar='<motif_no>', help='Motif number', required=True, type=int)
    parser.add_argument('-c', '--phylo', metavar='<phylo_out>', help='PhyloP conservation scores', required=True)
    parsed = parser.parse_args(argv)
    handle = open(parsed.meme)
    records = motifs.parse(handle, 'meme')
    record = records[parsed.motif-1]
    phylo_data = csv.reader(open(parsed.phylo,'r'), delimiter='\t')
    phylo_scores = []
    for line in phylo_data:
        phylo_scores.append(float(line[2]))
    print "Motif length", record.length
    print "phylo length", len(phylo_scores)
    profile = position_wise_profile(record.counts, record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    motif_scores = []
    for position in max_occur:
        motif_scores.append(position[0][1])
    pr = pearsonr(np.array(motif_scores), np.array(phylo_scores))
    print('Pearson correlation: {}'.format(pr))
    fig, ax = plt.subplots()
    ax= sns.regplot(y=np.array(motif_scores), x=np.array(phylo_scores), scatter=True)
    ax.set(ylabel="Count of most freq nucleotide", xlabel="PhyloP scores", title='CTCF | pearsonr = {}, p-val={}'.format(pr[0],pr[1]));
    fig.savefig('{}_motif{}_scatter.png'.format(parsed.phylo, parsed.motif))
    x = np.linspace(1,len(phylo_scores)+1,num=len(phylo_scores), endpoint=False)
    f, (ax1, ax2) = plt.subplots(2, 1)
    x1 = sns.barplot(x=x, y=np.array(motif_scores), ax=ax1)
    x2 = sns.barplot(x=x, y=np.array(phylo_scores), ax=ax2)
    x1.set(ylabel='Counts of most freq nucleotide', xlabel='Position in motif')
    x2.set(ylabel='Phylop Score', xlabel='Position in motif')
    f.tight_layout()
    f.savefig('{}_motif{}_trend.png'.format(parsed.phylo, parsed.motif))
Code example #21
    def animate(i):
        df = grouped.get_group(keys[i]).sort_values("hour")

        print(df.head())
        ax.clear()
        sns.barplot(x="hour", y="lenMsgs", hue="sender", data=df, ax=ax)
        ax.set_title(i)
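
Only the frame callback survives in this fragment. One plausible wiring with matplotlib's FuncAnimation, using a synthetic chat log in place of the original grouped/keys/fig/ax context:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

rng = np.random.default_rng(2)
df = pd.DataFrame({
    'date': np.repeat(['Mon', 'Tue', 'Wed'], 48),
    'hour': np.tile(np.arange(24), 6),
    'sender': np.tile(np.repeat(['alice', 'bob'], 24), 3),
    'lenMsgs': rng.integers(0, 200, size=144),
})
grouped = df.groupby('date')
keys = list(grouped.groups)
fig, ax = plt.subplots()

def animate(i):
    frame = grouped.get_group(keys[i]).sort_values('hour')
    ax.clear()
    sns.barplot(x='hour', y='lenMsgs', hue='sender', data=frame, ax=ax)
    ax.set_title(str(keys[i]))

anim = FuncAnimation(fig, animate, frames=len(keys), interval=500)
plt.show()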
Code example #22
File: models_impact.py Project: thran/experiments2.0
def compare_more_models(experiments):
    labels = sorted(experiments.keys())

    results_d = pd.DataFrame(index=labels, columns=labels, dtype=float)
    results_s = pd.DataFrame(index=labels, columns=labels, dtype=float)
    results_p = pd.DataFrame(index=labels, columns=labels, dtype=float)
    for label1 in labels:
        for label2 in labels:
            d, s, p = compare_models(experiments[label1][0](label1), experiments[label2][0](label2),
                                     experiments[label1][1](label1), experiments[label2][1](label2), plot=False)
            results_d.loc[label2, label1] = d  # .loc instead of chained indexing
            results_s.loc[label2, label1] = s
            results_p.loc[label2, label1] = p

    df = pd.DataFrame(columns=["labels", "rmse"])
    for label in labels:
        r = Evaluator(experiments[label][0](label), experiments[label][1](label)).get_report()
        df.loc[len(df)] = (label, r["rmse"])

    plt.subplot(221)
    plt.title("Correlations of difficulties")
    sns.heatmap(results_d)
    plt.subplot(222)
    plt.title("Correlations of skills")
    sns.heatmap(results_s)
    plt.subplot(223)
    plt.title("Correlations of predictions")
    sns.heatmap(results_p)
    plt.subplot(224)
    sns.barplot(x="labels", y="rmse", data=df,)
Code example #23
File: wrong_answers.py Project: papousek/analysis
def plot_answer_frequency_all(wrong_only=True, contexts=20, show_names=False, normalize=True, top=5):
    plot_cols = 4 if contexts >= 20 else 2
    plot_rows = math.ceil(contexts / plot_cols)
    context_answers = get_context_answers()['count'].to_dict()
    data_all = prepare_answer_frequency_all()
    plot_contexts = sorted(data_all['group_name'].unique(), key=lambda c: -context_answers[c])[:contexts]
    data_all = data_all[data_all['group_name'].isin(plot_contexts)]
    if wrong_only:
        data_all = data_all[data_all['term_name_asked'] != data_all['term_name_answered']]
    if normalize:
        def _normalize(group):
            group['answer_frequency'] = group['answer_frequency'] / group['answer_frequency'].sum()
            return group
        data_all = data_all.groupby(['group_name', 'term_name_asked']).apply(_normalize)
    rcParams['figure.figsize'] = 7.5 * plot_cols, 5 * plot_rows
    for i, (group_name, data) in enumerate(data_all.groupby('group_name')):
        plt.subplot(plot_rows, plot_cols, i + 1)
        to_plot = defaultdict(list)
        for term, term_data in data.groupby('term_name_asked'):
            to_plot[term] = list(term_data['answer_frequency'].head(top).cumsum().sort_values(ascending=False, inplace=False))
        terms, terms_data = zip(*sorted(to_plot.items(), key=lambda x: x[1][-1], reverse=True))
        plt.title(group_name[:30])
        # fresh loop variable: reusing `i` clobbered the subplot counter above
        for j in range(top):
            sns.barplot(x=list(range(len(terms))), y=list(map(lambda x: ([0] * (top - len(x)) + x)[j], terms_data)), color=output.palette()[j])
        plt.xticks(plt.xticks()[0], terms, rotation=90)
    output.savefig(filename='answer_frequencies_all')
Code example #24
 def ModelsSummary(self, df1, df2, scoring_metric):
     if(scoring_metric == "balanced_accuracy"):
         scoring_metric = "CCR"
     res1 = self.reshapeDf(df1, "AUC")
     res2 = self.reshapeDf(df2, scoring_metric)
     res1 = res1.sort_values(by=['AUC'], ascending=False)
     res2 = res2.sort_values(by=[scoring_metric], ascending=False)
     ###################################################
     sns.set(style="whitegrid")
     fig, ax = plt.subplots(nrows=1,ncols=2,squeeze=False,sharex=False, sharey=True)
     fig.suptitle("Models Performance", fontsize=20)
     fig.tight_layout()
     fig.subplots_adjust(top=0.85)
     fig.set_figheight(6)
     fig.set_figwidth(14)
     ax[0,0].set_title("AUC",fontsize=15)
     ax[0,1].set_title(scoring_metric, fontsize=15)
     #ax[0,0].set_xlabel(xlabel="fsadf",fontsize=24)
     #ax[0,1].set_xlabel(xlabel="fsadf",fontsize=24)
     #sns.set_context("paper", rc={"font.size":15,"axes.titlesize":10,"axes.labelsize":20})
     #sns.set()
     sns.set_context("paper",font_scale=1.6)
     sns.barplot(x="Model", y="AUC", hue="Descriptor", data=res1, ax = ax[0,0])
     sns.barplot(x="Model", y=scoring_metric, hue="Descriptor", data=res2, ax = ax[0,1])
     fig.savefig(fname=self.out_df_path + "SB_models_performance_summary_both.png" , dpi=400 ,format="png")
     fig.clf()
     return res1, res2
Code example #25
File: hw2.py Project: grin3s/technosfera_dm_2
def plot_bar_counts(data):
    fig = plt.figure(figsize=(20,10))
    plt.yticks(fontsize=8)
    plt.xticks(rotation=90)
    sns.barplot(x=data.keys().values,y=data.values)
    plt.xlabel('')
    plt.ylabel('Number of jobs',fontsize=10)
Code example #26
File: storage_dashboard.py Project: rolandet/tools
def backline_esc_by_region(data, component_chart_dir, component):
  textcolor='black'
  palette=['#aad962','#fbbf45','#ef6a32']
  plt.clf()
  plt.rcParams['figure.figsize']=(25,15)
  f, ax = plt.subplots(3)
  ax[0].set_title("Total " + component + " BL Escalations by Regional COEs", fontsize=40)
  f.subplots_adjust(hspace=0.4)
  #f.tight_layout()
  for idx, comp in enumerate(["EMEA", "Americas", "APJ"]):
    if comp == "EMEA":
      regional_data = data[data['region'].isin(['Africa','Europe','GMT',''])]
    elif comp == "Americas":
      regional_data = data[data['region'].isin(['America','Atlantic'])]
    elif comp == "APJ":
      regional_data = data[data['region'].isin(['Australia','Asia','Pacific'])]
    regional_data = regional_data.groupby(['month']).sum().reset_index()
    #display(regional_data)
    if not regional_data.empty:
      sns.barplot(x="month", y="bl_esc_count", data=regional_data, color=palette[idx], ax=ax[idx], errcolor='None')

    ax[idx].set_ylabel(comp,fontsize=25, color=textcolor)
    ax[idx].yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax[idx].set_xlabel("")
    for label in ax[idx].get_xticklabels():
        label.set_rotation(15)
        label.set_color(textcolor)
    for label in ax[idx].get_yticklabels():
        label.set_color(textcolor)
  plt.savefig(component_chart_dir + '/total_regional_bl_esc.png')
  #plt.show()
  plt.close()
  return        
Code example #27
File: xgbparser.py Project: GuillaumeMohr/modelint
 def plot_parts(self, x, groups=None):
     """ Plots individual parameter importance
     Parameters
     ----------
     x: (p,) array
         input variables (p features)
     groups: dict
         group variables under a common name
     """
     if groups is None:
         groups = {}
     p, fp, b = self.predict(x)
     features = [''] * (len(self.feat_map) + 1)
     for f, i in self.feat_map.items():
         features[i] = f
     features[-1] = 'bias'
     parts = np.r_[fp, b]
     df = pd.DataFrame({'participation': parts,
                        'feature': features})
     df['group'] = df.feature.apply(lambda f: groups.get(f, f))
     df = df.groupby('group', as_index=False).sum()
     df['abs_participation'] = df['participation'].abs()
     sns.barplot(x='participation',
                 y='group',
                 data=df.sort_values('abs_participation',
                                     ascending=False))
Code example #28
File: plot_stats.py Project: FAB4D/agrigater
def simple_barplot(xlabel, ylabels, stype, df, filename, exts):
    ## how to have multiple y values? pass list?
    with sns.axes_style('ticks'):
        fig = plt.figure()
        df = complete_df(df, xlabel)
        if len(ylabels) > 1:
            df = tidy_df(df, xlabel, ylabels, stype)
            ylabel = 'missing percentage'
            plot = sns.barplot(x=xlabel, y='value', hue=stype, data=df)
        else:
            ylabel = ylabels[0]
            if stype == "coverage":
                plot = sns.barplot(x=xlabel, y=ylabels[0], data=df, color="green") #palette=sns.light_palette("green"))
            else:
                plot = sns.barplot(x=xlabel, y=ylabels[0], data=df)
        if stype in ["coverage", "nas"]:
            plot.set_ylim([0, 100])
        sns.despine()
        plt.ylabel(ylabel)
        if xlabel == "year":
            plt.setp(plot.get_xticklabels(), rotation=45)
        plt.xlabel(xlabel)
        fig.add_axes(plot)
        fig.tight_layout()
        ### TODO: call complete_df() and tidy_df()
        for ext in exts:
            save_plot(fig, outpath, filename, '.'+ext)
        plt.close()
    return
Code example #29
File: storage_dashboard.py Project: rolandet/tools
def owner_barcharts(data, component_chart_dir, multi_month=False):
  textcolor='black'
  palette=['#aad962','#fbbf45','#ef6a32']
  plt.clf()
  plt.rcParams['figure.figsize']=(25,15)
  f, ax = plt.subplots(3)
  for idx, comp in enumerate(["EMEA", "Americas", "APJ"]):
    if comp == "EMEA":
      regional_data = data[data['region'].isin(['Africa','Europe','GMT',''])]
    elif comp == "Americas":
      regional_data = data[data['region'].isin(['America','Atlantic'])]
    elif comp == "APJ":
      regional_data = data[data['region'].isin(['Australia','Asia','Pacific'])]

    if multi_month:
      sorted_hue = sorted(regional_data.month.unique())
      sns.barplot(x="name", y="total_count", hue="month", hue_order=sorted_hue, data=regional_data, color=palette[idx], ax=ax[idx])
    else:
      sns.barplot(x="name", y="total_count", data=regional_data, color=palette[idx], ax=ax[idx])

    ax[idx].set_ylabel(comp,fontsize=25, color=textcolor)
    ax[idx].yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
    ax[idx].set_xlabel("")
    for label in ax[idx].get_xticklabels():
        label.set_rotation(15)
        label.set_color(textcolor)
    for label in ax[idx].get_yticklabels():
        label.set_color(textcolor)
  plt.show()
  plt.close()
  return
Code example #30
File: MICAnalysis.py Project: Ernestyj/PyProj
def plotMICHist():
    f, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=ks, y=vs, palette="BuGn_d", ax=ax)
    ax.set_ylabel("MIC")
    plt.xticks(rotation=90)
    f.subplots_adjust(bottom=0.2)
    plt.show()
Code example #31
        axis=0)
    df_anova['choice'] = 1 - df_anova['choice']
    df_anova['qn'] = qn

    df_anova['choice'] = pd.to_numeric(df_anova['choice'])
    print('####### question: ' + q1 + ' #######')
    F, p = stats.f_oneway(df_anova['choice'][df_anova['experiment'] == 0],
                          df_anova['choice'][df_anova['experiment'] == 1],
                          df_anova['choice'][df_anova['experiment'] == 2])
    print('ANOVA: %.2f, %.4f' % (F, p))
    if p < .05:
        res = pairwise_tukeyhsd(df_anova['choice'], df_anova['experiment'])
        print(res)
    df_anova_qn = pd.concat((df_anova_qn, df_anova), axis=0)

    sns.barplot(data=df_anova_qn, x='qn', y='choice', hue='experiment')

questions_df_multilinear = pd.melt(questions_df,
                                   id_vars=['q'],
                                   value_vars=questions_df.columns[:-1],
                                   var_name='experiment',
                                   value_name='percentage')
questions_df_multilinear.to_csv('data/paper/00questions_df_multilinear.csv')

### plot the regression lines for all the frequencies
figure, ax = plt.subplots(1, 1)
for x in questions_df.columns:
    if x != 'q':
        sns.regplot(x='q', y=x, data=questions_df, label=x, ax=ax)
ax.set_ylabel('Prefer towards rational')
plt.legend()
Code example #32
File: main.py Project: padhidebasish5/CitiHackathon
HT_regular = hashtag_extract(train['tweet'][train['label'] == 0])

# extracting hashtags from racist/sexist tweets
HT_negative = hashtag_extract(train['tweet'][train['label'] == 1])

# unnesting list
HT_regular = sum(HT_regular, [])
HT_negative = sum(HT_negative, [])

a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})

# selecting top 20 most frequent hashtags
d = d.nlargest(columns="Count", n=20)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()

a = nltk.FreqDist(HT_negative)
d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})

# selecting top 20 most frequent hashtags
d = d.nlargest(columns="Count", n=20)
plt.figure(figsize=(16, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.show()

# tokenizing the words present in the training set
tokenized_tweet = train['tweet'].apply(lambda x: x.split())
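
The regular/negative blocks above differ only in their input list, so they factor naturally into a helper; plot_top_hashtags below is a name introduced for this sketch:

import nltk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_top_hashtags(hashtags, n=20):
    """Bar plot of the n most frequent hashtags in a flat list."""
    freq = nltk.FreqDist(hashtags)
    d = pd.DataFrame({'Hashtag': list(freq.keys()),
                      'Count': list(freq.values())})
    d = d.nlargest(columns='Count', n=n)
    plt.figure(figsize=(16, 5))
    ax = sns.barplot(data=d, x='Hashtag', y='Count')
    ax.set(ylabel='Count')
    plt.show()

plot_top_hashtags(['#ml', '#ml', '#data', '#ai', '#ml', '#ai'], n=3)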
Code example #33
    label[label.isin(['WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])], 'subject'
])['duration'].count() * 1.28)
duration_df = pd.DataFrame(duration_df)

# Sort the values of duration
plot_data = duration_df.reset_index().sort_values('duration', ascending=False)
plot_data['Activity'] = plot_data['Activity'].map({
    'WALKING_UPSTAIRS':
    'Upstairs',
    'WALKING_DOWNSTAIRS':
    'Downstairs'
})

# Plot the durations for staircase use
plt.figure(figsize=(15, 5))
sns.barplot(data=plot_data, x='subject', y='duration', hue='Activity')
plt.title('Participants Compared By Their Staircase Walking Duration')
plt.xlabel('Participants')
plt.ylabel('Total Duration [s]')
plt.show()

# --------------
#exclude the Activity column and the subject column
feature_cols = data.columns[:-2]

#Calculate the correlation values
correlated_values = data[feature_cols].corr()
#stack the data and convert to a dataframe

correlated_values = (correlated_values.stack().to_frame().reset_index().rename(
    columns={
Code example #34
print(diabetes_df.head())

diabetes_df.info()
diabetes_df.isnull().sum()

corr = diabetes_df.corr()
print(corr)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

plt.subplots(figsize=(18, 15))
plt.subplot(4, 3, 1)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
sns.countplot(x='Outcome', data=diabetes_df)
plt.subplot(4, 3, 2)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
sns.barplot(x='Outcome', y='Age', data=diabetes_df)
plt.show()

#data analysis
columns = diabetes_df.columns[:8]
# print(columns)
plt.subplots(figsize=(18, 15))
length = len(columns)
for i, j in itertools.zip_longest(columns, range(length)):
    plt.subplot(length // 2, 3, j + 1)  # integer division: subplot counts must be ints
    plt.subplots_adjust(wspace=0.2, hspace=0.5)
    diabetes_df[i].hist(bins=20, edgecolor='black')
    plt.title(i)
plt.show()

#analysis of diabetic classes
Code example #35
# drop Parch & SibSp
data_train = data_train.drop(['SibSp', 'Parch'], axis=1)
data_test = data_test.drop(['SibSp', 'Parch'], axis=1)

import seaborn as sns
sns.set_style('whitegrid')
get_ipython().magic(u'matplotlib inline')
# plot
fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5))

sns.countplot(x='Family', data=data_train, order=[1, 0], ax=axis1)

family_perc = data_train[["Family",
                          "Survived"]].groupby(['Family'],
                                               as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1, 0], ax=axis2)

axis1.set_xticklabels(["With Family", "Alone"], rotation=0)

# In[ ]:

#cabin
#see how having a Cabin value or not affects the survival distribution
fig = plt.figure(figsize=(13, 7))
fig.set(alpha=0.5)  # set the chart's alpha parameter

Survived_cabin = data_train.Survived[pd.notnull(
    data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(
    data_train.Cabin)].value_counts()
df = pd.DataFrame({
Code example #36
        if name in acc_dict:
            acc_dict[name] += acc
        else:
            acc_dict[name] = acc
        # print '{0}: {1}'.format(name, acc * 100)

for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / 10.0
    log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
    log = pd.concat([log, log_entry], ignore_index=True)  # DataFrame.append was removed in pandas 2.0

plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
# plt.show()
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
from operator import itemgetter

sorted_dict = sorted(acc_dict.items(), key=itemgetter(1), reverse=True)

for k, v in sorted_dict:
    print "{0}-{1:.2%}".format(k, v)

ntrain = X_train.shape[0]
ntest = y_test.shape[0]
SEED = 0  # for reproducibility
NFOLDS = 5  # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED)  # NOTE: pre-0.18 scikit-learn API; newer versions use KFold(n_splits=...).split(X)

class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
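
The log.append(...) accumulation earlier in this example was removed from pandas in 2.0; the idiomatic replacement collects rows and concatenates once. A tiny sketch with made-up scores:

import pandas as pd

acc_dict = {'SVC': 0.82, 'RandomForest': 0.87, 'KNN': 0.79}  # made-up scores

rows = [pd.DataFrame([[name, acc]], columns=['Classifier', 'Accuracy'])
        for name, acc in acc_dict.items()]
log = pd.concat(rows, ignore_index=True)
print(log)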
Code example #37
# ax=sns.barplot(x="smoothing_window(hour)", y="RMSE", hue="weather_data_used", data=tmp, ax=ax )
# ax.set_title("PPA data")
#
# ax = fig.add_subplot(122)
# tmp=df[df["test_set"]=="EPA"]
# ax=sns.barplot(x="smoothing_window(hour)", y="RMSE", hue="weather_data_used", data=tmp, ax=ax )
# ax.set_title("EPA data")
#
# plt.show()

fig = plt.figure(figsize=(20, 10))

ax = fig.add_subplot(121)
tmp = df[df["weather_data_used"] == True]
ax = sns.barplot(x="smoothing_window(hour)",
                 y="RMSE",
                 hue="test_set",
                 data=tmp,
                 ax=ax)
ax.set_title("with weather data")

ax = fig.add_subplot(122)
tmp = df[df["weather_data_used"] == False]
ax = sns.barplot(x="smoothing_window(hour)",
                 y="RMSE",
                 hue="test_set",
                 data=tmp,
                 ax=ax)
ax.set_title("no weather data")

plt.show()
Code example #38
df.isnull().sum()

# # Visualization

# In[91]:

import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# In[92]:

sns.barplot(x=df['Gender'],
            y=df['Loan_Status'],
            data=df,
            label="Relationship among Gender and Loan Approval Status",
            ci=None)

# In[93]:

sns.barplot(x=df["Married"],
            y=df['Loan_Status'],
            data=df,
            label="Relationship among Gender and Loan Approval Status",
            ci=None)

# In[94]:

sns.catplot(x="Married",
            y="ApplicantIncome",
Code example #39
# %%
import graphviz

with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)
# %%
import seaborn as sns
import numpy as np

print("Feature importances:\n{0}".format(
    np.round(dt_clf.feature_importances_, 3)))
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print("{0} : {1:.3f}".format(name, value))

sns.barplot(x=dt_clf.feature_importances_, y=iris_data.feature_names)
sns.despine()

# %%
import pandas as pd
import matplotlib.pyplot as plt

feature_name_df = pd.read_csv('./human_activity/features.txt',
                              sep=r'\s+',
                              header=None,
                              names=['column_index', 'column_name'])


def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(
        data=old_feature_name_df.groupby('column_name').cumcount(),
Code example #40
File: kernel_Titanic.py Project: nbrrawal/Kaggle
    train_predict = cl.predict(a_test)
    acct = accuracy_score(b_test, train_predict)
    if name in acct_dict: 
        acct_dict[name] += acct 
    else : 
        acct_dict[name] = acct 
    
for cl in acct_dict: 
    acct_dict[cl] = acct_dict[cl]/10.0
    log_entry = pd.DataFrame([[cl, acct_dict[cl]]], columns=["Classifier", "Accuracy"])
    log = pd.concat([log, log_entry], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')
sns.set_color_codes('muted')
sns.barplot(x="Accuracy", y ="Classifier", data=log, color="b")

Code example #41
def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None,
                               plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    A function to train a variety of classification models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type

    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                            'catboost_metric_name': 'AUC',
                            'sklearn_scoring_function': metrics.roc_auc_score},
                    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros((len(X), len(set(y.values))))

    # averaged predictions on train data
    prediction = np.zeros((len(X_test), oof.shape[1]))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators,
                                       eval_metric=metrics_dict[eval_metric]['catboost_metric_name'], **params,
                                       loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid
        scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid[:, 1]))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance

    return result_dict
Code example #42
   
#Algorithm Random Forest
#Visualize important features
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
pd.set_option('display.max_rows', 350)
pd.set_option('display.max_columns', 350)
plt.style.use('ggplot')

feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)

# Creating a bar plot, displaying only the top k features
k = 20
sns.barplot(x=feature_imp[:k], y=feature_imp.index[:k])
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

# List top k important features
k=20
feature_imp.sort_values(ascending=False)[:k]

#Algorithm Random Forest
#Select the top important features, set the threshold
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.03
Code example #43
#in the previous operation, date and channel ended up in the index
dateChannel_data = dateChannel_data.reset_index(
)  #recreate the date and channel columns
dateChannel_data['Année'] = dateChannel_data['date'].astype(
    str).str[:4]  #add the year
dateChannel_data.info()
dateChannel_data.sort_values(by=['date', 'channel'])
dateChannel_data.head(20)
##########################################################################
# Overall bar chart: traffic split across channels.
##########################################################################
sns.set()  #aesthetic defaults, similar to ggplot
fig, ax = plt.subplots()  #a single plot
sns.barplot(x='channel',
            y='pageviews',
            data=dateChannel_data,
            estimator=sum,
            order=sorted(dfPVChannel['channel'].unique()))
fig.suptitle(
    "The 'search' channel is the top traffic contributor.",
    fontsize=14,
    fontweight='bold')
ax.set(
    xlabel="Channel",
    ylabel="Page views",
    title="The catch-all 'direct' channel is unfortunately large as well."
)
fig.text(.35,
         -.03,
         "Global traffic - page views by channel since 2011",
         fontsize=9)
Code example #44
File: plotting.py Project: lytb123/fitbit-analyzer
def plot(data, columns, measureName, nrows, ncols, order=None):
    f, axes = plt.subplots(nrows=nrows, ncols=ncols)
    axes = axes.reshape(-1)
    for i, c in enumerate(columns):
        sns.barplot(x=measureName, y=c, data=data, order=order, ax=axes[i])
    plt.show()  # sns.plt was removed from seaborn; use pyplot directly
Code example #45
print(countries_df.head())
analysis_dict = {'variables': list(countries_df.columns.values),
                 'count': list(countries_df.count().values),
                 'v_types': list(countries_df.dtypes.values),
                 'n_null': list(countries_df.isnull().sum().values),
                 'n_uniques': list(countries_df.nunique().values)}

analysis = pd.DataFrame(analysis_dict)
print(analysis)

brazil_land_type_df = countries_df.loc[countries_df['countryName'] == 'Brazil', ['year', 'cropLand', 'grazingLand', 'forestLand']].set_index('year')
ax1 = sns.lineplot(data=brazil_land_type_df, ci=None, legend='brief')
ax1.set_title('Brazil ground types areas evolution over the years')
ax1.set_ylabel('Areas (gha)')
ax1.set_xlabel('Years')
plt.show()

fishingGround_df = countries_df.loc[(countries_df['year'] == 2016) & (countries_df['record'] == 'AreaTotHA'), ['countryName', 'fishingGround']].nlargest(40, 'fishingGround')
ax2 = sns.barplot(x='fishingGround', y='countryName', data=fishingGround_df)
ax2.set_title('2016 world top40 fishing ground')
ax2.set_xlabel('Fishing Ground (gha)')
plt.show()

portugal_area_df = countries_df.loc[(countries_df['countryName'] == 'Portugal') & (countries_df['year'] == 2016) & (countries_df['record'] == 'AreaTotHA'), ['cropLand', 'grazingLand', 'forestLand']].T
ax3 = portugal_area_df.plot.pie(y=160552, title='Portugal 2016 ground types area', legend=False)
ax3.set_ylabel('')
plt.show()


Code example #46
# In[8]:

#Let's find the columns with any null objects and plot them based on their count
data_na = train.isnull().sum()
data_na = data_na[data_na > 0]
data_na = data_na.to_frame()
data_na.columns = ['count']
data_na.index.names = ['name']
data_na['name'] = data_na.index

# In[9]:

#Plotting a bar plot of number null objects in each column
plt.figure(figsize=(25, 8))
sns.set(style='ticks')
sns.barplot(x='name', y='count', data=data_na)
plt.show()

# In[10]:

#The numerical features of data. We will used the median of each and group movies based on production companies.
train.select_dtypes(include=[np.number]).columns

# In[11]:

#First drop columns with more than 80% null values
train = train.dropna(thresh=0.80 * len(train), axis=1)

# In[12]:

#Check now to see which columns still have null values
Code example #47
cm_6=confusion_matrix(y_true=y_test,y_pred=model.predict(X_test))
acc_6=accuracy_score(y_test,model.predict(X_test))
dic["XG Boost"]=acc_6

#Analysing the results of the Results
Estimators=[]
Accuracy=[]
for i in dic:
    Estimators.append(i)
    Accuracy.append(dic[i]*100)
d={'Estimators':Estimators,"Accuracy":Accuracy}
df=pd.DataFrame(data=d)
plt.figure(num=3)
plt.ylim(0,100)
plt.title("All classification estimators with accuracy score")
sns.barplot(x='Estimators',y='Accuracy',data=df)
plt.show()


#Finalizing the model
model=tree_final
print("Enter the Value of X acceleration: ")
x=int(input())
print("Enter the Value of Y acceleration: ")
y=int(input())
f=sc.transform(np.array([[x,y]]))
x=f[0][0]
y=f[0][1]
t=model.predict(np.array([[x,y]]))
if(t==0):
    print("The phone has fallen")
Code example #48
Y=c1.iloc[:,10]
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5,random_state=0)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', random_state = 0)
classifier.fit(X_train,Y_train)
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
pred=classifier.predict(X_test)
accuracy=accuracy_score(Y_test,pred)
print("Accuracy:",accuracy*100)
cm=confusion_matrix(Y_test,pred)
cm
cls1=classification_report(Y_test,pred)
print(cls1)
sns.pairplot(c1)
classifier.estimators_
sns.heatmap(cm, annot=True)
import graphviz 
dot_data = tree.export_graphviz(classifier.estimators_[0], out_file='tree.dot')  #export one tree from the forest ('clf' was undefined here)
classifier.feature_importances_
featureimp=pd.Series(classifier.feature_importances_).sort_values(ascending=True)
print(featureimp)
sns.barplot(x=round(featureimp,4), y=featureimp.index)  #y is the feature index, not the values again
plt.xlabel("Feature importance")
plt.show()
Code example #49
0
File: rdm_compare.py Project: Brinkmak/URIAL
def rdm_compare(rdms, models, comp=None, plot=None):
    '''Compare target RDMs to model RDMs (representational dissimilarity matrices).

    rdms/models are either lists of DataFrames or paths to pickled dicts with
    'rdm' and 'id' keys; comp selects the correlation ('spearman' (default),
    'kendalltaua' or 'pearson'); plot can be None, 'bar' or 'violin'.
    Returns the group-average RDM, a participant-by-model comparison frame,
    and a matrix of distances between all RDMs.
    '''

    import pandas as pd
    from scipy.spatial import distance
    from nilearn.connectome import sym_matrix_to_vec, vec_to_sym_matrix
    from scipy.stats import rankdata, spearmanr, kendalltau, pearsonr, mstats
    import numpy as np
    from itertools import combinations
    import pickle
    import seaborn as sns
    import matplotlib.pyplot as plt
    import copy

    if isinstance(rdms, str):
        with open(rdms, 'rb') as f:
            dict_rdms = pickle.load(f)
        target_rdms = copy.deepcopy(dict_rdms['rdm'])
        target_conds = target_rdms[0].keys()
    else:
        target_rdms = rdms
        target_conds = rdms[0].keys()

    if isinstance(models, str):
        with open(models, 'rb') as f:
            dict_models = pickle.load(f)
            models = dict_models['rdm']
            model_ids = dict_models['id']
    else:
        models = models  # no-op; note that the code below still relies on dict_models/model_ids from the pickle branch

    for rdm in dict_models['rdm']:
        if 'Unnamed: 0' in rdm:
            del rdm['Unnamed: 0']

    for index, rdm in enumerate(target_rdms):
        target_rdms[index] = target_rdms[index].to_numpy()  # .as_matrix() was removed in pandas 1.0

    list_cor_rdm = list(range(0, len(target_rdms)))
    list_p = list(range(0, len(target_rdms)))
    target_rdms_trans = list(range(0, len(target_rdms)))

    if comp is None or comp == 'spearman':
        for index, rdm in enumerate(target_rdms):
            target_rdms_trans[index] = vec_to_sym_matrix(rankdata(sym_matrix_to_vec(rdm)))
        rdm_avg = pd.DataFrame(np.mean(target_rdms_trans, axis=0), columns=target_conds)

        for index, part_rdm in enumerate(target_rdms_trans):
            list_cor_rdm[index], list_p[index] = spearmanr(part_rdm.flatten(), rdm_avg.to_numpy().flatten())

        list_cor_sub = list()
        list_cor_rdm_sub = list()
        list_p_sub = list()

        for index, part in enumerate(target_rdms_trans):
            tmp_rdms = target_rdms_trans.copy()
            tmp_part = target_rdms_trans[index]
            tmp_rdms.pop(index)
            tmp_rdm_avg = np.mean(tmp_rdms, axis=0)
            list_cor_sub.append(spearmanr(tmp_part.flatten(), tmp_rdm_avg.flatten()))

        for i, cor in enumerate(list_cor_sub):
            list_cor_rdm_sub.append(cor.correlation)
            list_p_sub.append(cor.pvalue)

    elif comp == 'kendalltaua':
        # note: scipy's kendalltau computes tau-b by default, despite this option's name
        for index, rdm in enumerate(target_rdms):
            target_rdms_trans[index] = vec_to_sym_matrix(rankdata(sym_matrix_to_vec(rdm)))
        rdm_avg = pd.DataFrame(np.mean(target_rdms, axis=0), columns=target_conds)

        for index, part_rdm in enumerate(target_rdms):
            list_cor_rdm[index], list_p[index] = kendalltau(part_rdm.flatten(), rdm_avg.to_numpy().flatten())

        list_cor_sub = list()
        list_cor_rdm_sub = list()
        list_p_sub = list()

        for index, part in enumerate(target_rdms):
            tmp_rdms = target_rdms.copy()
            tmp_part = target_rdms[index]
            tmp_rdms.pop(index)
            tmp_rdm_avg = np.mean(tmp_rdms, axis=0)
            list_cor_sub.append(kendalltau(tmp_part.flatten(), tmp_rdm_avg.flatten()))

        for i, cor in enumerate(list_cor_sub):
            list_cor_rdm_sub.append(cor.correlation)
            list_p_sub.append(cor.pvalue)

    elif comp == 'pearson':
        for index, rdm in enumerate(target_rdms):
            target_rdms_trans[index] = vec_to_sym_matrix(mstats.zscore(sym_matrix_to_vec(rdm)))
        rdm_avg = pd.DataFrame(np.mean(target_rdms_trans, axis=0), columns=target_conds)

        for index, part_rdm in enumerate(target_rdms_trans):
            list_cor_rdm[index], list_p[index] = pearsonr(part_rdm.flatten(), rdm_avg.to_numpy().flatten())

        list_cor_sub = list()
        list_cor_rdm_sub = list()
        list_p_sub = list()

        for index, part in enumerate(target_rdms_trans):
            tmp_rdms = target_rdms_trans.copy()
            tmp_part = target_rdms_trans[index]
            tmp_rdms.pop(index)
            tmp_rdm_avg = np.mean(tmp_rdms, axis=0)
            list_cor_sub.append(pearsonr(tmp_part.flatten(), tmp_rdm_avg.flatten()))

        for i, cor in enumerate(list_cor_sub):
            list_cor_rdm_sub.append(cor[0])
            list_p_sub.append(cor[1])

    upper_noise_ceiling = np.mean(list_cor_rdm)
    lower_noise_ceiling = np.mean(list_cor_rdm_sub)

    model_comp = pd.DataFrame(columns=['participant', 'models', 'cor'],
                              index=np.arange(len(dict_models['id']) * len(dict_rdms['id'])))
    model_comp['participant'] = dict_rdms['id'] * len(dict_models['id'])
    model_comp['models'] = sorted(dict_models['id'] * len(dict_rdms['id']))

    list_cor_models = list()

    snd_rdms = list()
    snd_rdms.append(rdm_avg.to_numpy())
    for mod_rdm in models:
        snd_rdms.append(mod_rdm.to_numpy())

    ids_rdms = list()
    ids_rdms.append('group average')
    for mod_ids in model_ids:
        ids_rdms.append(mod_ids)

    if comp is None or comp == 'spearman':
        for model_rdm in dict_models['rdm']:
            for sub_rdm in target_rdms_trans:
                list_cor_models.append(spearmanr(sub_rdm.flatten(), model_rdm.to_numpy().flatten()).correlation)
        # the pairwise RDM distances only need to be computed once, not on every loop iteration
        rdms_dist = [spearmanr(x.flatten(), y.flatten()).correlation for x, y in combinations(snd_rdms, 2)]
        rdms_dist = pd.DataFrame(distance.squareform(rdms_dist), columns=ids_rdms)
    elif comp == 'kendalltaua':
        for model_rdm in dict_models['rdm']:
            for sub_rdm in target_rdms:
                list_cor_models.append(kendalltau(sub_rdm.flatten(), model_rdm.to_numpy().flatten()).correlation)
        rdms_dist = [kendalltau(x.flatten(), y.flatten()).correlation for x, y in combinations(snd_rdms, 2)]
        rdms_dist = pd.DataFrame(distance.squareform(rdms_dist), columns=ids_rdms)
    elif comp == 'pearson':
        for model_rdm in dict_models['rdm']:
            for sub_rdm in target_rdms_trans:
                list_cor_models.append(pearsonr(sub_rdm.flatten(), model_rdm.to_numpy().flatten())[0])
        rdms_dist = [pearsonr(x.flatten(), y.flatten())[0] for x, y in combinations(snd_rdms, 2)]
        rdms_dist = pd.DataFrame(distance.squareform(rdms_dist), columns=ids_rdms)

    model_comp['cor'] = list_cor_models

    if plot is None:
        print('results will not be plotted')
    elif plot == 'bar':
        ax = sns.barplot(x=model_comp['models'], y=model_comp['cor'], data=model_comp)
        plt.plot(np.linspace(-20, 120, 1000), [upper_noise_ceiling] * 1000, 'r', alpha=0.1)
        plt.plot(np.linspace(-20, 120, 1000), [lower_noise_ceiling] * 1000, 'r', alpha=0.1)
        rect = plt.Rectangle((-20, lower_noise_ceiling), 10000, (upper_noise_ceiling - lower_noise_ceiling), color='r',
                             alpha=0.5)
        ax.set_xticklabels(labels=list(dict_models['id']))
        if comp is None or comp == 'spearman':
            ax.set(ylabel='spearman correlation with target RDM')
        if comp == 'pearson':
            ax.set(ylabel='pearson correlation with target RDM')
        if comp == 'kendalltaua':
            ax.set(ylabel='kendall tau a correlation with target RDM')
        ax.add_patch(rect)
        plt.tight_layout()
    elif plot == 'violin':
        ax = sns.violinplot(x=model_comp['models'], y=model_comp['cor'], data=model_comp)
        plt.plot(np.linspace(-20, 120, 1000), [upper_noise_ceiling] * 1000, 'r', alpha=0.1)
        plt.plot(np.linspace(-20, 120, 1000), [lower_noise_ceiling] * 1000, 'r', alpha=0.1)
        rect = plt.Rectangle((-20, lower_noise_ceiling), 10000, (upper_noise_ceiling - lower_noise_ceiling), color='r',
                             alpha=0.5)
        ax.set_xticklabels(labels=list(dict_models['id']))
        if comp is None or comp == 'spearman':
            ax.set(ylabel='spearman correlation with target RDM')
        if comp == 'pearson':
            ax.set(ylabel='pearson correlation with target RDM')
        if comp == 'kendalltaua':
            ax.set(ylabel='kendall tau a correlation with target RDM')
        ax.add_patch(rect)
        plt.tight_layout()

    return rdm_avg, model_comp, rdms_dist
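
# A hypothetical invocation, assuming two pickles that each hold a dict with
# 'rdm' (a list of DataFrames) and 'id' (a list of labels); file names are illustrative:
import matplotlib.pyplot as plt
rdm_avg, model_comp, rdms_dist = rdm_compare('target_rdms.pkl', 'model_rdms.pkl', comp='spearman', plot='bar')
plt.show()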
Code example #50
0
# Assumes grps (group labels), df (a groups x replicates accuracy frame), and an
# empty res_df and keys were set up earlier, along with:
# from scipy.stats import mannwhitneyu
# from statsmodels.stats.multitest import multipletests
for g1 in grps:
    for g2 in grps[grps.index(g1) + 1:]:
        if g1 != g2:
            keys.append(str(g1 + '_' + g2))
            x = list(df.loc[g1, :])
            y = list(df.loc[g2, :])
            res = mannwhitneyu(x, y, alternative='two-sided')
            # DataFrame.append was removed in pandas 2.0; concat is the drop-in replacement
            res_df = pd.concat(
                [res_df, pd.DataFrame([{'statistic': res[0], 'p-value': res[1]}])],
                ignore_index=True)
res_df = res_df.set_index(pd.Index(keys, name='Tissue'))  # name must be a keyword: pd.Index's second positional arg is dtype
corrected_p_values = multipletests(res_df['p-value'])[1]
res_df['cor_p-value'] = pd.Series(corrected_p_values, index=keys)
res_df = res_df.sort_values(by='cor_p-value')
value_cols = list(df.columns)  # capture the replicate columns before adding summary columns
df['Average Accuracy'] = df[value_cols].mean(axis=1)
df['sdev'] = df[value_cols].std(axis=1)  # computing std after adding the mean column would bias it
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=df.index.values,
                 y=df['Average Accuracy'],
                 yerr=df['sdev'] * 1,  # seaborn forwards unknown kwargs such as yerr to plt.bar
                 capsize=.2)
x = ax.set_title("Average Model Accuracies")
x = ax.set_xlabel("Tissues")
x = ax.set_ylabel("Average Accuracy (± 1 SD)")
x = ax.set_xticklabels(labels=df.index.values, rotation=38)
fig = ax.get_figure()
fig.savefig("plots/all_model_accuracy.png", dpi=100, bbox_inches="tight")
from IPython.display import display, HTML
display(res_df.head(10))
Code example #51
0
# In[ ]:

m = sns.distplot(dataset["Fare"],
                 color="r",
                 label="Skewness : %.2f" % (dataset["Fare"].skew()))
m.legend(loc="best")

# skewness is reduced

# ### 3.2 Categorical values

# #### Sex

# In[ ]:

g = sns.barplot(x="Sex", y="Survived", data=train)
g = g.set_ylabel("Survival Probability")

# Females have a much higher survival rate than males

# In[ ]:

# See the two groups data ratio
train[["Sex", "Survived"]].groupby('Sex').mean()

# It clearly shows that females had a better chance of survival than males,
# so Sex will play an important role in predicting survival.
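# The same ratios can be read off a crosstab; a small sketch (assuming pandas is imported as pd):
print(pd.crosstab(train["Sex"], train["Survived"], normalize="index"))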

# #### Pclass

# In[ ]:
Code example #52
0
####################################
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))

sns.scatterplot(x=diamond['cut'], y=diamond['price'], color='r', ax=ax[0])

sns.scatterplot(x=diamond['color'], y=diamond['price'], color='y', ax=ax[1])

sns.scatterplot(x=diamond['clarity'], y=diamond['price'], color='g', ax=ax[2])
plt.tight_layout()
plt.show()
# %% [markdown]
# ### 3.2.3 Bar plot
# %%
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))

sns.barplot(x=diamond['cut'], y=diamond['price'], color='r', ax=ax[0])

sns.barplot(x=diamond['color'], y=diamond['price'], color='y', ax=ax[1])
sns.barplot(x=diamond['clarity'], y=diamond['price'], color='g', ax=ax[2])

# %% [markdown]
# Insight:<br>
# 1. cut level 4 commands the best average price;<br>
# 2. color level 1 has the best average price, which is odd, because the worst color sells at the highest price.<br>
#    Probably most customers find it hard to tell which color is better;<br>
# 3. clarity level 2 has the best average price, which is also odd, as clarity 2 is not a good level.

# %%
print(diamond['price'][diamond['color'] == 5].mean())
print(diamond['carat'][diamond['color'] == 5].mean())
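# %%
# A broader look at the same oddity (a sketch on the same diamond frame): carat is a
# plausible confounder if mean carat tracks the "worse" color levels
print(diamond.groupby('color')[['price', 'carat']].mean())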
# %%
Code example #53
0
#Adding a grid for better visualization
plt.grid()

#%%
#Visualization - 2

sns.set(style="white", context="talk")

# Set up the matplotlib figure
f2, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(7, 5), sharex=True)

# Pasture data for corridor 1

x = [60, 90, 120]
y1 = y_past_1
sns.barplot(x=x, y=y1, palette="Reds", ax=ax1).set_title('Corridor 1')
ax1.axhline(0, color="k", clip_on=False)
ax1.set_ylabel("Pasture")

# Agriculture data for corridor 1

x = [60, 90, 120]
y1 = y_agri_1
sns.barplot(x=x, y=y1, palette="Reds", ax=ax2)
ax2.axhline(0, color="k", clip_on=False)
ax2.set_ylabel("Agriculture")

# Data for the third panel of corridor 1

x = [60, 90, 120]
y1 = y_exp_s_1
Code example #54
0
File: analy.py Project: heavy-snowy/python
zf = pd.DataFrame(zf, columns=columns)

# # Re-examine the dataset
display(zf.head(n=2))

# Group the listings by district to compare their counts and per-square-metre prices
df_house_count = zf.groupby('Region')['price'].count().sort_values(
    ascending=False).to_frame().reset_index()
df_house_mean = zf.groupby('Region')['perPrice'].mean().sort_values(
    ascending=False).to_frame().reset_index()

f, [ax3, ax1, ax2] = plt.subplots(3, 1, figsize=(20, 15))

sns.barplot(x='Region',
            y='perPrice',
            palette="Blues_d",
            data=df_house_mean,
            ax=ax1)
ax1.set_title('Rent per square metre in each Shenzhen district', fontsize=15)
ax1.set_xlabel('region', fontsize=12)
ax1.tick_params(axis='x', rotation=80)  # rotate the x tick labels (rotation belongs to the ticks, not set_xlabel)
ax1.set_ylabel('unit price')

sns.barplot(x='Region',
            y='price',
            palette="Greens_d",
            data=df_house_count,
            ax=ax2)
ax2.set_title('Number of rental listings in each Shenzhen district', fontsize=15)
ax2.set_xlabel('region')
ax2.set_ylabel('quantity')
Code example #55
0
plt.plot(job_admin.keys(), job_admin.values, label="admin")
plt.plot(job_technician.keys(), job_technician.values, label="technician")
plt.plot(job_blue.keys(), job_blue.values, label="blue-collar")
plt.plot(job_entrepreneur.keys(),
         job_entrepreneur.values,
         label="entrepreneur")
plt.plot(job_management.keys(), job_management.values, label="management")
plt.plot(job_retired.keys(), job_retired.values, label="retired")
plt.ylabel("No of employees")
plt.xlabel("Age range")
plt.title("Jobs VS age (Fig3)")
plt.legend()
plt.show()

# Bar plot for Job and salary using seaborn
sns.barplot(x="job", y="salary", hue="marital", data=data)
plt.xticks(rotation=45)
plt.title("Jobs and salaries (Fig4)")
plt.show()

# pie chart for job
plt.pie(data["job"].value_counts().values,
        autopct='%1.2f%%',
        labels=data["job"].value_counts().keys())
plt.title("Pie chart of Job (Fig 5)")
plt.show()

# Scatter plot between salary and age
plt.scatter(data["salary"], data["age"], color="red", alpha=0.5)
plt.xlabel("salary")
plt.ylabel("age")
Code example #56
0
File: Kernel.py Project: chfenix/kaggle
data_train["Embarked"].fillna(data_train["Embarked"].mode().iloc[0],
                              inplace=True)

print("==================Feture Fill End===================")
print(data_train.info())

# Check whether survival probability varies with each feature
plt.figure(figsize=(15, 10))
view_feature = ["Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked"]
for i, feature_name in enumerate(view_feature):
    plt.subplot(2, 3, (i + 1))
    # Group by the feature and take the mean of Survived (Survived is 0/1, so the mean is the survival probability)
    sns.barplot(
        x=feature_name,
        y="Survived",
        # hue="Survived",
        data=data_train[[feature_name,
                         "Survived"]].groupby([feature_name],
                                              as_index=False).mean())
    # Alternative: grouped bars of survived vs. not survived for each feature value (less intuitive, in my view)
    # sns.countplot(x=feature_name, hue="Survived", data=data_train)

# Relationship between age and survival rate
data_train["Age_int"] = data_train["Age"].astype("int")
plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x="Age_int",
            y="Survived",
            data=data_train[["Age_int",
                             "Survived"]].groupby(["Age_int"],
                                                  as_index=False).mean())
# plt.show()
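# One bar per integer age is noisy; a hypothetical refinement bins ages first (the bin edges are illustrative):
data_train["Age_band"] = pd.cut(data_train["Age_int"], bins=[0, 12, 18, 35, 60, 100])
sns.barplot(x="Age_band",
            y="Survived",
            data=data_train[["Age_band", "Survived"]].groupby(["Age_band"], as_index=False).mean())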
Code example #57
0
mask = (DatosOrdenadosPorFecha_df['dateRep'] >=
        start_date) & (DatosOrdenadosPorFecha_df['dateRep'] <= end_date)
fechasfiltradas_df = DatosOrdenadosPorFecha_df.loc[mask]

#Build the dataframe with the data used in the report; copy() avoids pandas' SettingWithCopyWarning on the assignment below
grafico_df = fechasfiltradas_df[['dateRep', 'cases', 'moving14', 'moving7']].copy()

#Convert the date back to a string so it displays correctly on the chart
grafico_df['dateRep'] = grafico_df['dateRep'].astype(str)

#Draw the chart
fig, ax = plt.subplots(1, 1)

grafico = sns.barplot(ax=ax,
                      x="dateRep",
                      y="cases",
                      data=grafico_df,
                      label="Nuevos Casos Diarios")

###These settings are for the axes of a catplot figure
#grafico.set_titles("New cases in Spain", fontsize=30)
#grafico.set_xlabels("Date", fontsize=20)
#grafico.set_ylabels("Spain", fontsize=20)
#grafico.set_yticklabels(fontsize=10)
#grafico.set_xticklabels(fontsize=5)

#Line chart of the 14-day moving average
graficomv14 = sns.lineplot(ax=ax,
                           x="dateRep",
                           y="moving14",
                           data=grafico_df,
Code example #58
0
####

# Make a bar graph
fig1 = plt.figure(1)
plt.bar(np.arange(4),
        mean_impf,
        yerr=sem_impf,
        ecolor='black',
        tick_label=['I', 'II', 'III', 'IV'],
        align='center')
plt.ylabel('impact force (mN)')
fig1.show()

# Easier plot with Seaborn
fig2 = plt.figure(2)
sns.barplot(data=df, x='ID', y='impf')
plt.xlabel('')
plt.ylabel('impact force (mN)')
fig2.show()

###
# Message: do not make bar graphs.
###

# Bee swarm plot
fig3 = plt.figure(3)
sns.swarmplot(data=df, x='ID', y='impf')
plt.margins(0.02)
plt.xlabel('')
plt.ylabel('impact force (mN)')
fig3.show()
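
# Bar graphs hide the distribution; an ECDF shows every measurement. A sketch,
# reusing the same df (np is already in scope in this script):
fig4 = plt.figure(4)
for frog_id, grp in df.groupby('ID'):
    x = np.sort(grp['impf'].values)
    y = np.arange(1, len(x) + 1) / len(x)
    plt.plot(x, y, marker='.', linestyle='none', label=frog_id)
plt.xlabel('impact force (mN)')
plt.ylabel('ECDF')
plt.legend()
fig4.show()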
Code example #59
0
File: temp.py Project: chandan1234-c/Shortlister
    svcscore = (model.score(X_test_array, y_test1)) * 100

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train_array, y_train)
    clf_pred = clf.predict(X_test_array)
    clfscore = (clf.score(X_test_array, y_test1)) * 100

    knn = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2).fit(X_train_array, y_train)
    knnscore = (knn.score(X_test_array, y_test1)) * 100

    scores = [gnbscore, naivescore, svcscore, knnscore, dtscore, clfscore]
    algorithms = ["Gaussian naive bayes", "Bernoulli naive bayes", "Support Vector Machine",
                  "K-Nearest Neighbors", "Decision Tree", "Random Forest"]
    sns.set(rc={'figure.figsize': (15, 8)})
    plt.xlabel("Algorithms")
    plt.ylabel("Accuracy score")
    sns.barplot(x=algorithms, y=scores)  # keyword args: positional x/y were deprecated in newer seaborn
    
    final_model = naive_bayes
    
    # save the model to disk
    pickle.dump(final_model, open(save_model, 'wb'))
    
def make_prediction(resumeNo):
    resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo+1) + '.pdf'
    loaded_model = pickle.load(open(save_model, 'rb'))
    loaded_vector = pickle.load(open(save_vector, 'rb'))
    resumeFile=open(resume,'rb')
    sample_resume=slate.PDF(resumeFile)
    sample_resume=sample_resume[0]
    sample_resume=loaded_vector.transform([sample_resume])
    return loaded_model.predict(sample_resume)[0]
Code example #60
0
import codecademylib3_seaborn
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv('WorldCupMatches.csv')
print(df.head())

df['Total Goals'] = df['Home Team Goals'] + df['Away Team Goals']

print(df.head())

sns.set_style('whitegrid')
sns.set_context('poster', font_scale=0.8)  # the parameter is font_scale (a multiplier), not font_size; 0.8 is an assumed value
f, ax = plt.subplots(figsize=(10, 25))
ax = sns.barplot(data=df, x='Year', y='Total Goals')  # column names match the CSV and the column created above
ax.set_title('Average Goals per World Cup Year')

df_goals = pd.read_csv('goals.csv')
#print(df_goals.head())
sns.set_context('notebook', font_scale=1.25)  # set_context returns None, so it must not be assigned to ax2
f, ax2 = plt.subplots(figsize=(12, 7))  # subplots lives in matplotlib, not seaborn
ax2 = sns.boxplot(data=df_goals, x='year', y='goals', palette='Spectral')
ax2.set_title('Boxplot')

plt.show()