def plot_nesting(results, thresh=.5, rotate='oblimin', title=True,
                 dpi=300, figsize=12, ext='png', plot_dir=None):
    """ Plots nesting of factor solutions
    
    Args:
        results: a dimensional structure results object
        thresh: the threshold to pass to EFA.get_nesting_matrix
        rotate: the factor rotation passed to EFA.get_nesting_matrix
        dpi: the final dpi for the image
        figsize: scalar - the width and height of the (square) image
        ext: the extension for the saved figure
        plot_dir: the directory to save the figure. If None, do not save
    """
    EFA = results.EFA
    explained_scores, sum_explained = EFA.get_nesting_matrix(thresh, 
                                                             rotate=rotate)

    # plot lower nesting
    fig, ax = plt.subplots(1, 1, figsize=(figsize, figsize))
    cbar_ax = fig.add_axes([.905, .3, .05, .3])
    sns.heatmap(sum_explained, annot=explained_scores,
                fmt='.2f', mask=(explained_scores == -1), square=True,
                ax=ax, vmin=.2, cbar_ax=cbar_ax,
                xticklabels=range(1, sum_explained.shape[1] + 1),
                yticklabels=range(1, sum_explained.shape[0] + 1))
    ax.set_xlabel('Higher Factors (Explainer)', fontsize=25)
    ax.set_ylabel('Lower Factors (Explainee)', fontsize=25)
    ax.set_title('Nesting of Lower Level Factors based on R2', fontsize=30)
    if plot_dir is not None:
        filename = 'lower_nesting_heatmap.%s' % ext
        save_figure(fig, path.join(plot_dir, filename), 
                    {'bbox_inches': 'tight', 'dpi': dpi})
        plt.close()
def show_sgs_result(r, ax):
    N = r[0].shape[0] * r[0].shape[1]
    df = pd.DataFrame(np.c_[r[0].reshape(N), r[1].reshape(N), r[2].reshape(N)], columns=['x', 'y', 'flux'])
    df = df.pivot(index='x', columns='y')  # positional pivot args were removed in pandas 2.0
    sns.heatmap(df, ax=ax, robust=True, cbar=False, cmap="coolwarm", xticklabels=False, yticklabels=False)
    ax.set_ylabel("y")
    ax.set_xlabel("x")
def plot_corr(file, score, stat, ind_var, brain_type):

    # seaborn
    sns.set(style="white")

    # import the dataframe
    dt = pd.read_csv(file)

    # Compute the correlation matrix
    corr = dt.corr()

    ### Create the matrix figure with seaborn
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from numpy
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(len(ind_var),len(ind_var)))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, annot=False, ax=ax)
    plt.subplots_adjust(left=0.30, bottom=0.30)
    plt.savefig(os.path.join(stat,score, "heatmap_" + score + "_" + stat + "_"+ brain_type + ".png"))
    plt.close()

    return corr
def get_cov(data):
    dat = data.training_data_all_ways + data.testing_data_all_ways
    num_ways = len(data.get_list_of_ways())
    m = {}
    for i, way in enumerate(data.get_list_of_ways()):
        m[way] = i
    mat = np.zeros((num_ways,num_ways))
    for elem in dat:
        ways = elem[1]
        for way in ways:
            mat[m[way],m[way]] = mat[m[way],m[way]] + 1
        for w1 in ways:
            for w2 in ways:
                if w1 == w2: continue
                mat[m[w1],m[w2]] = mat[m[w1],m[w2]] + 1
    print(mat)
    emp_cov = empirical_covariance(mat)
    print(emp_cov)
    corr = np.zeros((num_ways,num_ways))
    for i in range(num_ways):
        for j in range(num_ways):
            corr[i,j] = emp_cov[i,j]/(math.sqrt(emp_cov[i,i])*math.sqrt(emp_cov[j,j]))
    print(corr)
    sns.heatmap(corr, vmin=-1, vmax=1, square=True,
                xticklabels=list(m.keys()), yticklabels=list(m.keys()))
    plt.title("Correlation of WAYS frequencies")  # the plotted matrix is the correlation
    plt.show()
def plotGraphicalCorrelationMatrix(data):
    '''
        Input : data
        Output : graphical correlation matrix
        Inspired from : https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
    '''
    try:
        print "\nGenerating the graphical correlation matrix...\n"
        time.sleep(3)

        corr = data.corr()
        f, ax = plt.subplots(figsize=(20, 20))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio
        # keep default ticks so set_xticklabels below has positions to label
        sns.heatmap(corr, cmap=cmap,
                    square=True,
                    linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
        plt.title('Correlation Matrix', fontsize=30)
        ax.set_ylabel('Features', fontsize=20)
        ax.set_xlabel('Features', fontsize=20)
        xticklabels = ['video_category_id','viewCount','likeCount','dislikeCount','favoriteCount','commentCount','dimension','definition','caption','licensedContent']
        ylabel = xticklabels[::-1]
        ax.set_xticklabels(xticklabels, rotation=45)
        ax.set_yticklabels(ylabel, rotation=0)
        name = "../YoutubeData/correlation_matrix.pdf"
        plt.savefig(name)
        print "\nPlease close the Bar Chart when you want to move ahead..."
        plt.show()

        print "You can always retrieve the graphical correlation matrix in YoutubeData folder.\n"
        time.sleep(3)
        return True
    except Exception:
        raise VideoAnalysisException(" Error while Generating the graphical correlation matrix")
Example 6
def plot_performance(parser, args, pore_measure):
    """
    Plot the pore performance in terms of reads per pore
    """
    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        if pore in pore_measure:
            pore_values.append(pore_measure[pore])
        else:
            pore_values.append(0)

    # make a data frame of the lists
    d = {'rownum': list(range(1, 17)) * 32,   # range * int is a TypeError in Python 3
         'colnum': sorted(list(range(1, 33)) * 16),
         'tot_reads': pore_values,
         'labels': flowcell_layout}
    df = pd.DataFrame(d)

    d = df.pivot("rownum", "colnum", "tot_reads")
    sns.heatmap(d, annot=True, fmt="d", linewidths=.5)

    if args.saveas is not None:
        plot_file = args.saveas
        plt.gcf().set_size_inches(8.5, 8.5)  # figsize is not a savefig kwarg
        plt.savefig(plot_file)
    else:
        plt.show()
def item_nbr_tendency_finely(store_nbr, year, month_start=-1, month_end=-1, graph=True):
    '''
    input
        1. store_nbr = store number
        2. year = year
        3. month_start = first month
        4. month_end = last month
        5. graph = whether to plot the item_nbr graph for the above selection

    output
        1. pivot table of item_nbr filtered by store_nbr, year, and month
    '''
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year)]

    if month_start != -1:
        if month_end == -1:
            month_end = month_start + 1
        store = store[(month_start <= store['month']) & (store['month'] < month_end)]

    pivot = store.pivot_table(index='item_nbr',
                              columns='date',
                              values='units',
                              aggfunc=np.sum)

    zero_index = pivot == 0
    pivot = pivot[pivot != 0].dropna(axis=0, how='all')
    pivot[zero_index] = 0

    if graph:
        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot, cmap="YlGnBu", annot=True, fmt='.0f')
        plt.show()

    return pivot
Example 8
File: c5.py Project: 3774257/abu
def sample_54_2():
    """
    5.4 使用seaborn可视化数据
    :return:
    """
    change_df = pd.DataFrame({'tsla': tsla_df.p_change})
    # join usGOOG
    change_df = change_df.join(pd.DataFrame({'goog': ABuSymbolPd.make_kl_df('usGOOG', n_folds=2).p_change}),
                               how='outer')
    # join usAAPL
    change_df = change_df.join(pd.DataFrame({'aapl': ABuSymbolPd.make_kl_df('usAAPL', n_folds=2).p_change}),
                               how='outer')
    # join usFB
    change_df = change_df.join(pd.DataFrame({'fb': ABuSymbolPd.make_kl_df('usFB', n_folds=2).p_change}),
                               how='outer')
    # join usBIDU
    change_df = change_df.join(pd.DataFrame({'bidu': ABuSymbolPd.make_kl_df('usBIDU', n_folds=2).p_change}),
                               how='outer')

    change_df = change_df.dropna()
    # shown in Table 5-2
    print('change_df.head():\n', change_df.head())

    # use corr to compute the correlations in the data
    corr = change_df.corr()
    _, ax = plt.subplots(figsize=(8, 5))
    # show the correlation of each stock's daily change as a sns.heatmap
    sns.heatmap(corr, ax=ax)
    plt.show()
Example 9
def plot_heatmap(df):
    df2 = df[['bonus',
              'deferred_income',
              'exercised_stock_options',
              'expenses',
              'long_term_incentive',
              'other',
              'restricted_stock',
              'salary',
              'total_payments',
              'total_stock_value',
              'from_messages',
              'from_poi_to_this_person',
              'from_this_person_to_poi',
              'shared_receipt_with_poi',
              'to_messages',
              'perc_from_poi',
              'perc_to_poi']]
    
    colormap = plt.cm.viridis
    plt.figure(figsize=(12,12))
    plt.title("Pearson's Correlation of Features", y=1.05, size=15)
    sns.heatmap(df2.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True,
                cmap=colormap, linecolor='white', annot=True)
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    fig.savefig('PearsonCorrelationOfFeatures.png', dpi=100)    
    plt.show()
def plot_EFA_retest(combined, size=4.6, dpi=300, 
                    ext='png', plot_dir=None):
    corr = combined.corr()
    max_val = abs(corr).max().max()
    
    fig = plt.figure(figsize=(size, size))
    ax = fig.add_axes([.1, .1, .8, .8])
    cbar_ax = fig.add_axes([.92, .15, .04, .7])
    sns.heatmap(corr, square=True, ax=ax, cbar_ax=cbar_ax,
                vmin=-1, vmax=1,
                cmap=sns.diverging_palette(220, 15, n=100, as_cmap=True),
                cbar_kws={'orientation': 'vertical',
                          'ticks': [-1, 0, 1]})
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    ax.tick_params(labelsize=size/len(corr)*40)
    
    # format cbar axis
    cbar_ax.set_yticklabels([format_num(-max_val), 0, format_num(max_val)])
    cbar_ax.tick_params(labelsize=size, length=0, pad=size/2)
    cbar_ax.set_ylabel('Factor Loading', rotation=-90,
                       fontsize=size, labelpad=size/2)
    
    # set divider lines
    n = corr.shape[1]
    ax.axvline(n//2, 0, n, color='k', linewidth=size/3)
    ax.axhline(n//2, 0, n, color='k', linewidth=size/3)
    
    if plot_dir is not None:
        save_figure(fig, path.join(plot_dir, 'EFA_test_retest_heatmap.%s' % ext),
                    {'bbox_inches': 'tight', 'dpi': dpi})
        plt.close()
Example 11
def plot_number_of_options_by_attempt(length):
    data = load_options_by_attempt()
    data['value'] = data['value'].apply(lambda x: x * 100)
    data = data[(data['attempt'] < length)]
    max_options = data['options'][data['options'] != 0].max()
    data['options'] = data['options'].apply(lambda x: max_options + 1 if x == 0 else x)
    cols = len(data['experiment_setup_name'].unique())
    gs = gridspec.GridSpec(1, cols, width_ratios=[3.5] * (cols - 1) + [4])
    rcParams['figure.figsize'] = cols * 2, int(5 * length / 50)
    rcParams['axes.linewidth'] = 1
    for j, (setup, setup_data) in enumerate(data.groupby('experiment_setup_name')):
        for opt in range(2, max_options + 1):
            if opt not in setup_data['options'].unique():
                for attempt in range(0, int(length)):
                    setup_data = pandas.concat([setup_data, pandas.DataFrame([{'attempt': attempt, 'options': opt, 'value': 0}])])  # DataFrame.append was removed in pandas 2.0
        plt.subplot(gs[j])
        to_plot = setup_data.pivot_table(columns='options', index='attempt', values='value', dropna=False, fill_value=0)
        plt.title(setup)
        sns.heatmap(to_plot, annot=False, cbar=(j == cols - 1), linewidths=0.1, cbar_kws={'format': '%.0f%%'})
        plt.xticks(plt.xticks()[0], [lab.get_text() if int(lab.get_text()) <= max_options else 'O' for lab in plt.xticks()[1]])
        if j != 0:
            plt.gca().axes.get_yaxis().set_ticks([])
            plt.ylabel('')
        else:
            pos = plt.yticks()[0]
            lab = plt.yticks()[1]
            plt.yticks([pos[0], pos[-1]], [int(lab[0].get_text()) + 1, int(lab[-1].get_text()) + 1])
    output.savefig('options_by_attempt')
Example 12
    def _plot_monthly_returns(self, stats, ax=None, **kwargs):
        """
        Plots a heatmap of the monthly returns.
        """
        returns = stats['returns']
        if ax is None:
            ax = plt.gca()

        monthly_ret = perf.aggregate_returns(returns, 'monthly')
        monthly_ret = monthly_ret.unstack()
        monthly_ret = np.round(monthly_ret, 3)
        monthly_ret.rename(
            columns={1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr',
                     5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug',
                     9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'},
            inplace=True
        )

        sns.heatmap(
            monthly_ret.fillna(0) * 100.0,
            annot=True,
            fmt="0.1f",
            annot_kws={"size": 8},
            alpha=1.0,
            center=0.0,
            cbar=False,
            cmap=cm.RdYlGn,
            ax=ax, **kwargs)
        ax.set_title('Monthly Returns (%)', fontweight='bold')
        ax.set_ylabel('')
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
        ax.set_xlabel('')

        return ax
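# perf.aggregate_returns compounds daily returns into monthly buckets; a hedged
# pandas sketch of that step (not necessarily the library's exact code),
# assuming `returns` is a float Series indexed by a DatetimeIndex:
def aggregate_returns_monthly(returns):
    # compound simple returns within each (year, month) group
    return returns.groupby(
        [returns.index.year, returns.index.month]
    ).apply(lambda r: (1.0 + r).prod() - 1.0)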
Example 13
def test_gp_inference_per():
  # few to many observations. Fewer than 4 observations normally crashes due to bad covariance matrices
  observations_n = range(20, 50, 2)
  number_steps = 100
  ripl = shortcuts.make_lite_church_prime_ripl()
  every_n_step = 1
  kl_matrix = np.zeros((len(observations_n), number_steps // every_n_step))

  for n_i in range(len(observations_n)):
    n = observations_n[n_i]
    x = np.random.uniform(0,30,n)
    y = f_periodic(x)# + np.random.normal(0,0.1,n)
    ripl.clear()
    ripl.bind_foreign_sp("make_gp_part_der",gp.makeGPSP)
    ripl.assume('make_const_func', VentureFunction(covs.makeConstFunc, [t.NumberType()], covs.constantType))
    ripl.assume('zero', "(apply_function make_const_func 0)")



    ripl.assume('make_per',VentureFunction(covs.makePeriodic,[t.NumberType(), t.NumberType(), t.NumberType()], t.AnyType("VentureFunction")))

    ripl.assume('make_noise',VentureFunction(covs.makeNoise,[t.NumberType()], t.AnyType("VentureFunction")))

    ripl.assume("func_plus", covs.makeLiftedAdd(lambda x1, x2: x1 + x2))

    ripl.assume('sf','(tag (quote hyper) 0 (uniform_continuous 0 100 ))')
    ripl.assume('l','(tag (quote hyper) 1 (uniform_continuous 0 100 ))')
    ripl.assume('p','(tag (quote hyper) 2 (uniform_continuous 0.01 100 ))')


    ripl.assume('sigma','0.1')

    ripl.assume('per', "(apply_function make_per sf p l )")
    ripl.assume('wn', "(apply_function make_noise sigma )")
    ripl.assume('gp',"""(tag (quote model) 0
                        (make_gp_part_der zero (apply_function func_plus per wn  )
                                )

                             )""")
   

    makeObservations(x,y,ripl)
    for steps in range(number_steps):
      if (steps % every_n_step )==0:
        xpost = np.random.uniform(33,36,1)[0]
        ypost = []
        for i in range(100):
            y = ripl.sample("(gp (array " + str(xpost) + " ))")
            ypost.append(y)
        kl_matrix[n_i][steps // every_n_step] = KL_normal(np.mean(ypost), np.std(ypost), f(xpost), 0.1)
      ripl.infer("(mh (quote hyper) one 1)")
  orig_cmap = matplotlib.cm.coolwarm
  shifted_cmap = shiftedColorMap(orig_cmap, midpoint=0.3, name='shifted')  
  sns.heatmap(kl_matrix, cmap=shifted_cmap, yticklabels=observations_n)
  plt.show()  # sns.plt was removed from seaborn; use matplotlib's pyplot directly
  max_kl = kl_matrix.max()
  shift = 1./max_kl
  heavily_shifted_cmap = shiftedColorMap(orig_cmap, midpoint=shift, name='shifted')
  sns.heatmap(kl_matrix, cmap=heavily_shifted_cmap, yticklabels=observations_n)
  plt.show()
def plot_confusion(classifier, threshold=0.4):
    x_train, x_test, y_train, y_test = train_test_split(df_new, y, test_size=0.2)
    y_pred = []
    try:
        prob_score = clf_grid.predict_proba(x_train)
    except Exception:
        # fall back to an explicit float cast if the estimator rejects the raw frame
        prob_score = clf_grid.predict_proba(np.float_(x_train))
    a = prob_score[:, 1]
    for idx, item in enumerate(a):
        if item >= threshold:
            item = 1
        else:
            item = 0
        y_pred.append(item)
    # Plotting                                                                                                              

    class_name = classifier.__repr__()
    class_name = re.sub(r'\([^)]*\)','',class_name)
    print ("")
    print ("")
    print("Legends")
    print ('1 - Substantiated')
    print ('0 - Unfounded')
    print("")
    print("Confusion Matrix: "+ class_name+ " (threshold- " +str(threshold)+")"  )
    sns.heatmap(metrics.confusion_matrix(y_pred, y_train), annot=True, cmap="YlGnBu",fmt ="d")
    plt.xlabel('Predicted')
    plt.ylabel('True')
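# Hedged aside: the element-wise threshold loop above can be vectorized; a
# minimal numpy sketch, assuming prob_score is the (n, 2) predict_proba output:
import numpy as np

def threshold_predictions(prob_score, threshold=0.4):
    # label 1 wherever P(class 1) clears the threshold
    return (prob_score[:, 1] >= threshold).astype(int)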
Example 15
def main(csv_filepath):
    """Exploratory data analysis for the Titanic dataset."""
    # Read data
    dtype = {'PassengerId': 'str',
             'Embarked': 'category',
             'Survived': 'category',
             'Pclass': 'category',
             'Sex': 'category',
             'SibSp': 'uint8',
             'Parch': 'uint8'}
    df = pd.read_csv(csv_filepath, dtype=dtype)
    describe_pandas_df(df, dtype=dtype)

    # Show histograms
    numeric_types = ['float64', 'int64', 'uint8']
    numerical_features = df.select_dtypes(include=numeric_types)
    numerical_features.hist(figsize=(30, 16),
                            bins=50,
                            xlabelsize=8,
                            ylabelsize=8)
    plt.savefig("titanic-histograms.png")
    plt.show()

    # Show correlations
    import seaborn as sns
    corr = numerical_features.corr()
    sns.heatmap(corr)
    plt.savefig("titanic-correlation.png")
    plt.show()
Example 16
    def plot_region_heatmap(self, clim=None):
        """

        Plots a frequency x region heatmap of mean t-statistics.

        """

        # mean t-stat within subject by region and frequency, then mean across subjects
        mean_df = self.group_df.groupby(['subject', 'regions', 'frequency']).mean().groupby(['regions', 'frequency']).mean()
        mean_df = mean_df.reset_index()

        # ignore data without a region
        mean_df['regions'].replace('', np.nan, inplace=True)
        mean_df = mean_df.dropna(subset=['regions'])

        # reshape it for easier plotting with seaborn
        mean_df = mean_df.pivot_table(index='frequency', columns='regions', values='t-stat')

        # center the colormap and plot
        if clim is None:
            clim = np.max(np.abs(mean_df.values))
        with sns.plotting_context("talk"):
            sns.heatmap(mean_df, cmap='RdBu_r',
                        yticklabels=mean_df.index.values.round(2),
                        vmin=-clim,
                        vmax=clim,
                        cbar_kws={'label': 't-stat'})
            plt.gca().invert_yaxis()
            plt.ylabel('Frequency')
            plt.xlabel('')

        plt.gcf().set_size_inches(12, 9)
Example 17
    def _process(self,data):
        for x in data:
            
            if data[x][1] not in self.data:
                #prepares the data to visualise the xcor matrix of a specific batch number.
                self.data[data[x][1]]={}
                self.data[data[x][1]]['matrix']=numpy.identity(self.size)
                self.data[data[x][1]]['ro_count']=0
            
            self.data[data[x][1]]['matrix'][(data[x][2][1],data[x][2][0])]=data[x][0]
            #self.addToProvState('batch_'+str(data[x][1]),self.data[data[x][1]]['matrix'],metadata={'matrix':str(self.data[data[x][1]]['matrix'])},dep=['batch_'+str(data[x][1])],ignore_inputs=False)
            self.data[data[x][1]]['ro_count']+=1
            
            if self.data[data[x][1]]['ro_count']==(self.size*(self.size-1))/2:
                matrix=self.data[data[x][1]]['matrix']
                
                d = pd.DataFrame(data=matrix,
                                 columns=range(0, self.size), index=range(0, self.size))
                
                mask = numpy.zeros_like(d, dtype=bool)  # numpy.bool was removed from numpy
                mask[numpy.triu_indices_from(mask)] = True

                # Set up the matplotlib figure
                f, ax = plt.subplots(figsize=(11, 9))

                # Generate a custom diverging colormap
                cmap = sns.diverging_palette(220, 10, as_cmap=True)

                # Draw the heatmap with the mask and correct aspect ratio
                sns.heatmap(d, mask=mask, cmap=cmap, vmax=1,
                            square=True,
                            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

                plt.savefig("./plots/" + str(data[x][1]) + "_plot.png")  # sns.plt was removed from seaborn
                self.write('output',(matrix,data[x][1]),metadata={'matrix':str(d),'batch':str(data[x][1])},dep=['batch_'+str(data[x][1])])
Example 18
def main():
    """Go Main Go"""
    pgconn = get_dbconn('asos')
    dfin = read_sql("""
    with mob as (
        select date_trunc('hour', valid) as ts, avg(dwpf) from alldata
        where station = 'MOB' and dwpf is not null GROUP by ts),
    cmi as (
        select date_trunc('hour', valid) as ts, avg(dwpf) from alldata
        where station = 'CMI' and dwpf is not null GROUP by ts),
    agg as (
        select m.ts, m.avg as dwpf, c.avg as tmpf
        from mob m JOIN cmi c on (m.ts = c.ts))
    select extract(month from ts) as month, extract(hour from ts) as hour,
    sum(case when dwpf >= tmpf then 1 else 0 end) / count(*)::float * 100.
    as freq from agg GROUP by month, hour ORDER by month, hour
    """, pgconn, index_col=None)

    df = dfin.pivot("month", "hour", "freq")

    fig, ax = plt.subplots(figsize=(9, 6))
    ax.set_title(("Hourly Frequency of Mobile (MOB) Dew Point\n"
                  "greater than or equal to Champaign (CMI) Dew Point"))
    sns.heatmap(df, annot=True, fmt=".0f", linewidths=.5, ax=ax, vmin=5, vmax=100)
    print(ax.get_yticks())
    ax.set_xlabel("Hour of Day (CDT or CST)")
    ax.set_xticklabels(["Mid", "1AM", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                   "11", "Noon", "1PM", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                   "11"])
    ax.set_yticklabels(calendar.month_abbr[1:])
    fig.savefig('test.png')
Example 19
def add_tensor_2d(box, length=10, feature_num=5, notification_box=None, x_label=None, y_label=None):

    # # Generate a large random dataset
    # rs = np.random.RandomState(33)
    # d = pd.DataFrame(data=rs.normal(size=(length, feature_num)),)
    #
    # # Generate a mask for the upper triangle
    # mask = np.zeros_like(d, dtype=np.bool)
    # mask[np.triu_indices_from(mask)] = True

    uniform_data = np.random.randn(feature_num, length)
    # Draw the heatmap with the mask and correct aspect ratio

    region = calculate_region(box, length, feature_num)
    cur_ax = plt.gcf().add_axes(region)

    sns.heatmap(uniform_data, xticklabels=False, yticklabels=False, square=True,
                linewidths=.5, ax=cur_ax, cbar=False)

    cur_ax.add_patch(Rectangle((0,0), length, feature_num, fill=False, color="black", linewidth=2))

    if not notification_box:
        notification_box = []

    for box in notification_box:
        cur_ax.add_patch(Rectangle(box["box"][:2], box["box"][2], box["box"][3], fill=True, color=box["color"], alpha=box["alpha"], linewidth=5))

    return cur_ax
Example 20
def heatmap_discrete_levels(data, key2color, fill_color=(1,1,1), **heatmap_kwargs):
    """data can be a DataFrame (with multiple value levels), or
    a tuple, list, or dict of DataFrames with True-False-like values.

    In the former case, key2color[key] is the color for data[data==key].
    In the latter case, key2color[key] is the color for data[key].

    key2color is a dict. Values must be r,g,b tuples [0,1]
    """
    filled = False
    for key, rgb in key2color.items():
        if isinstance(data, pd.DataFrame):
            vals = (data == key)
        else:
            vals = data[key]

        if filled:
            if not vals.any().any():
                # We will get an error if we try to
                # render this.
                continue
            color = monocolor(*rgb)
            vals[~vals.astype('bool')] = np.nan
            vals = vals.astype('float16')
        else:
            color = duocolor(fill_color, rgb)
            filled = True

        kwargs = {'cbar':False}
        kwargs.update(heatmap_kwargs)
        sns.heatmap(vals, cmap=color, **kwargs)
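# monocolor and duocolor are not shown in this example; one plausible
# (hypothetical) reading, sketched with matplotlib's ListedColormap:
from matplotlib.colors import ListedColormap

def monocolor(r, g, b):
    # one-color map: float-valued True cells get the color, NaN cells stay blank
    return ListedColormap([(r, g, b)])

def duocolor(fill_rgb, rgb):
    # two-color map: False cells get the fill color, True cells the level color
    return ListedColormap([fill_rgb, rgb])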
Example 21
def corrplots(ipfile,opname):
    opdir='/'.join(opname.split('/')[:-1])
    if not os.path.isdir(opdir):
        print("### Making Directory ####")
        os.makedirs(opdir)
    corrmat=pd.read_csv(ipfile,index_col=0)

    cols=list([c.split('/')[-1] for c in corrmat.columns])
    indices=list([i.split('/')[-1] for i in corrmat.index])

    corrmat.columns=[c.split('/')[-1] for c in corrmat.columns]
    corrmat.index=[i.split('/')[-1] for i in corrmat.index]

    cols = natsorted(cols, key=lambda s: s.split('-')[1])
    cols = natsorted(cols, key=lambda s: s.split('-')[2])
    cols = natsorted(cols, key=lambda s: s.split('-')[0])
    corrmat = corrmat[cols]

    indices = natsorted(indices, key=lambda s: s.split('-')[1])
    indices = natsorted(indices, key=lambda s: s.split('-')[2])
    indices = natsorted(indices, key=lambda s: s.split('-')[0])
    corrmat = corrmat.reindex(indices)  # reorder rows by the sorted indices (was reindex(cols))

    print "###### Generating Heatmap ######"
    sns.heatmap(corrmat)
    print "###### Saving fig to "+opname+" ######"
    plt.savefig(opname)
    plt.title(opname.split('/')[-1].split('.')[0])
    plt.tight_layout()
    plt.close()
    plt.cla()

    return corrmat
Example 22
def plot_attention(sentence, Tx=20, Ty=25):
    """
    可视化Attention层

    @param sentence: 待翻译的句子,str类型
    @param Tx: 输入句子的长度
    @param Ty: 输出句子的长度
    """

    X = np.array(text_to_int(sentence, source_vocab_to_int))
    f = K.function(model.inputs, [model.layers[9].get_output_at(t) for t in range(Ty)])

    s0 = np.zeros((1, n_s))
    c0 = np.zeros((1, n_s))
    out0 = np.zeros((1, len(target_vocab_to_int)))

    r = f([X.reshape(-1, 20), s0, c0, out0])

    attention_map = np.zeros((Ty, Tx))
    for t in range(Ty):
        for t_prime in range(Tx):
            attention_map[t][t_prime] = r[t][0, t_prime, 0]

    Y = make_prediction(sentence)

    source_list = sentence.split()
    target_list = Y.split()

    f, ax = plt.subplots(figsize=(20, 15))
    sns.heatmap(attention_map, xticklabels=source_list, yticklabels=target_list, cmap="YlGnBu")
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=15, rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=15)
    def plot_heatmap(self, show=True, save=False, metric='growth rate', unit=None, vmin=None, vmax=None):

        if self.results['row'].max() > 8 or self.results['column'].max() > 12:
            results_arr = np.empty((16, 24))  # assume 384-well plate
            results_arr.fill(np.nan)
            indices = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P']
            columns = range(1,25)
        else:
            results_arr = np.empty((8, 12))  # assume 96-well plate
            results_arr.fill(np.nan)
            indices = ['A','B','C','D','E','F','G','H']
            columns = range(1,13)

        results_arr[(self.results['row']-1), (self.results['column']-1)] = self.results[metric]

        # only show if not saved?
        data = pd.DataFrame(results_arr, index=indices, columns=columns)
        if vmin is not None:
            ax = sns.heatmap(data, vmin=vmin, vmax=vmax, cmap='spring_r', linewidths=0.01)
        else:
            ax = sns.heatmap(data, cmap='spring_r', linewidths=0.01)
        plt.yticks(rotation=0)
        ax.xaxis.set_ticks_position('top')  # sns.heatmap returns an Axes, not a Figure
        if unit: plt.title(metric + ' (' + unit +')', y=1.1)
        else: plt.title(metric, y=1.1)

        if save:
            heat_name = self.name +'_'+ metric.replace(' ', '_') +'_heatmap.png'
            heat_file = os.path.join(self.out_dir, heat_name)
            plt.savefig(heat_file, dpi=300, bbox_inches='tight')

        if show:
            return plt.show()
Example 24
def get_legal_pairs():
    """
    Plots the legal pairs of detectors for GBM observations

    Returns
    -------

    """

    dlp = np.array([[0, 274, 39, 171, 12, 29, 0, 5, 1, 6, 1, 0],
                    [258, 0, 233, 55, 4, 100, 2, 1, 1, 12, 27, 0],
                    [55, 437, 0, 2, 2, 311, 0, 1, 1, 13, 235, 0],
                    [215, 80, 3, 0, 330, 107, 4, 8, 19, 2, 1, 0],
                    [13, 4, 8, 508, 0, 269, 2, 29, 236, 0, 1, 0],
                    [44, 188, 337, 166, 279, 0, 0, 0, 0, 0, 0, 0],
                    [0, 1, 1, 2, 2, 0, 0, 238, 46, 180, 12, 33],
                    [0, 2, 0, 18, 35, 0, 222, 0, 221, 61, 3, 109],
                    [0, 0, 1, 16, 215, 0, 51, 399, 0, 4, 2, 303],
                    [3, 18, 21, 4, 0, 0, 190, 82, 1, 0, 324, 110],
                    [1, 25, 191, 0, 0, 0, 16, 6, 4, 516, 0, 293],
                    [0, 0, 0, 0, 0, 0, 32, 147, 297, 138, 263, 0]])

    sns.heatmap(dlp, annot=True, fmt='d', cmap="YlGnBu")
    plt.ylabel("NaI")
    plt.xlabel("NaI")
Example 25
    def summary(self, stdout=True, plot=False):
        '''
        Displays diagnostics to the user

        Args:
            stdout (bool): print results to the console
            plot (bool): use Seaborn to plot results
        '''
        if stdout:
            print('Collinearity summary:')
            print(pd.concat([self.results['Eigenvalues'],
                            self.results['ConditionIndices'], 
                            self.results['VIFs'],
                            self.results['CorrelationMatrix']],
                            axis=1))

            print('Outlier summary:')
            print(self.results['RowMahalanobisDistances'])
            print(self.results['ColumnMahalanobisDistances'])

            print('Validity summary:')
            print(self.results['Variances'])
        
        if plot:
            for key, result in self.results.items():
                if key == 'CorrelationMatrix':
                    ax = plt.axes()
                    sns.heatmap(result, cmap='Blues', ax=ax)
                    ax.set_title(key)
                    plt.show()
                else:
                    result.plot(kind='bar', title=key)
                    plt.show()
Example 26
def _fill(data, colors, ax):
    if len(data) == 0:
        return

    data = data.copy()

    # Subset data and colors.
    data = data.loc[data['alteration'].isin(colors)]  # .ix was removed from pandas
    colors = {k: v for k, v in colors.items()
              if k in set(data['alteration'])}

    # Build number/color maps.
    num_map, colors_ord = {}, []
    for i, (type_, color) in enumerate(colors.items()):
        num_map[type_] = i
        colors_ord.append(color)

    # Map to numeric and pivot.
    data['value'] = data['alteration'].map(num_map)
    mat = _pivot(data, values='value')

    # Mask na-values.
    mask = pd.isnull(mat)

    # Draw fills using heatmap.
    cmap = discrete_cmap(colors_ord)
    sns.heatmap(mat, cmap=cmap, ax=ax, mask=mask,
                cbar=False, linewidths=0.5)
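# discrete_cmap is defined elsewhere; a minimal sketch of what it plausibly
# does (an assumption, not the project's code): one flat color per integer level.
from matplotlib.colors import ListedColormap

def discrete_cmap(colors_ord):
    # value i in the matrix maps to colors_ord[i]
    return ListedColormap(colors_ord)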
Example 27
def show_heatmap(filename):
    """Show confusion matrix given of a partis-generated tab-delimited db."""
    true_labels, estimated_labels = get_clones_real_estimated(filename)
    cm, rows, cols = confusion_matrix(true_labels, estimated_labels)
    df = pd.DataFrame(cm, index=rows, columns=cols)
    sns.heatmap(df)
    plt.show()
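# confusion_matrix here is a project helper returning (matrix, row_labels,
# col_labels); a rough pandas stand-in, assuming that contract:
import pandas as pd

def confusion_matrix(true_labels, estimated_labels):
    ct = pd.crosstab(pd.Series(true_labels), pd.Series(estimated_labels))
    return ct.values, list(ct.index), list(ct.columns)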
def plot_correlation_matrix(df, title='Correlation matrix', cmap=plt.cm.coolwarm):
    plt.figure(figsize=(12, 10))
    sns.heatmap(df.corr(), annot=False, cmap=cmap)
    plt.yticks(rotation=0, fontsize=8)
    plt.xticks(rotation=90, fontsize=8)
    plt.title(title)
    plt.savefig('corr_matrix.png')
Example 29
def spectrograms(D, p_local, p_global):
    if p_local['eog_in']:
        D = D[p_global['eeg_chans'], :]
    C = D.shape[0]
    T = D.shape[1]
    for c in range(C)[:3]:
        f, t, Sxx = spectrogram(D[c, :], p_global['sample_freq'])
        sns.heatmap(np.log(Sxx[::-1, :]), xticklabels=t.astype(int),
                    yticklabels=f.astype(int)[::-1])

        # There is probably a better way to do this
        for label in plt.gca().get_xticklabels():
            label.set_visible(False)
        for label in plt.gca().get_xticklabels()[::Sxx.shape[1] // 6]:  # slice steps must be ints in Python 3
            label.set_visible(True)
        for label in plt.gca().get_yticklabels():
            label.set_visible(False)
        for label in plt.gca().get_yticklabels()[::Sxx.shape[0] // 6]:
            label.set_visible(True)
        cbar = plt.gca().collections[0].colorbar
        plt.title('Spectrogram for channel ' + str(c + 1))
        plt.xlabel('Time in seconds')
        plt.ylabel('Frequency')
        cbar.set_label(r"$\log(\hat{f})$", labelpad=20, rotation=270)
        path = p_global['plot_folders']['spectrogram_dir'] \
               + '/' + 'spectrogram-%03d' % (c + 1)
        if p_global['plotting']['notebook']:
            show_and_close()
        else:
            save_and_close(path, p_local)
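# Hedged alternative to the label-hiding loops above: blank out all but every
# n-th label before calling sns.heatmap (assumes t, f and Sxx as in the loop):
xt = t.astype(int).astype(str)
xt[np.arange(len(xt)) % max(1, Sxx.shape[1] // 6) != 0] = ''
yt = f.astype(int).astype(str)[::-1]
yt[np.arange(len(yt)) % max(1, Sxx.shape[0] // 6) != 0] = ''
sns.heatmap(np.log(Sxx[::-1, :]), xticklabels=xt, yticklabels=yt)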
Example 30
def plot_metrics_correlation():
    data = load_data_to_correlate().rename(columns={
        'quit_score': 'quit score',
        'survival_answers_10': 'survival (10 ans.)',
        'survival_answers_100': 'survival (100 ans.)',
        'survival_time_60': 'survival (1 min.)',
        'survival_time_600': 'survival (10 min.)',
        'learning_slope_5': 'learning (5)',
        'learning_slope_10': 'learning (10)',
        'learning_slope_20': 'learning (20)',
    })
    data = data[~data['context'].apply(lambda c: 'region_cz' in c)]
    plt.title('Correlation of different metrics')
    sns.heatmap(data.corr().abs(), annot=True, fmt='.2f')
    output.savefig('abexp_metric_corr')
    g = sns.PairGrid(
        data[[
            # 'quit score',
            'survival (100 ans.)',
            'survival (10 min.)',
            'survival (10 ans.)',
            'survival (1 min.)',
            # 'learning (10)',
            'experiment',
        ]], hue='experiment')
    g = g.map_diag(plt.hist)
    g = g.map_offdiag(plt.scatter)
    g = g.add_legend()
    output.savefig('abexp_metrics', tight_layout=False)
Example 31
def main():
    activities = ["EDA", "Plots", "Model Building"]
    choice = st.sidebar.selectbox("Select Activities", activities)

    if choice == 'EDA':
        st.subheader("Exploratory Data Analysis")

        data = st.file_uploader("Upload a Dataset", type=["csv", "txt"])
        if data is not None:
            df = pd.read_csv(data)
            st.dataframe(df.head())

            if st.checkbox("Show Shape"):
                st.write(df.shape)

            if st.checkbox("Show Columns"):
                all_columns = df.columns.to_list()
                st.write(all_columns)

            if st.checkbox("Summary"):
                st.write(df.describe())

            if st.checkbox("Show Selected Columns"):
                selected_columns = st.multiselect("Select Columns",
                                                  all_columns)
                new_df = df[selected_columns]
                st.dataframe(new_df)

            if st.checkbox("Show Value Counts"):
                st.write(df.iloc[:, -1].value_counts())

            if st.checkbox("Correlation Plot(Matplotlib)"):
                plt.matshow(df.corr())
                st.pyplot()

            if st.checkbox("Correlation Plot(Seaborn)"):
                st.write(sns.heatmap(df.corr(), annot=True))
                st.pyplot()

            if st.checkbox("Pie Plot"):
                all_columns = df.columns.to_list()
                column_to_plot = st.selectbox("Select 1 Column", all_columns)
                pie_plot = df[column_to_plot].value_counts().plot.pie(
                    autopct="%1.1f%%")
                st.write(pie_plot)
                st.pyplot()

    elif choice == 'Plots':
        st.subheader("Data Visualization")
        data = st.file_uploader("Upload a Dataset", type=["csv", "txt"])
        if data is not None:
            df = pd.read_csv(data)
            st.dataframe(df.head())

            if st.checkbox("Show Value Counts"):
                st.write(df.iloc[:, -1].value_counts().plot(kind='bar'))
                st.pyplot()

            # Customizable Plot

            all_columns_names = df.columns.tolist()
            type_of_plot = st.selectbox(
                "Select Type of Plot",
                ["area", "bar", "line", "hist", "box", "kde"])
            selected_columns_names = st.multiselect("Select Columns To Plot",
                                                    all_columns_names)

            if st.button("Generate Plot"):
                st.success("Generating Customizable Plot of {} for {}".format(
                    type_of_plot, selected_columns_names))

                # Plot By Streamlit
                if type_of_plot == 'area':
                    cust_data = df[selected_columns_names]
                    st.area_chart(cust_data)

                elif type_of_plot == 'bar':
                    cust_data = df[selected_columns_names]
                    st.bar_chart(cust_data)

                elif type_of_plot == 'line':
                    cust_data = df[selected_columns_names]
                    st.line_chart(cust_data)

                # Custom Plot
                elif type_of_plot:
                    cust_plot = df[selected_columns_names].plot(
                        kind=type_of_plot)
                    st.write(cust_plot)
                    st.pyplot()

    elif choice == 'Model Building':
        st.subheader("Building ML Models")
        data = st.file_uploader("Upload a Dataset", type=["csv", "txt"])
        if data is not None:
            df = pd.read_csv(data)
            st.dataframe(df.head())

            # Model Building
            X = df.iloc[:, 0:-1]
            Y = df.iloc[:, -1]
            seed = 7
            # prepare models
            models = []
            models.append(('LR', LogisticRegression()))
            models.append(('LDA', LinearDiscriminantAnalysis()))
            models.append(('KNN', KNeighborsClassifier()))
            models.append(('CART', DecisionTreeClassifier()))
            models.append(('NB', GaussianNB()))
            models.append(('SVM', SVC()))
            # evaluate each model in turn

            model_names = []
            model_mean = []
            model_std = []
            all_models = []
            scoring = 'accuracy'
            for name, model in models:
                kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # random_state requires shuffle=True in newer scikit-learn
                cv_results = model_selection.cross_val_score(model,
                                                             X,
                                                             Y,
                                                             cv=kfold,
                                                             scoring=scoring)
                model_names.append(name)
                model_mean.append(cv_results.mean())
                model_std.append(cv_results.std())

                accuracy_results = {
                    "model name": name,
                    "model_accuracy": cv_results.mean(),
                    "standard deviation": cv_results.std()
                }
                all_models.append(accuracy_results)

            if st.checkbox("Metrics As Table"):
                st.dataframe(
                    pd.DataFrame(zip(model_names, model_mean, model_std),
                                 columns=["Algo", "Mean of Accuracy", "Std"]))

            if st.checkbox("Metrics As JSON"):
                st.json(all_models)

# In[40]:


# "convert" ungly state values in dict to 1d-array (only without usable ace)

v_matrix = np.zeros(100)

i = 0
for playersum in range(10):
    for dealercard in range(10):
        v_matrix[i] = state_values[(playersum, dealercard, 0)]
        i += 1
        


# In[41]:


import seaborn as sns
plt.figure(figsize=(4, 4))
sns.heatmap(v_matrix.reshape(10, 10),  cmap="YlGnBu", annot=True, cbar=False, square=True);
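# The nested fill loop above can also be written as a single comprehension;
# a sketch assuming the same state_values dict:
v_matrix = np.array([state_values[(playersum, dealercard, 0)]
                     for playersum in range(10)
                     for dealercard in range(10)])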


# In[ ]:




Example 33
# distplot(hist=False) was removed in newer seaborn; kdeplot is the equivalent
sns.kdeplot(stock2['Gain'], color='r')
sns.kdeplot(stock3['Gain'], color='g')
sns.kdeplot(stock4['Gain'], color='y')

# ## Correlation
All_Stocks = pd.concat([stock1['Gain'],stock2['Gain'],stock3['Gain'],stock4['Gain']], axis=1)

names = [symbol1, symbol2, symbol3, symbol4]
All_Stocks.columns = names
All_Stocks = All_Stocks.dropna()

print (All_Stocks.corr())

#Heat map
sns.set(rc={"figure.figsize": (6, 4)});
sns.heatmap( All_Stocks.corr())

# ### Monthly Returns
Stock1_Monthly = stock1.asfreq('M').ffill()
Stock2_Monthly = stock2.asfreq('M').ffill()
Stock3_Monthly = stock3.asfreq('M').ffill()
Stock4_Monthly = stock4.asfreq('M').ffill()

print('Monthly Returns')
print('Stock '+ symbol1 + ' Mean:', Stock1_Monthly["Gain"].mean())
print('Stock '+ symbol1 + ' Variances:', Stock1_Monthly["Gain"].var())

print('Monthly Returns')
print('Stock '+ symbol2 + ' Mean:', Stock2_Monthly["Gain"].mean())
print('Stock '+ symbol2 + ' Variances:', Stock2_Monthly["Gain"].var())
Example 34
    "wt_bc==0").query("lib_type.isin(['sub','del'])")
antibody_selection_mean_for_heatmap = antibody_selection_mean_for_heatmap.loc[:,
                                                                              a20_postions_subs]
antibody_selection_mean_for_heatmap.index = antibody_selection_mean_for_heatmap.index.droplevel(
    [1, 2])
antibody_selection_mean_for_heatmap = antibody_selection_mean_for_heatmap.reindex(
    DESIRED_AA_ORD)
antibody_selection_mean_for_heatmap.head()

# In[31]:

sns.set(**PAPER_PRESET)
fig, ax = plt.subplots(figsize=[3.174 * 1.5, 1.625 * 1.5])
sns.heatmap(antibody_selection_mean_for_heatmap.apply(np.log2),
            cmap='RdBu_r',
            vmin=-10,
            vmax=10,
            ax=ax,
            yticklabels=True)
ax.set_ylabel("Amino Acid")
ax.set_xlabel("VP Position")

# In[32]:

all_residues = antibody_selection_mean.apply(np.log2).dropna()
a20_residues = antibody_selection_mean.loc[a20_postions_subs].apply(
    np.log2).dropna()

a20_fraction = a20[a20 > 2.5].count() / float(a20.count())
all_residues_fraction = all_residues[all_residues > 2.5].count() / float(
    all_residues.count())
fig = plt.figure(figsize=(1.2, 1.1))
# In[16]:

sns.pairplot(customers)

# In[30]:

customers.isna().sum()

# In[31]:

customers.corr()

# In[35]:

sns.heatmap(customers.corr(), annot=True)

# In[37]:

customers.columns

# In[69]:

customers['Avatar'].unique()

# In[94]:

customers_i1 = customers.copy(deep=True)

# In[95]:
Example 36
#%%
data_911_date_reason = data_911.groupby(['date', 'reason'])
data_911_date_reason_count = data_911_date_reason.count()
data_911_date_reason_count = data_911_date_reason_count.reset_index()
fg = sns.FacetGrid(data=data_911_date_reason_count, col='reason')
fg.map(sns.lineplot, 'date', 'timeStamp')

#%%
data_dayofweek_hour_group = data_911.groupby(['dayofweek_str', 'hour'])
data_dayofweek_hour_group_count = data_dayofweek_hour_group.count()
#data_dayofweek_hour_group_count
data_dayofweek_hour_group_count = data_dayofweek_hour_group_count[
    'timeStamp'].unstack(level=-1)

#%%
sns.heatmap(data=data_dayofweek_hour_group_count)

#%%
sns.clustermap(data=data_dayofweek_hour_group_count)

#%%
data_dayofweek_month_group = data_911.groupby(['month', 'dayofweek_str'])
data_dayofweek_month_group_count = data_dayofweek_month_group.count()
#data_dayofweek_hour_group_count
data_dayofweek_month_group_count = data_dayofweek_month_group_count[
    'timeStamp'].unstack(level=0)

#%%
sns.heatmap(data=data_dayofweek_month_group_count)

#%%
# Compute the correlation matrix
corr = dataset1[["revenue", "budget", "popularity", "runtime", "num_of_cast", "num_of_male_cast",
                 "num_of_female_cast",
                 "num_genres", "num_of_production_countries", "day_of_week", "month", "year", "week_of_year", "season",
                 "title_len", "overview_len", "tagline_len",
                 "num_of_directors", "num_of_producers", "num_of_editors", "num_of_art_crew", "num_of_sound_crew",
                 "num_of_costume_crew", "num_of_camera_crew", "num_of_visual_effects_crew", "num_of_lighting_crew",
                 "num_of_other_crew"]].corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from numpy
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, mask=mask, 
            annot=True, 
            fmt=".2f", 
            cmap='coolwarm')

plt.title("Correlation between numerical features")



#   Bivariate Analysis for log-transformed numerical features

sns.set(rc={'figure.figsize':(18,20)})

# Compute the correlation matrix
corr = dataset1[["log_revenue", "log_budget", "log_popularity", "log_runtime",
                 "log_num_of_cast", "log_num_of_male_cast",
                 "log_num_of_female_cast", "num_genres", "num_of_production_countries",
                "day_of_week", "month", "year", "week_of_year", "season",
Example 38
"""Inference : <br>
Before Attrition, There were a total of 14,999‬ Employees working for the company. <br>
And after the Attrition, 3571 employees left the company leaving behind 11428 employees.

1.  Analysis for Existing Employees.

Visualising and Dropping off the Completely Null Columns
"""

# to see how many values are missing in each column.
df_exist.isnull().sum()

# visualizing and observing the null elements in the dataset
plt.figure(figsize=(10,10))
sns.heatmap(df_exist.isnull(), cbar=False, cmap='YlGnBu')   # plotting missing data; cbar, cmap = colour bar, colour map

"""Inference : <br>
There are null entries in both the datasets.

Checking for duplicate value columns
"""

x = set()                                                    # use a set() to keep only unique column names
for i in range(df_exist.shape[1]):
        c1 = df_exist.iloc[:, i]
        for j in range(i + 1, df_exist.shape[1]):
            c2 = df_exist.iloc[:, j]
            if c1.equals(c2):
                x.add(df_exist.columns.values[j])
for col in x:
Example 39
model.add(Dense(32, activation='relu', input_dim=8))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=200, verbose=False)

# Results - accuracy
scores = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: %.2f%%\n" % (scores[1]*100))
scores = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: %.2f%%\n" % (scores[1]*100))

# Results - confusion matrix
y_test_pred = (model.predict(X_test) > 0.5).astype(int)  # predict_classes was removed in newer Keras
c_matrix = confusion_matrix(y_test, y_test_pred)
ax = sns.heatmap(c_matrix, annot=True, xticklabels=['No Diabetes', 'Diabetes'], yticklabels=['No Diabetes', 'Diabetes'], cbar=False, cmap='Blues')
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
plt.show()
plt.clf()

# Results - ROC curve
y_test_pred_probs = model.predict(X_test)
FPR, TPR, _ = roc_curve(y_test, y_test_pred_probs)
plt.plot(FPR, TPR)
plt.plot([0,1],[0,1],'--', color='black') #diagonal line
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
plt.clf()
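# For a scalar summary of the ROC curve, sklearn's roc_auc_score applies
# directly (assuming it is imported alongside roc_curve):
from sklearn.metrics import roc_auc_score
print('AUC: %.3f' % roc_auc_score(y_test, y_test_pred_probs))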
plt.title('Scatter plot between Age vs Estimated Salary')

sns.set_style('whitegrid')
sns.FacetGrid(ads,hue='Purchased',height=7).map(plt.scatter,'Age','EstimatedSalary').add_legend()
plt.show()

ads.drop(columns=['User ID'],inplace=True)

sns.set_style('whitegrid')
sns.pairplot(ads,hue='Purchased',height=5)
plt.show()

sns.boxplot(x='Purchased',y='EstimatedSalary',data=ads)

sns.jointplot(x='Age',y='EstimatedSalary',data=ads)

sns.regplot(x='Age',y='EstimatedSalary',data=ads,fit_reg=True,order =6)

sns.lmplot(x='Age',y='EstimatedSalary',data=ads,hue='Purchased')

sns.clustermap(ads.corr(),figsize=(7,7),annot=True)
plt.show()

sns.heatmap(ads.corr(),annot =True,cbar_kws={'orientation':'horizontal'})
plt.show()

ads.isnull().sum()

ads.info()

ads.describe()
Example 41
def associations(dataset,
                 nominal_columns='auto',
                 mark_columns=False,
                 theil_u=False,
                 plot=True,
                 return_results=False,
                 clustering=False,
                 nan_strategy=REPLACE,
                 nan_replace_value=DEFAULT_REPLACE_VALUE,
                 ax=None,
                 **kwargs):
    """
    Calculate the correlation/strength-of-association of features in data-set
    with both categorical (eda_tools) and continuous features using:
     * Pearson's R for continuous-continuous cases
     * Correlation Ratio for categorical-continuous cases
     * Cramer's V or Theil's U for categorical-categorical cases

    **Returns:** a DataFrame of the correlation/strength-of-association between
    all features

    **Example:** see `associations_example` under `dython.examples`

    Parameters
    ----------
    dataset : NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    nominal_columns : string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can
        also be the string 'all' to state that all columns are categorical,
        'auto' (default) to try to identify nominal columns, or None to state
        none are categorical
    mark_columns : Boolean, default = False
        if True, output's columns' names will have a suffix of '(nom)' or
        '(con)' based on there type (eda_tools or continuous), as provided
        by nominal_columns
    theil_u : Boolean, default = False
        In the case of categorical-categorical features, use Theil's U instead
        of Cramer's V
    plot : Boolean, default = True
        If True, plot a heat-map of the correlation matrix
    return_results : Boolean, default = False
        If True, the function will return a Pandas DataFrame of the computed
        associations
    clustering : Boolean, default = False
        If True, hierarchical clustering is applied in order to sort
        features into meaningful groups
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop_samples' to remove
        samples with missing values, 'drop_features' to remove features
        (columns) with missing values, or 'replace' to replace all missing
        values with the nan_replace_value. Missing values are None and np.nan.
    nan_replace_value : any, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'
    ax : matplotlib ax, default = None
      Matplotlib Axis on which the heat-map will be plotted
    kwargs : any key-value pairs
        Arguments to be passed to used function and methods
    """
    dataset = convert(dataset, 'dataframe')
    if nan_strategy == REPLACE:
        dataset.fillna(nan_replace_value, inplace=True)
    elif nan_strategy == DROP_SAMPLES:
        dataset.dropna(axis=0, inplace=True)
    elif nan_strategy == DROP_FEATURES:
        dataset.dropna(axis=1, inplace=True)
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif nominal_columns == 'all':
        nominal_columns = columns
    elif nominal_columns == 'auto':
        nominal_columns = identify_nominal_columns(dataset)

    corr = pd.DataFrame(index=columns, columns=columns)
    for i in range(0, len(columns)):
        for j in range(i, len(columns)):
            if i == j:
                corr[columns[i]][columns[j]] = 1.0
            else:
                if columns[i] in nominal_columns:
                    if columns[j] in nominal_columns:
                        if theil_u:
                            corr[columns[j]][columns[i]] = theils_u(
                                dataset[columns[i]],
                                dataset[columns[j]],
                                nan_strategy=SKIP)
                            corr[columns[i]][columns[j]] = theils_u(
                                dataset[columns[j]],
                                dataset[columns[i]],
                                nan_strategy=SKIP)
                        else:
                            cell = cramers_v(dataset[columns[i]],
                                             dataset[columns[j]],
                                             nan_strategy=SKIP)
                            corr[columns[i]][columns[j]] = cell
                            corr[columns[j]][columns[i]] = cell
                    else:
                        cell = correlation_ratio(dataset[columns[i]],
                                                 dataset[columns[j]],
                                                 nan_strategy=SKIP)
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
                else:
                    if columns[j] in nominal_columns:
                        cell = correlation_ratio(dataset[columns[j]],
                                                 dataset[columns[i]],
                                                 nan_strategy=SKIP)
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
                    else:
                        cell, _ = ss.pearsonr(dataset[columns[i]],
                                              dataset[columns[j]])
                        corr[columns[i]][columns[j]] = cell
                        corr[columns[j]][columns[i]] = cell
    corr.fillna(value=np.nan, inplace=True)
    if mark_columns:
        marked_columns = [
            '{} (nom)'.format(col)
            if col in nominal_columns else '{} (con)'.format(col)
            for col in columns
        ]
        corr.columns = marked_columns
        corr.index = marked_columns
    if clustering:
        corr, _ = cluster_correlations(corr)
    if plot:
        if ax is None:
            plt.figure(figsize=kwargs.get('figsize', None))
        sns.heatmap(
            corr,
            cmap=kwargs.get('cmap', None),
            annot=kwargs.get('annot', True),
            fmt=kwargs.get('fmt', '.2f'),
            ax=ax
        )
        if ax is None:
            plt.show()
    if return_results:
        return corr
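# cramers_v, theils_u and correlation_ratio are imported from elsewhere in
# dython; for orientation, a self-contained sketch of a bias-corrected
# Cramer's V (an approximation, not the library's exact code):
import scipy.stats as ss

def cramers_v_sketch(x, y):
    confusion = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion)[0]
    n = confusion.values.sum()
    phi2 = chi2 / n
    r, k = confusion.shape
    # Bergsma-Wicher bias correction
    phi2corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    rcorr = r - (r - 1) ** 2 / (n - 1)
    kcorr = k - (k - 1) ** 2 / (n - 1)
    return np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))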
Example 42
def run(argv):
    args = parseArgs(argv)
    # print(args.work_dir)
    # print(args.config_dir)
    global workpath
    global ntpath
    global threadsnum
    global positivepath
    global negativepath
    workpath = args.work_dir
    threadsnum = str(args.processes)
    if workpath[-1] != "/":
        workpath = workpath + "/"
    controlpath = args.config_dir
    try:  #Create Work Directory
        os.mkdir(workpath)
    except FileExistsError:
        print("File exists: " + workpath)
    ntpath = args.database_path
    '''
    Get Positive/Negative controls
    '''

    print("Work Path:" + workpath)
    print("Config file path: " + controlpath)
    with open(controlpath, "r") as f:
        l = f.readlines()
    [positive, positive_paths, negative, negative_paths, whole,
     group] = getinfo(l)
    positivepath = workpath + "positive/"
    negativepath = workpath + "negative/"
    if detectdataexist(workpath):
        print("Found generated positive/negative controls, skipping data\
 gathering process...")
    else:
        print("#############################################################")
        print("Taxon group: " + group)
        print("Positive control group:")
        for i in positive:
            print(i)
        print()
        print("Negative control group:")
        for i in negative:
            print(i)
        download_db(whole, group)  #Download database from NCBI database
        os.system("mkdir " + positivepath)
        os.system("mkdir " + negativepath)
    for i in positive_paths:
        for j in os.listdir(i):
            os.system("cp " + i + "/" + j + " " + positivepath)
    for i in negative_paths:
        for j in os.listdir(i):
            os.system("cp " + i + "/" + j + " " + negativepath)
    '''
    Generate and Print Tree
    '''
    if args.tree_help:
        printTree()

    print("#############################################################")
    print("Concatenating Negative Control...")
    randrefindex = np.random.randint(len(os.listdir(positivepath)))
    randrefdir = os.listdir(positivepath)[randrefindex]
    gatherdatapath = workpath + "gatherdata/"
    os.system("mkdir " + gatherdatapath)
    with open(gatherdatapath + "concatenatedneg.fasta", "w") as f:
        with open(positivepath + randrefdir, "r") as k:
            f.writelines(k.readlines())
        for i in os.listdir(negativepath):
            with open(negativepath + i, "r") as j:
                f.writelines(j.readlines())
    os.system("cp " + positivepath + "*" + " " + gatherdatapath)

    if "MUMs" not in os.listdir(workpath):
        print("#############################################################")
        print("running parsnp")
        print("python ./parsnp/Parsnp.py -c -r ! -d " + gatherdatapath +
              " -o " + workpath + "MUMs/ -p " + threadsnum)
        os.system("python ./parsnp/Parsnp.py -c -r " + gatherdatapath +
                  randreddir + " -d " + gatherdatapath + " -o " + workpath +
                  "MUMs/ -p " + threadsnum)
    with open(workpath + "MUMs/parsnp.xmfa", "r") as f:
        readlist = f.readlines()
    clusters = generate_read_list(readlist, 40)
    #     clusters = {}
    #     for i in range(len(nlist)):
    #         cluster = nlist[i].split(" ")[2]
    #         if cluster not in clusters:                                             #Exclude all LCBs
    #             if ('A' in rlist[i]) or ('T' in rlist[i]) or ('C' in rlist[i])\
    #                  or ('G' in rlist[i]) or ('-' in rlist[i]):
    #                 continue
    #             clusters.update({cluster: rlist[i]})
    os.system("mkdir " + workpath + "finalMUMs/")
    for i in clusters:
        with open(workpath + "finalMUMs/" + i + ".fasta", "w") as f:
            f.write(">" + i + "\n")
            f.write(clusters[i])
    clusterl = os.listdir(workpath + "finalMUMs/")
    if args.MUMS_only:
        print('-m: not entering into Blastn step, exiting...')
        exit()
    print("Running Blastn agianst nt database")
    os.system("mkdir " + workpath + "blastresult/")
    for i in pbar(clusterl):
        if ntpath is None:
            os.system(
                "blastn -max_target_seqs 2000 -db nt -query " + workpath +
                "finalMUMs/" + i + " -out " + workpath + "blastresult/" + i +
                ".out -outfmt '6 qseqid sseqid pident evalue stitle' -num_threads "
                + threadsnum + " -remote")
        else:
            os.system(
                "blastn -max_target_seqs 2000 -db " + ntpath + " -query " +
                workpath + "finalMUMs/" + i + " -out " + workpath +
                "blastresult/" + i +
                ".out -outfmt '6 qseqid sseqid pident evalue stitle' -num_threads "
                + threadsnum)
    # os.system("mv *.out " + workpath + "blastresult/")
    os.system('find ' + workpath + 'blastresult/ -name "*" -type f -size 0c'
              ' | xargs -n 1 rm -f')  # Remove all damaged (empty) blast results
    blastresultfile = os.listdir(workpath + 'blastresult/')
    [percent2strain, strain] = percenttostrain(workpath + 'blastresult/',
                                               blastresultfile, clusters)
    positivelist = []
    others = []
    for i in positive:
        for j in strain:
            if set(j.split(" ")) > set(i.split(" ")):
                positivelist.append(j)
            else:
                others.append(j)
    whole = positivelist + others
    print("generating whole result...")
    res = pd.DataFrame(columns=["MUM"] + whole)
    for i in percent2strain:
        res = pd.concat([res, pd.DataFrame([i])], ignore_index=True)
    res = res.fillna(0)
    res.set_index(["MUM"], inplace=True)
    print("generating whole complete genome result...")
    rescg = res
    for i in strain:
        keys = i.split(" ")
        if keys[-1] == "genome" and keys[-2] == "complete":
            continue
        rescg = rescg.drop(i, axis=1)
    with open(workpath + "all_strains.csv", "w") as f:
        f.write(res.to_csv())
    with open(workpath + "complete_genomes.csv", "w") as f:
        f.write(rescg.to_csv())

    print("Generating heatmap with all over\
    95% alignment scores strains' complete genomes")
    newcg = pd.read_csv(workpath + "complete_genomes.csv").drop("MUM", axis=1)
    complete_genomes = dropless90(newcg)
    f, ax = plt.subplots(figsize=(200, 50))
    sns_plot = sns.heatmap(complete_genomes,
                           cmap=sns.color_palette("Blues", 500),
                           linewidths=0.1,
                           ax=ax)
    ax.set_title('Blast result for all MUMs')
    ax.set_xlabel('Strains')
    ax.set_ylabel('MUMs')
    sns_plot.figure.savefig(workpath + "completegenomes.png")
scaled_date_score.columns = [ s + '_s' for s in ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']]
data = pd.concat([data, scaled_date_score], axis=1)

data_algorithm = data.drop(['match', 'iid', "id","idg", "condtn", "wave", "round", "position", "partner", "pid", "career_c", "sports", "tvsports", 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing','reading', 'tv', 'theater', 'movies','concerts', 'music', 'shopping', 'yoga'], axis=1)
data1 = data_algorithm.drop(list(data_algorithm.filter(regex="field")), axis=1)
data1 = data1.drop(list(data1.filter(regex="goal")), axis=1)
data1 = data1.drop(list(data1.filter(regex="_o")), axis=1)
data1 = data1.drop(list(data1.filter(regex="race")), axis=1)

corr = data1.corr()
corr_dec = corr['dec'].sort_values(ascending=False)

plt.subplots(figsize=(20,15))
ax = plt.axes()
ax.set_title("Correlation Heatmap")
sns.heatmap(corr,xticklabels=corr.columns.values,yticklabels=corr.columns.values)
#%% Check to see what the different genders value most on paper
from Plots import PlotBarSeries
male_rows = data[data['gender'] == 1]
female_rows = data[data["gender"] == 0]
male_avg = male_rows.mean()
female_avg = female_rows.mean()

self_look_for_before_average_male = male_avg[['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']]
self_look_for_before_average_female = female_avg[['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']]
dataframe = pd.concat([self_look_for_before_average_male, self_look_for_before_average_female],axis=1).T
dataframe.index = ["male", "female"]
PlotBarSeries(dataframe, "Mean value","Attribute value mean by gender (round 1_1)", [0,30])


#%% Mean values by attribute for dec = 1
Example no. 44
# In[61]:

#Let's break it down and reevaluate for closing prices
returns_fig = sns.PairGrid(closing_df)
# Let's have a scatter plot on the upper triangle
returns_fig.map_upper(plt.scatter, color='purple')
# Let's call a KDE plot on the lower triangle
returns_fig.map_lower(sns.kdeplot, cmap='cool_d')
# On the diagonal, let's call the histogram
returns_fig.map_diag(plt.hist, bins=30)

# In[62]:

# To check the actual correlation values, we draw the heatmap with the correlation function
sns.heatmap(tech_rets.dropna().corr(), annot=True)

# In[63]:

# Let's check the closing data
sns.heatmap(closing_df.corr(), annot=True)

# In[64]:

# Let's use a cleaned version of the tech_rets DataFrame
rets = tech_rets.dropna()

# In[65]:

# Let's start by defining a new DataFrame as a cleaned version of the original tech_rets DataFrame
area = np.pi * 20
def show_heat_map(dataset):
    # Fit a 2-component PCA on the feature matrix (assumes a scikit-learn
    # style Bunch with 'data' and 'feature_names' keys)
    pca = PCA(n_components=2)
    pca.fit(dataset['data'])
    # Rows are the two components, columns are the original feature names
    comps = pd.DataFrame(pca.components_, columns=dataset['feature_names'])
    print(comps)
    sb.heatmap(comps, annot=False, linewidths=.5)
    plt.show()
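# A usage sketch (an assumption, not from the original): show_heat_map expects
# a scikit-learn-style Bunch with 'data' and 'feature_names' keys.
from sklearn.datasets import load_iris
show_heat_map(load_iris())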
def plot_cat(attr, labels=None):
    if (attr == 'JobRole'):
        sns.factorplot(data=df, kind='count', size=5, aspect=3, x=attr)
        return

    sns.factorplot(data=df, kind='count', size=5, aspect=1.5, x=attr)


#plot_cat('Attrition')
# correlation matrix
cor_mat = df.corr()
mask = np.zeros_like(cor_mat, dtype=bool)
mask[np.triu_indices_from(mask, k=1)] = True
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)
#df.drop(['BusinessTravel','DailyRate','EmployeeCount','EmployeeNumber','HourlyRate','MonthlyRate','NumCompaniesWorked','Over18','StandardHours', 'StockOptionLevel','TrainingTimesLastYear',],axis=1,inplace=True)

df.drop([
    'DailyRate', 'EmployeeCount', 'EmployeeNumber', 'HourlyRate',
    'MonthlyRate', 'Over18', 'StandardHours', 'StockOptionLevel',
    'TrainingTimesLastYear', 'EducationField', 'EnvironmentSatisfaction',
    'JobInvolvement', 'JobRole', 'JobSatisfaction', 'PercentSalaryHike',
    'PerformanceRating', 'RelationshipSatisfaction', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager', 'WorkLifeBalance'
],
        axis=1,
        inplace=True)

le = LabelEncoder()
axes[0, 1].set(xlabel='atemp', title='Apparent temperature distribution')
axes[1, 0].set(xlabel='humidity', title='Humidity distribution')
axes[1, 1].set(xlabel='windspeed', title='Wind speed distribution')
plt.savefig('distribution_after_correction.png')

# Compute the correlation coefficients
correlation = train.corr()
influence_order = correlation['count'].sort_values(ascending=False)
influence_order_abs = abs(correlation['count']).sort_values(ascending=False)
print(influence_order)
print(influence_order_abs)

# Draw a heatmap of the correlation analysis
f, ax = plt.subplots(figsize=(16, 16))
cmap = sn.cubehelix_palette(light=1, as_cmap=True)
sn.heatmap(correlation, center=1, annot=True, cmap=cmap, linewidths=2, ax=ax)
plt.savefig('correlation_analysis.png')
plt.show()

# The influence of each feature on rental counts
# Influence of time on rental counts
# (1) Time dimension: year
sn.boxplot(train['year'], train['count'])
plt.title("The influence of year")
plt.savefig('influence_of_year_on_rentals.png')
plt.show()
# (2) Time dimension: month
sn.pointplot(train['month'], train['count'])
plt.title("The influence of month")
plt.savefig('influence_of_month_on_rentals.png')
plt.show()
# **The above graph shows that the Indian restaurants have received more positive reviews** 

# In[48]:


# calculating the mean values of the numerical columns, grouped by the review_stars category
stars = df.groupby('review_stars').mean()
stars


# In[49]:


# Visualising the correlation between the columns of the stars dataframe
sns.heatmap(stars.corr(),cmap='coolwarm',annot=True)


# # 5.2.3 Sentiment Detection

# In[37]:


# Classifying the dataset and splitting it into the reviews and stars.
# Here, we will classify the dataset into 3 rating levels: 1 = "Negative", 3 = "Average", and 5 = "Positive".
data_class = df[(df.review_stars==1) | (df.review_stars==3) | (df.review_stars==5)]


# In[38]:

Example no. 49
def main():
    activities=['Analysis and Statistics of Chat','Visualization of chat']
    choice=st.sidebar.selectbox("Select Activity",activities)
    if choice=='Analysis and Statistics of Chat':
        st.header("ANALYSIS OF CHAT")
        data=st.file_uploader("Upload a file",type=['txt','csv'])
        if data is None:
            steps()
        else:
            df=process_data(data)
            ##Show processed data
            st.header("Processed Data")
            st.write(df)

            #media msg info
            st.header("Who Share more media msg ?")
            st.subheader(' media msgs info')
            mediamsg=df[df['Text']==' media shared']
            st.write(mediamsg)
            no_mediamsg=mediamsg['Name'].value_counts()
            st.write(no_mediamsg)
            st.bar_chart(no_mediamsg)

            ##word and letter count
            st.header("Words and Letters used by Each person")
            df['Letter_Count']=df['Text'].apply(char_counter)
            df['Word_Count']=df['Text'].apply(word_counter)
            st.write(df[['Text','Letter_Count','Word_Count']])

            ##most active user
            st.header("most active user :"******"Number of messages send by each user")
            st.write(df[['Name','Text']].groupby('Name').count())

            #Words used by each person
            st.header("Words used by each person: ")
            name_value_count=df['Name'].value_counts().head(4)
            st.write(name_value_count)

            ##chat go time
            #st.header("how long did the chat go?")

            ##delete file
            # st.header("Delete File")
            # if st.button("Delete uploaded file"):
            #     # os.remove("")
            #     st.write("File Removed")




    else:
        st.header("CHAT VISUALIZATION")
        data=st.file_uploader("Upload a file",type=['txt','csv'])
        if data is None:
            steps()
        else:
            df=process_data(data)
            df['Letter_Count']=df['Text'].apply(char_counter)
            df['Word_Count']=df['Text'].apply(word_counter)
            all_columns_name=df.columns.tolist()

            #Seaborn plot
            if st.checkbox("Correlation plot[Seaborn]"):
                st.write(sns.heatmap(df.corr(),annot=True))
                st.pyplot()

            #Count plot
            if st.checkbox("Plot of Value counts"):
                st.text("Value count by targets")
                all_columns_name=df.columns.tolist()
                primary_col=st.selectbox("Primary Column To GroupBy",all_columns_name)
                selected_columns_names=st.multiselect("Select Columns",all_columns_name)
                if st.button("Plot"):
                    st.text("Generate a plot")
                    if selected_columns_names:
                        vc_plot=df.groupby(primary_col)[selected_columns_names].count()
                    else:
                        vc_plot=df.iloc[:,-1].value_counts()
                    st.write(vc_plot.plot(kind="bar"))
                    st.pyplot()

            
            #pie chart
            if st.checkbox("Pie plot"):
                all_columns_name=df.columns.tolist()
                if st.button("Generate pie plot"):
                    st.write(df.iloc[:,-1].value_counts().plot.pie(autopct="%1.1f%%"))
                    st.pyplot()

            #customizable plot
            type_of_plot=st.selectbox("Select type of plot",["area","bar","line","hist","box","kde"])
            selected_columns_names=st.multiselect("Select Columns To Plot",all_columns_name)
            if st.button("Generate a Plot"):
                st.success("Generating a plot of {} for {}".format(type_of_plot,selected_columns_names))
                if type_of_plot=='area':
                    cust_data=df[selected_columns_names]
                    st.area_chart(cust_data)

                if type_of_plot=='bar':
                    cust_data=df[selected_columns_names]
                    st.bar_chart(cust_data)

                if type_of_plot=='line':
                    cust_data=df[selected_columns_names]
                    st.line_chart(cust_data)

                # matplotlib-based plots for the remaining kinds
                if type_of_plot in ('hist', 'box', 'kde'):
                    cust_plot = df[selected_columns_names].plot(kind=type_of_plot)
                    st.pyplot()

            if st.button("It's Completed"):
                st.balloons()
def main():
    input_imgs = 'data/fs-20sbj-output/in_bin_img/'
    output_image_file = 'figures/brain_segmentation_mni.png'
    ref_image_file = 'data/fs-20sbj-output/mni_reference.nii'

    # Load images using nibabel and sum the binary masks voxel-wise
    img_sum = None
    for img in os.listdir(input_imgs):
        if os.path.splitext(os.path.basename(img))[1] in ['.nii', '.gz']:
            img1 = nibabel.load(os.path.join(input_imgs, img))
            data1 = img1.get_data()
            if img_sum is None:
                img_sum = data1
                continue
            img_sum = img_sum + data1
    im_ref = nibabel.load(ref_image_file)
    im_data_ref = im_ref.get_data()

    # Check that both images have the same dimensions
    # shape1 = im1.header.get_data_shape()
    # shape2 = im_ref.header.get_data_shape()
    hor_view = img_sum[129, :, :]
    hor_view_ref = im_data_ref[98, :, :]
    ver_view = img_sum[:, 155, :]
    ver_view_ref = im_data_ref[:, 116, :]
    axi_view = img_sum[:, :, 130]
    axi_view_ref = im_data_ref[:, :, 94]

    # Heatmap plots
    startcolor = '#990033'
    midcolor = '#ffff00'
    endcolor = '#FFFFFF'
    own_cmap1 = mpl.colors.LinearSegmentedColormap.from_list(
        'own2', [startcolor, midcolor, endcolor])
    fig = plt.figure(figsize=(25, 10), facecolor='white')
    ax1 = fig.add_subplot(131)
    ax2 = fig.add_subplot(132)
    ax3 = fig.add_subplot(133)
    plt.subplots_adjust(hspace=0.05, wspace=0.005)
    cbar_ax = fig.add_axes([.91, .18, .03, .65])
    cbar_ax.tick_params(labelsize=28, color='black', labelcolor='black')
    cbar_ax.yaxis.label.set_size(32)
    cbar_ax.yaxis.label.set_color('black')

    own_cmap1.set_under("0.5", alpha=0)
    hmax = sns.heatmap(np.rot90(ver_view),
                       cbar_ax=cbar_ax,
                       cmap=own_cmap1,
                       xticklabels='',
                       yticklabels='',
                       cbar_kws={'label': 'Number of subjects'},
                       ax=ax3,
                       vmin=1,
                       vmax=7)
    hmax.imshow(np.rot90(ver_view_ref),
                cmap='gray',
                aspect='equal',
                extent=hmax.get_xlim() + hmax.get_ylim())

    hmax2 = sns.heatmap(np.rot90(hor_view),
                        cmap=own_cmap1,
                        xticklabels='',
                        yticklabels='',
                        cbar=False,
                        ax=ax2,
                        vmin=1,
                        vmax=8)
    hmax2.imshow(np.rot90(hor_view_ref),
                 cmap='gray',
                 aspect='equal',
                 extent=hmax2.get_xlim() + hmax2.get_ylim())

    hmax3 = sns.heatmap(np.rot90(axi_view),
                        cmap=own_cmap1,
                        xticklabels='',
                        yticklabels='',
                        cbar=False,
                        ax=ax1,
                        vmin=1,
                        vmax=8)
    hmax3.imshow(np.rot90(axi_view_ref),
                 cmap='gray',
                 aspect='equal',
                 extent=hmax3.get_xlim() + hmax3.get_ylim())
    plt.rcParams['axes.facecolor'] = 'black'
    plt.savefig(output_image_file,
                facecolor=fig.get_facecolor(),
                bbox_inches='tight')
Example no. 51
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

weather = pd.read_csv('Weather.csv')
weather.head()
# patientInfo is assumed to have been loaded earlier in the notebook, e.g.
# patientInfo = pd.read_csv('PatientInfo.csv')
patientInfo.head()
plt.figure(figsize = (25,4))   
sns.countplot(patientInfo['province'])    #No. of cases in state of South Korea
sns.countplot(patientInfo['sex'])    #No. of male and female affected in south Korea
sns.countplot(patientInfo['sex'],hue = patientInfo['age'])     #Male and female based on ages
for i in range(3326):
    patientInfo['age'][i] = str(patientInfo['age'][i])[:2]
for i in range(3326):
    if patientInfo['age'][i][1] == 's':
        patientInfo['age'][i] = str(patientInfo['age'][i])[:1]
sns.heatmap(patientInfo.isna())
patientInfo.drop(['global_num','disease','infection_case','infection_order','infected_by',
              'contact_number','symptom_onset_date','confirmed_date','released_date','deceased_date'],axis = 1,inplace = True)
patientInfo.drop(['age'],inplace = True,axis = 1)
patientInfo['avg_temp'] = None
for i in range(3326):
    if patientInfo['sex'][i] == 'male':
        patientInfo['sex'][i] = 1
    else:
        patientInfo['sex'][i] = 0
for i in range(3326):
    for j in range(25135):
        if patientInfo['province'][i] == weather['province'][j]:
            patientInfo['avg_temp'][i] = weather['avg_temp'][j]
patientInfo.drop(['country','province','city','state'],axis = 1,inplace = True)
patientInfo.dropna(inplace = True)
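# (Not in the original) Vectorized equivalents of the row-by-row loops above,
# avoiding chained assignment and the O(n*m) scan over the weather table;
# drop_duplicates keeps one temperature per province:
#
#     patientInfo['age'] = patientInfo['age'].astype(str).str.rstrip('s')
#     patientInfo['sex'] = (patientInfo['sex'] == 'male').astype(int)
#     province_temp = weather.drop_duplicates('province')[['province', 'avg_temp']]
#     patientInfo = patientInfo.merge(province_temp, on='province', how='left')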
Example no. 52
data.region = le.transform(data.region)

# A few words about encoding "region": in general, high-cardinality categorical variables are best encoded with OneHotEncoder and the like. In this case, though, there is no special order in which the regions would be listed, so nothing much would change, and I was lazy and used only the LabelEncoder. The one-hot alternative is sketched below.
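
# In[ ]:


# For illustration (not in the original): the one-hot alternative via pandas,
# assuming `data` from the cells above still holds the 'region' column.
data_onehot = pd.get_dummies(data, columns=['region'], drop_first=True)
data_onehot.filter(regex='^region').head()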

# In[ ]:


data.corr()['charges'].sort_values()

# In[ ]:



f, ax = pl.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(240, 10, as_cmap=True),
            square=True, ax=ax)

# A strong correlation is observed only with the patient's smoking status. To be honest, I expected a higher correlation with bmi. Well, let's investigate smoking in more detail.

# First, let's look at the distribution of charges. This will help us to know how much patients spend on treatment on average.
# We're importing another useful library that we'll need a few more times. Although it's not necessary, why not :D

# In[ ]:


from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()
import scipy.special
from bokeh.layouts import gridplot
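
# A minimal sketch (not in the original) of the charges-distribution histogram
# described above, assuming the `data` DataFrame from the earlier cells:
import numpy as np
hist, edges = np.histogram(data['charges'], bins=50)
p = figure(title='Distribution of charges')
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color='navy', line_color='white', alpha=0.6)
show(p)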
Example no. 53
print(unique_country)

###Let's check for country
alpha = 1.0
plt.figure(figsize=(10, 25))
sns.countplot(y='country', data=dataset, alpha=alpha)
plt.title('Data by country')
plt.show()

# Between Genders Male vs Female
plt.figure(figsize=(7, 7))
sex = sns.countplot(x='sex', data=dataset)

# Correlation between the data
plt.figure(figsize=(16, 7))
cor = sns.heatmap(dataset.corr(), annot=True)

g = sns.jointplot(dataset.year,
                  dataset.suicides_no,
                  kind="kde",
                  color="#bfa9e0",
                  size=7)
plt.savefig('graph.png')

# Visualizing which age of people Suicide the most
plt.figure(figsize=(16, 7))
bar_age = sns.barplot(x='sex', y='suicides_no', hue='age', data=dataset)

# Visualizing which Generation of people Suicide the most
plt.figure(figsize=(16, 7))
bar_gen = sns.barplot(x='sex', y='suicides_no', hue='generation', data=dataset)
#%%
# a sorted dataframe to get highest PNLs in the first rows
idx = pd.unique([i[1] for i in np.flip(t.values)])
col = pd.unique([i[0] for i in np.flip(t.values)])
sorted_df = data[col].loc[idx]

#%% [markdown]
# ### Taking the best pairs, plot the PNL
# A heatmap helps to visualize the results

#%%
# plot a heat map with top 30 PNLs
# light colors indicate high values, while dark colors indicate low values
m = np.array(sorted_df)
figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')
ax = sns.heatmap(m, linewidth=0.01, cmap="RdYlGn")
ax.set_xticklabels(col, rotation=90)
ax.set_yticklabels(idx, rotation = 45)
plt.show()

#%% [markdown]
# # Testing for different window sizes
#%% [markdown]
# ### Previously, the backtest used only one window size. It's also interesting to see how the strategy would work with different window sizes

#%%
# recompute the PNL matrix for the combinations found above, for window sizes from 1 to the number of days specified
number_of_days = 30
top_n = 20
t_10 = so_st.tail(n=top_n).index
empty = np.zeros([top_n, number_of_days])
Example no. 55
# Flatten
axes = axes.flatten()

# Loop
for i, f in enumerate(FEATURES):

    # Show
    print("\n\n%s:" % f)
    print(b[f].T)

    # Draw a heatmap with the numeric values in each cell
    sns.heatmap(b[f].T,
                annot=True,
                fmt='.0f',
                annot_kws={'fontsize': 8},
                linewidths=.5,
                ax=axes[i],
                cmap=sns.cm.rocket_r)

    # Configure axes
    axes[i].set(xlabel="", title='<%s>' % f)

# Adjust
plt.tight_layout()

##########################################################################
# Let's plot the ``normalized`` count of samples for each individual dataset
# and the corresponding day
#
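# A sketch of that plot (an assumption; the DataFrame ``df`` with 'dataset'
# and 'day' columns is not shown in this excerpt): normalize the counts within
# each dataset and draw them as a heatmap.
counts = df.groupby(['dataset', 'day']).size().unstack(fill_value=0)
normalized = counts.div(counts.sum(axis=1), axis=0)
sns.heatmap(normalized, annot=True, fmt='.2f', linewidths=.5,
            cmap=sns.cm.rocket_r)
plt.tight_layout()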
Example no. 56
Outliers_to_drop = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])
train.loc[Outliers_to_drop]  # Show the outliers rows
train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
dataset = dataset.fillna(np.nan)
dataset.isnull().sum()
train.info()
train.isnull().sum()
train.head()
train.dtypes
train.describe()
g = sns.heatmap(
    train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
)
g = sns.factorplot(x="SibSp",
                   y="Survived",
                   data=train,
                   kind="bar",
                   size=6,
                   palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")
g = sns.factorplot(x="Parch",
                   y="Survived",
                   data=train,
                   kind="bar",
                   size=6,
                   palette="muted")
def divergence(file,docs,ref_string):

    documents_list = []
    document_ids = []
    
    for d in range(len(docs)-1):
        document_ids.append(d)
        string = docs[d]
#         print("Doc :",d)
#         print(string)
        tokens = regex.tokenize(string)
        doc = []
        for j in tokens:
            if j.lower() not in stop:
                lem = lemmatizer.lemmatize(j.lower())
                doc.append(lem)
        documents_list.append(doc)
        
    string = ref_string
    tokens = regex.tokenize(string)
    doc = []
    
    for j in tokens:
        if j.lower() not in stop:
            lem = lemmatizer.lemmatize(j.lower())
            doc.append(lem)
            
    documents_list.append(doc)
    common_dictionary = Dictionary(documents_list)
    common_corpus = [common_dictionary.doc2bow(text) for text in documents_list]
    n,a,e = parameter_tuning(common_corpus,common_dictionary,documents_list)
    final_lda = gensim.models.LdaMulticore(common_corpus,id2word = common_dictionary,alpha = a,eta = e, workers=5,num_topics=4,chunksize=100,passes=10,random_state=100)
    
#     ref_vector = final_lda[common_corpus[len(common_corpus)-1]]
    
#     for i in range(len(common_corpus)):
#         print(final_lda[common_corpus[i]])
    
    print("Topic is :",file)
    print()
    
    print("topic distribution by words :")
    topic_words_dist = final_lda.show_topics(num_words=10, log=False, formatted=True)
    for i in range(len(topic_words_dist)):
        print(topic_words_dist[i])
    
    print()
    
    lda_array = np.full((len(common_corpus),n),0.001)
    for i in range(lda_array.shape[0]):
        vector = final_lda[common_corpus[i]]
        for j in vector:
            col = j[0]
            lda_array[i,col] = j[1]
            
    print("topic array :")
    for i in range(lda_array.shape[0]):
        if i!=lda_array.shape[0]-1:
            print(document_ids[i],":",lda_array[i:i+1,:])
        else:
            print("Reference summary :",lda_array[i:i+1,:])
    print()
#     print(np.sum(lda_array[10:11,:]))
    
    relevance = []
    for i in range(0,lda_array.shape[0]-1):
        document = lda_array[i:i+1,:]
        reference = lda_array[lda_array.shape[0]-1:lda_array.shape[0],:]
        cur_rel = find_rel(reference,document)
        relevance.append(cur_rel)
        
    # Redundancy of the reference summary: sum_i p_i * log2(p_i)
    redundancy = 0
    ref_vector = lda_array[lda_array.shape[0]-1:lda_array.shape[0],:]
    for i in range(ref_vector.shape[1]):
        redundancy = redundancy + (ref_vector[0,i]*math.log2(ref_vector[0,i]))
        
    intra_topic_r = np.zeros((lda_array.shape[0]-1,lda_array.shape[0]-1))
    r,c = intra_topic_r.shape
    for i in range(r):
        for j in range(c):
            if i==j:
                intra_topic_r[i,j] = np.inf
            else:
                doc_1 = lda_array[i:i+1,:]
                doc_2 = lda_array[j:j+1,:]
                intra_topic_r[i,j] = find_rel(doc_1,doc_2)
             
    
    redundancy_vector = []
    for i in range(0,lda_array.shape[0]-1):
        red = 0 
        d_vector = lda_array[i:i+1,:]
        for j in range(d_vector.shape[1]):
            red = red + (d_vector[0,j]*math.log2(d_vector[0,j]))
        redundancy_vector.append(red)
        
    intra_topic_d = np.zeros((lda_array.shape[0]-1,lda_array.shape[0]-1))
    r,c = intra_topic_d.shape
    for i in range(r):
        for j in range(c):
            if i==j:
                intra_topic_d[i,j] = np.inf
            else:
                intra_topic_d[i,j] = -(intra_topic_r[i,j] - redundancy_vector[i])
        
    mx = maximum(intra_topic_r)
    mn = minimum(intra_topic_r)
    normalized_intra_topic_r = normalize_r(intra_topic_r,mn,mx)
    print("Per document relevance is :")
    perdoc_rel = expectation(normalized_intra_topic_r)
    print()
    print("Intra-topic relevance is :")
    sns.set(font_scale=1.5)
    ax = sns.heatmap(normalized_intra_topic_r,vmin=-1, vmax=0 ,cmap = "YlGnBu",annot=False,linewidth=2.5)
    plt.savefig(file[0:len(file)-4]+".svg")
    plt.show()
    print()

    mx = maximum(intra_topic_d)
    mn = minimum(intra_topic_d)
    normalized_intra_topic_d = normalize_d(intra_topic_d,mn,mx)
    print("Per document divergence is :")
    perdoc_div = expectation(normalized_intra_topic_d)
    print()
    print("Intra-topic divergence is :")
    ax = sns.heatmap(normalized_intra_topic_d,vmin=0, vmax=1 ,cmap = "YlGnBu",annot=True,linewidth=0.5)
    plt.show()
    print()
    
    print("Redundancy vector is :")
    print(redundancy_vector)
    print()
    redundancy_dataset.append(sum(redundancy_vector)/len(redundancy_vector))
    relevance_dataset.append(sum(perdoc_rel)/len(perdoc_rel))
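
# Minimal sketches (assumptions, not from the original) of the helper
# functions called above -- maximum, minimum, normalize_r, normalize_d and
# expectation are not shown in this excerpt; versions consistent with their
# call sites, treating the np.inf diagonal entries as missing, might be:
def maximum(mat):
    # largest finite entry
    return mat[np.isfinite(mat)].max()

def minimum(mat):
    # smallest finite entry
    return mat[np.isfinite(mat)].min()

def normalize_r(mat, mn, mx):
    # min-max scale finite entries into [-1, 0] (the heatmap's vmin/vmax)
    return np.where(np.isfinite(mat), (mat - mn) / (mx - mn) - 1.0, 0.0)

def normalize_d(mat, mn, mx):
    # min-max scale finite entries into [0, 1]
    return np.where(np.isfinite(mat), (mat - mn) / (mx - mn), 0.0)

def expectation(mat):
    # per-row mean over finite entries; printed and returned as a list
    vals = [row[np.isfinite(row)].mean() for row in mat]
    print(vals)
    return vals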
Example no. 58
def periodAnalysisPlot(folder, name='PeriodicityAll5.npz'):
    npzfile = np.load(os.path.join(folder, name))
    period_array = npzfile['arr_0']
    QD_array = npzfile['arr_1']
    width_array = npzfile['arr_2']
    count_array = npzfile['arr_3']
    period_list = period_array.tolist()
    period_list = [0 if x is None else x for x in period_list]
    period_array = np.array(period_list)
    #print(period_array.tolist())
    #print(width_array[np.where(period_array==0)], QD_array[np.where(period_array==0)], count_array[np.where(period_array==0)])

    fig, ax = plt.subplots(1, 1)
    total = period_array.size
    bin_number = np.ceil(np.sqrt(total)) // 2 * 2 + 1
    bins = np.linspace(-.5, 15, num=17)
    print(period_array.tolist())
    ax.hist(period_array,
            log=True,
            density=True,
            bins=bins,
            alpha=1,
            label='Period of Memory',
            rwidth=0.8)
    ax.set_xlabel('Period')
    ax.set_ylabel('Probability of occurring')

    fig, ax = plt.subplots(1, 1)
    total = period_array.size
    bin_number = np.ceil(np.sqrt(total)) // 2 * 2 + 1
    bins = np.linspace(-0.5, 15, num=7)
    print(period_array.tolist())
    ax.hist(period_array,
            log=True,
            bins=bins,
            alpha=1,
            label='Period of Memory',
            rwidth=0.8)
    ax.set_xlabel('Period')
    ax.set_ylabel('Number of occurrences')

    data_dict = {
        'version': count_array,
        'QD': QD_array * 100,
        'Width': width_array * 1e-9,
        'Count': period_array
    }

    data_df = pd.DataFrame(data_dict)
    grouped = data_df.groupby('version')

    data = data_df.groupby(by=['Width', 'QD']).mean()
    piv = pd.pivot_table(data,
                         values='Count',
                         index=['QD'],
                         columns=['Width'],
                         fill_value=0)

    plt.figure()
    yticks = piv.index.values.round(2).tolist()[::4]
    ax = sns.heatmap(piv,
                     vmin=0,
                     vmax=6,
                     square=True,
                     yticklabels=yticks[::1],
                     cbar_kws={'label': 'Average period of lattice'},
                     cmap='Blues',
                     xticklabels=4)
    ax.set_yticks(np.array(yticks) * ax.get_ylim()[1] / 10.)
    ax.invert_yaxis()
    plt.tight_layout()
    ax.set_ylabel('Quenched Disorder (%)')
    ax.set_xlabel('Width (nm)')
    plt.setp(ax.get_xticklabels(), rotation=90, horizontalalignment='right')
    plt.setp(ax.get_yticklabels(), rotation=0, verticalalignment='top')

    for vers, group in grouped:
        print(vers)
        data = group.groupby(by=['Width', 'QD']).mean()
        piv = pd.pivot_table(data,
                             values='Count',
                             index=['QD'],
                             columns=['Width'],
                             fill_value=0)
        plt.figure()
        print(piv.index.values.round(2))
        print(list(piv.index.values.round(2))[::4])
        print(piv.index.values.round(2).tolist()[::4])
        yticks = piv.index.values.round(2).tolist()[::4]
        ax = sns.heatmap(piv,
                         vmin=0,
                         vmax=6,
                         square=True,
                         yticklabels=yticks[::1],
                         cbar_kws={'label': 'Average period of lattice'},
                         cmap='Blues',
                         xticklabels=4)
        ax.set_yticks(np.array(yticks) * ax.get_ylim()[1] / 10.)
        ax.invert_yaxis()
        plt.tight_layout()
        ax.set_ylabel('Quenched Disorder (%)')
        ax.set_xlabel('Width (nm)')
        plt.setp(ax.get_xticklabels(),
                 rotation=90,
                 horizontalalignment='right')
        plt.setp(ax.get_yticklabels(), rotation=0, verticalalignment='top')
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

#get data
dataset = pd.read_csv('Social_Network_Ads.csv')

#checking out data
dataset.head()

#data info
dataset.info()

#checking for null values
sns.heatmap(dataset.isnull())

# dropping User ID
dataset.drop(['User ID'], axis=1, inplace=True)

#getting dummies for gender
gender=pd.get_dummies(dataset['Gender'],drop_first=True)

# concatenating the gender dataframe with the dataset
dataset=pd.concat([dataset,gender],axis=1)

#drop Gender
dataset.drop(['Gender'],inplace=True,axis=1)

dataset.head()
Example no. 60
print("\ninfo :")
print('#############################################################')
ad_data.info()
print('#############################################################')

sns.pairplot(data=ad_data, x_vars=['TV', 'Radio', 'Newspaper'], y_vars='Sales')
plt.show()

print("\ncorrelation :")
print('#############################################################')
print(ad_data.corr())
print('#############################################################')

plt.title('Correlation b/w Data Attributes')
sns.heatmap(ad_data.corr(), annot=True)
plt.show()

#creating data
X = ad_data['TV']
y = ad_data['Sales']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.7,
                                                    random_state=100)
print("\nSample of train data :")
print('#############################################################')
print(X_train.head())
print('#############################################################')