def plot_nesting(results, thresh=.5, rotate='oblimin', title=True, dpi=300,
                 figsize=12, ext='png', plot_dir=None):
    """ Plots nesting of factor solutions

    Args:
        results: a dimensional structure results object
        thresh: the threshold to pass to EFA.get_nesting_matrix
        rotate: the factor rotation to pass to EFA.get_nesting_matrix
        dpi: the final dpi for the image
        figsize: scalar - the width and height of the (square) image
        ext: the extension for the saved figure
        plot_dir: the directory to save the figure. If none, do not save
    """
    EFA = results.EFA
    explained_scores, sum_explained = EFA.get_nesting_matrix(thresh, rotate=rotate)

    # plot lower nesting
    fig, ax = plt.subplots(1, 1, figsize=(figsize, figsize))
    cbar_ax = fig.add_axes([.905, .3, .05, .3])
    sns.heatmap(sum_explained, annot=explained_scores, fmt='.2f',
                mask=(explained_scores == -1), square=True, ax=ax,
                vmin=.2, cbar_ax=cbar_ax,
                xticklabels=range(1, sum_explained.shape[1] + 1),
                yticklabels=range(1, sum_explained.shape[0] + 1))
    ax.set_xlabel('Higher Factors (Explainer)', fontsize=25)
    ax.set_ylabel('Lower Factors (Explainee)', fontsize=25)
    ax.set_title('Nesting of Lower Level Factors based on R2', fontsize=30)
    if plot_dir is not None:
        filename = 'lower_nesting_heatmap.%s' % ext
        save_figure(fig, path.join(plot_dir, filename),
                    {'bbox_inches': 'tight', 'dpi': dpi})
        plt.close()
def show_sgs_result(r, ax):
    # flatten the x, y and flux grids into a long-form frame
    N = r[0].shape[0] * r[0].shape[1]
    df = pd.DataFrame(np.c_[r[0].reshape(N), r[1].reshape(N), r[2].reshape(N)],
                      columns=['x', 'y', 'flux'])
    # pivot back to a 2-D grid (keyword form; positional pivot arguments are
    # removed in recent pandas)
    df = df.pivot(index='x', columns='y', values='flux')
    sns.heatmap(df, ax=ax, robust=True, cbar=False, cmap="coolwarm",
                xticklabels=False, yticklabels=False)
    ax.set_ylabel("y")
    ax.set_xlabel("x")
def plot_corr(file, score, stat, ind_var, brain_type):
    # seaborn style
    sns.set(style="white")
    # import the dataframe
    dt = pd.read_csv(file)
    # Compute the correlation matrix
    corr = dt.corr()

    ### Create the matrix figure with seaborn
    # Generate a mask for the upper triangle (np.bool is removed in recent NumPy)
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(len(ind_var), len(ind_var)))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, annot=False, ax=ax)
    plt.subplots_adjust(left=0.30, bottom=0.30)
    plt.savefig(os.path.join(stat, score,
                             "heatmap_" + score + "_" + stat + "_" + brain_type + ".png"))
    plt.close()
    return corr
def get_cov(data):
    dat = data.training_data_all_ways + data.testing_data_all_ways
    num_ways = len(data.get_list_of_ways())
    # map each WAY to a matrix index
    m = {}
    i = 0
    for way in data.get_list_of_ways():
        m[way] = i
        i += 1
    mat = np.zeros((num_ways, num_ways))
    for elem in dat:
        ways = elem[1]
        for way in ways:
            mat[m[way], m[way]] += 1
        for w1 in ways:
            for w2 in ways:
                if w1 == w2:
                    continue
                mat[m[w1], m[w2]] += 1
    print(mat)
    emp_cov = empirical_covariance(mat)
    print(emp_cov)
    corr = np.zeros((num_ways, num_ways))
    for i in range(num_ways):
        for j in range(num_ways):
            corr[i, j] = emp_cov[i, j] / (math.sqrt(emp_cov[i, i]) * math.sqrt(emp_cov[j, j]))
    print(corr)
    sns.heatmap(corr, vmin=-1, vmax=1, square=True,
                xticklabels=list(m.keys()), yticklabels=list(m.keys()))
    plt.title("Covariance of WAYS frequencies")
    plt.show()
def plotGraphicalCorrelationMatrix(data):
    '''
    Input  : data
    Output : graphical correlation matrix
    Inspired from : https://stanford.edu/~mwaskom/software/seaborn/examples/many_pairwise_correlations.html
    '''
    try:
        print("\nGenerating the graphical correlation matrix...\n")
        time.sleep(3)
        corr = data.corr()
        f, ax = plt.subplots(figsize=(20, 20))
        # Generate a custom diverging colormap
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the correct aspect ratio
        sns.heatmap(corr, cmap=cmap, square=True, xticklabels=False, yticklabels=False,
                    linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
        plt.title('Correlation Matrix', fontsize=30)
        ax.set_ylabel('Features', fontsize=20)
        ax.set_xlabel('Features', fontsize=20)
        xticklabels = ['video_category_id', 'viewCount', 'likeCount', 'dislikeCount',
                       'favoriteCount', 'commentCount', 'dimension', 'definition',
                       'caption', 'licensedContent']
        ylabel = xticklabels[::-1]
        ax.set_xticklabels(xticklabels, rotation=45)
        ax.set_yticklabels(ylabel, rotation=0)
        name = "../YoutubeData/correlation_matrix.pdf"
        plt.savefig(name)
        print("\nPlease close the Bar Chart when you want to move ahead...")
        plt.show()
        print("You can always retrieve the graphical correlation matrix in YoutubeData folder.\n")
        time.sleep(3)
        return True
    except:
        raise VideoAnalysisException(" Error while Generating the graphical correlation matrix")
def plot_performance(parser, args, pore_measure):
    """
    Plot the pore performance in terms of reads per pore
    """
    flowcell_layout = minion_flowcell_layout()

    pore_values = []
    for pore in flowcell_layout:
        if pore in pore_measure:
            pore_values.append(pore_measure[pore])
        else:
            pore_values.append(0)

    # make a data frame of the lists (list() is needed because range objects
    # cannot be multiplied in Python 3)
    d = {'rownum': list(range(1, 17)) * 32,
         'colnum': sorted(list(range(1, 33)) * 16),
         'tot_reads': pore_values,
         'labels': flowcell_layout}
    df = pd.DataFrame(d)

    d = df.pivot(index="rownum", columns="colnum", values="tot_reads")
    sns.heatmap(d, annot=True, fmt="d", linewidths=.5)

    if args.saveas is not None:
        plot_file = args.saveas
        plt.savefig(plot_file, figsize=(8.5, 8.5))
    else:
        plt.show()
def item_nbr_tendency_finely(store_nbr, year, month_start=-1, month_end=-1,
                             graph=True):
    '''
    input
    1. store_nbr   = store number
    2. year        = year
    3. month_start = first month to include
    4. month_end   = last month (exclusive)
    5. graph       = whether to plot the item_nbr heatmap for this selection

    output
    1. pivot table of item_nbr filtered by store_nbr, year and month
    '''
    store = df_1[(df_1['store_nbr'] == store_nbr) & (df_1['year'] == year)]
    if month_start != -1:
        if month_end == -1:
            month_end = month_start + 1
        store = store[(month_start <= store['month']) & (store['month'] < month_end)]

    pivot = store.pivot_table(index='item_nbr', columns='date',
                              values='units', aggfunc=np.sum)
    # drop items that sold nothing over the whole period, but keep zero days
    zero_index = pivot == 0
    pivot = pivot[pivot != 0].dropna(axis=0, how='all')
    pivot[zero_index] = 0

    if graph:
        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot, cmap="YlGnBu", annot=True, fmt='.0f')
        plt.show()

    return pivot
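# A short usage sketch for the function above -- hedged: it assumes the
# module-level sales frame df_1 (with store_nbr, year, month, date, item_nbr
# and units columns) is already loaded, and the store/month values below are
# purely illustrative.
pivot_2_2013 = item_nbr_tendency_finely(store_nbr=2, year=2013,
                                        month_start=3, month_end=5, graph=True)
print(pivot_2_2013.shape)  # rows = items kept after dropping all-zero rows, cols = dates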
def sample_54_2():
    """
    5.4 Visualizing data with seaborn
    :return:
    """
    change_df = pd.DataFrame({'tsla': tsla_df.p_change})
    # join usGOOG
    change_df = change_df.join(
        pd.DataFrame({'goog': ABuSymbolPd.make_kl_df('usGOOG', n_folds=2).p_change}),
        how='outer')
    # join usAAPL
    change_df = change_df.join(
        pd.DataFrame({'aapl': ABuSymbolPd.make_kl_df('usAAPL', n_folds=2).p_change}),
        how='outer')
    # join usFB
    change_df = change_df.join(
        pd.DataFrame({'fb': ABuSymbolPd.make_kl_df('usFB', n_folds=2).p_change}),
        how='outer')
    # join usBIDU
    change_df = change_df.join(
        pd.DataFrame({'bidu': ABuSymbolPd.make_kl_df('usBIDU', n_folds=2).p_change}),
        how='outer')
    change_df = change_df.dropna()
    # as shown in Table 5-2
    print('change_df.head():\n', change_df.head())

    # use corr() to compute the pairwise correlation of the daily changes
    corr = change_df.corr()
    _, ax = plt.subplots(figsize=(8, 5))
    # heatmap of the correlation between each pair of stocks' daily changes
    sns.heatmap(corr, ax=ax)
    plt.show()
def plot_heatmap(df):
    df2 = df[['bonus', 'deferred_income', 'exercised_stock_options', 'expenses',
              'long_term_incentive', 'other', 'restricted_stock', 'salary',
              'total_payments', 'total_stock_value', 'from_messages',
              'from_poi_to_this_person', 'from_this_person_to_poi',
              'shared_receipt_with_poi', 'to_messages', 'perc_from_poi',
              'perc_to_poi']]
    colormap = plt.cm.viridis
    plt.figure(figsize=(12, 12))
    plt.title("Pearson's Correlation of Features", y=1.05, size=15)
    sns.heatmap(df2.astype(float).corr(), linewidths=0.1, vmax=1.0,
                square=True, cmap=colormap, linecolor='white', annot=True)
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    fig.savefig('PearsonCorrelationOfFeatures.png', dpi=100)
    # plt.savefig('PearsonCorrelationOfFeatures.png')
    plt.show()
def plot_EFA_retest(combined, size=4.6, dpi=300, ext='png', plot_dir=None): corr = combined.corr() max_val = abs(corr).max().max() fig = plt.figure(figsize=(size,size)); ax = fig.add_axes([.1, .1, .8, .8]) cbar_ax = fig.add_axes([.92, .15, .04, .7]) sns.heatmap(corr, square=True, ax=ax, cbar_ax=cbar_ax, vmin=-1, vmax=1, cmap=sns.diverging_palette(220,15,n=100,as_cmap=True), cbar_kws={'orientation': 'vertical', 'ticks': [-1, 0, 1]}); ax.set_xticklabels(ax.get_xticklabels(), rotation=90) ax.set_yticklabels(ax.get_yticklabels(), rotation=0) ax.tick_params(labelsize=size/len(corr)*40) # format cbar axis cbar_ax.set_yticklabels([format_num(-max_val), 0, format_num(max_val)]) cbar_ax.tick_params(labelsize=size, length=0, pad=size/2) cbar_ax.set_ylabel('Factor Loading', rotation=-90, fontsize=size, labelpad=size/2) # set divider lines n = corr.shape[1] ax.axvline(n//2, 0, n, color='k', linewidth=size/3) ax.axhline(n//2, 0, n, color='k', linewidth=size/3) if plot_dir is not None: save_figure(fig, path.join(plot_dir, 'EFA_test_retest_heatmap.%s' % ext), {'bbox_inches': 'tight', 'dpi': dpi}) plt.close()
def plot_number_of_options_by_attempt(length): data = load_options_by_attempt() data['value'] = data['value'].apply(lambda x: x * 100) data = data[(data['attempt'] < length)] max_options = data['options'][data['options'] != 0].max() data['options'] = data['options'].apply(lambda x: max_options + 1 if x == 0 else x) cols = len(data['experiment_setup_name'].unique()) gs = gridspec.GridSpec(1, cols, width_ratios=[3.5] * (cols - 1) + [4]) rcParams['figure.figsize'] = cols * 2, int(5 * length / 50) rcParams['axes.linewidth'] = 1 for j, (setup, setup_data) in enumerate(data.groupby('experiment_setup_name')): for opt in range(2, max_options + 1): if opt not in setup_data['options'].unique(): for attempt in range(0, int(length)): setup_data = setup_data.append(pandas.DataFrame([{'attempt': attempt, 'options': opt, 'value': 0}])) plt.subplot(gs[j]) to_plot = setup_data.pivot_table(columns='options', index='attempt', values='value', dropna=False, fill_value=0) plt.title(setup) sns.heatmap(to_plot, annot=False, cbar=(j == cols - 1), linewidths=0.1, cbar_kws={'format': '%.0f%%'}) plt.xticks(plt.xticks()[0], [lab.get_text() if int(lab.get_text()) <= max_options else 'O' for lab in plt.xticks()[1]]) if j != 0: plt.gca().axes.get_yaxis().set_ticks([]) plt.ylabel('') else: pos = plt.yticks()[0] lab = plt.yticks()[1] plt.yticks([pos[0], pos[-1]], [int(lab[0].get_text()) + 1, int(lab[-1].get_text()) + 1]) output.savefig('options_by_attempt')
def _plot_monthly_returns(self, stats, ax=None, **kwargs):
    """
    Plots a heatmap of the monthly returns.
    """
    returns = stats['returns']
    if ax is None:
        ax = plt.gca()

    monthly_ret = perf.aggregate_returns(returns, 'monthly')
    monthly_ret = monthly_ret.unstack()
    monthly_ret = np.round(monthly_ret, 3)
    monthly_ret.rename(
        columns={1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'},
        inplace=True
    )

    sns.heatmap(
        monthly_ret.fillna(0) * 100.0,
        annot=True,
        fmt="0.1f",
        annot_kws={"size": 8},
        alpha=1.0,
        center=0.0,
        cbar=False,
        cmap=cm.RdYlGn,
        ax=ax, **kwargs)
    ax.set_title('Monthly Returns (%)', fontweight='bold')
    ax.set_ylabel('')
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    ax.set_xlabel('')

    return ax
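# perf.aggregate_returns is project code that is not shown here. As a hedged
# sketch of what the monthly aggregation above is assumed to produce
# (compounded daily returns per year/month, which unstack() then turns into
# the year x month grid fed to sns.heatmap):
import pandas as pd

def aggregate_monthly(returns: pd.Series) -> pd.Series:
    """Compound a daily-return Series into one return per (year, month)."""
    grouped = returns.groupby([returns.index.year, returns.index.month])
    return grouped.apply(lambda r: (1.0 + r).prod() - 1.0)

# Example: two months of constant 0.1% daily returns -> a two-entry result
# whose unstack() is a 1 x 2 (year x month) grid.
idx = pd.bdate_range('2020-01-01', '2020-02-28')
daily = pd.Series(0.001, index=idx)
print(aggregate_monthly(daily).unstack())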
def test_gp_inference_per():
    # few to many observations; fewer than 4 observations here normally
    # crashes due to bad covariance matrices
    observations_n = range(20, 50, 2)
    number_steps = 100
    ripl = shortcuts.make_lite_church_prime_ripl()
    every_n_step = 1
    kl_matrix = np.zeros((len(observations_n), number_steps // every_n_step))
    for n_i in range(len(observations_n)):
        n = observations_n[n_i]
        x = np.random.uniform(0, 30, n)
        y = f_periodic(x)  # + np.random.normal(0,0.1,n)
        ripl.clear()
        ripl.bind_foreign_sp("make_gp_part_der", gp.makeGPSP)
        ripl.assume('make_const_func', VentureFunction(covs.makeConstFunc,
                                                       [t.NumberType()], covs.constantType))
        ripl.assume('zero', "(apply_function make_const_func 0)")
        ripl.assume('make_per', VentureFunction(covs.makePeriodic,
                                                [t.NumberType(), t.NumberType(), t.NumberType()],
                                                t.AnyType("VentureFunction")))
        ripl.assume('make_noise', VentureFunction(covs.makeNoise, [t.NumberType()],
                                                  t.AnyType("VentureFunction")))
        ripl.assume("func_plus", covs.makeLiftedAdd(lambda x1, x2: x1 + x2))
        ripl.assume('sf', '(tag (quote hyper) 0 (uniform_continuous 0 100 ))')
        ripl.assume('l', '(tag (quote hyper) 1 (uniform_continuous 0 100 ))')
        ripl.assume('p', '(tag (quote hyper) 2 (uniform_continuous 0.01 100 ))')
        ripl.assume('sigma', '0.1')
        ripl.assume('per', "(apply_function make_per sf p l )")
        ripl.assume('wn', "(apply_function make_noise sigma )")
        ripl.assume('gp', """(tag (quote model) 0
                             (make_gp_part_der zero (apply_function func_plus per wn ) ) )""")
        makeObservations(x, y, ripl)
        for steps in range(number_steps):
            if (steps % every_n_step) == 0:
                xpost = np.random.uniform(33, 36, 1)[0]
                ypost = []
                for i in range(100):
                    y = ripl.sample("(gp (array " + str(xpost) + " ))")
                    ypost.append(y)
                kl_matrix[n_i][steps // every_n_step] = KL_normal(
                    np.mean(ypost), np.std(ypost), f(xpost), 0.1)
            ripl.infer("(mh (quote hyper) one 1)")

    orig_cmap = matplotlib.cm.coolwarm
    shifted_cmap = shiftedColorMap(orig_cmap, midpoint=0.3, name='shifted')
    sns.heatmap(kl_matrix, cmap=shifted_cmap, yticklabels=observations_n)
    plt.show()
    max_kl = kl_matrix.max()
    shift = 1. / max_kl
    heavily_shifted_cmap = shiftedColorMap(orig_cmap, midpoint=shift, name='shifted')
    sns.heatmap(kl_matrix, cmap=heavily_shifted_cmap, yticklabels=observations_n)
    plt.show()
def plot_confusion(classifier, threshold=0.4):
    x_train, x_test, y_train, y_test = train_test_split(df_new, y, test_size=0.2)
    y_pred = []
    # score the training split with the classifier that was passed in
    try:
        prob_score = classifier.predict_proba(x_train)
    except:
        prob_score = classifier.predict_proba(np.float_(x_train))
    a = prob_score[:, 1]
    # threshold the positive-class probability into 0/1 predictions
    for idx, item in enumerate(a):
        if item >= threshold:
            item = 1
        else:
            item = 0
        y_pred.append(item)

    # Plotting
    class_name = classifier.__repr__()
    class_name = re.sub(r'\([^)]*\)', '', class_name)
    print("")
    print("")
    print("Legends")
    print('1 - Substantiated')
    print('0 - Unfounded')
    print("")
    print("Confusion Matrix: " + class_name + " (threshold- " + str(threshold) + ")")
    # confusion_matrix expects (y_true, y_pred), so rows are the true labels
    sns.heatmap(metrics.confusion_matrix(y_train, y_pred), annot=True,
                cmap="YlGnBu", fmt="d")
    plt.xlabel('Predicted')
    plt.ylabel('True')
def main(csv_filepath): """Exploratory data analysis for the Titanic dataset.""" # Read data dtype = {'PassengerId': 'str', 'Embarked': 'category', 'Survived': 'category', 'Pclass': 'category', 'Sex': 'category', 'SibSp': 'uint8', 'Parch': 'uint8'} df = pd.read_csv(csv_filepath, dtype=dtype) describe_pandas_df(df, dtype=dtype) # Show histograms numeric_types = ['float64', 'int64', 'uint8'] numerical_features = df.select_dtypes(include=numeric_types) numerical_features.hist(figsize=(30, 16), bins=50, xlabelsize=8, ylabelsize=8) plt.savefig("titanic-histograms.png") plt.show() # Show correlations import seaborn as sns corr = numerical_features.corr() sns.heatmap(corr) plt.savefig("titanic-correlation.png") plt.show()
def plot_region_heatmap(self, clim=None): """ Plots a frequency x region heatmap of mean t-statistics. """ # mean t-stat within subject by region and frequency, then mean across subjects mean_df = self.group_df.groupby(['subject', 'regions', 'frequency']).mean().groupby(['regions', 'frequency']).mean() mean_df = mean_df.reset_index() # ignore data without a region mean_df['regions'].replace('', np.nan, inplace=True) mean_df = mean_df.dropna(subset=['regions']) # reshape it for easier plotting with seaborn mean_df = mean_df.pivot_table(index='frequency', columns='regions', values='t-stat') # center the colormap and plot if clim is None: clim = np.max(np.abs(mean_df.values)) with sns.plotting_context("talk"): sns.heatmap(mean_df, cmap='RdBu_r', yticklabels=mean_df.index.values.round(2), vmin=-clim, vmax=clim, cbar_kws={'label': 't-stat'}) plt.gca().invert_yaxis() plt.ylabel('Frequency') plt.xlabel('') plt.gcf().set_size_inches(12, 9)
def _process(self, data):
    for x in data:
        if data[x][1] not in self.data:
            # prepares the data to visualise the xcor matrix of a specific batch number
            self.data[data[x][1]] = {}
            self.data[data[x][1]]['matrix'] = numpy.identity(self.size)
            self.data[data[x][1]]['ro_count'] = 0

        self.data[data[x][1]]['matrix'][(data[x][2][1], data[x][2][0])] = data[x][0]
        # self.addToProvState('batch_'+str(data[x][1]), self.data[data[x][1]]['matrix'],
        #                     metadata={'matrix': str(self.data[data[x][1]]['matrix'])},
        #                     dep=['batch_'+str(data[x][1])], ignore_inputs=False)
        self.data[data[x][1]]['ro_count'] += 1

        if self.data[data[x][1]]['ro_count'] == (self.size * (self.size - 1)) / 2:
            matrix = self.data[data[x][1]]['matrix']
            d = pd.DataFrame(data=matrix,
                             columns=range(0, self.size),
                             index=range(0, self.size))

            # mask the upper triangle (numpy.bool is removed in recent NumPy)
            mask = numpy.zeros_like(d, dtype=bool)
            mask[numpy.triu_indices_from(mask)] = True

            # Set up the matplotlib figure
            f, ax = plt.subplots(figsize=(11, 9))

            # Generate a custom diverging colormap
            cmap = sns.diverging_palette(220, 10, as_cmap=True)

            # Draw the heatmap with the mask and correct aspect ratio
            sns.heatmap(d, mask=mask, cmap=cmap, vmax=1, square=True,
                        linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
            plt.savefig("./plots/" + str(data[x][1]) + "_plot.png")

            self.write('output', (matrix, data[x][1]),
                       metadata={'matrix': str(d), 'batch': str(data[x][1])},
                       dep=['batch_' + str(data[x][1])])
def main(): """Go Main Go""" pgconn = get_dbconn('asos') dfin = read_sql(""" with mob as ( select date_trunc('hour', valid) as ts, avg(dwpf) from alldata where station = 'MOB' and dwpf is not null GROUP by ts), cmi as ( select date_trunc('hour', valid) as ts, avg(dwpf) from alldata where station = 'CMI' and dwpf is not null GROUP by ts), agg as ( select m.ts, m.avg as dwpf, c.avg as tmpf from mob m JOIN cmi c on (m.ts = c.ts)) select extract(month from ts) as month, extract(hour from ts) as hour, sum(case when dwpf >= tmpf then 1 else 0 end) / count(*)::float * 100. as freq from agg GROUP by month, hour ORDER by month, hour """, pgconn, index_col=None) df = dfin.pivot("month", "hour", "freq") fig, ax = plt.subplots(figsize=(9, 6)) ax.set_title(("Hourly Frequency of Mobile (MOB) Dew Point\n" "greater than or equal to Champaign (CMI) Dew Point")) sns.heatmap(df, annot=True, fmt=".0f", linewidths=.5, ax=ax, vmin=5, vmax=100) print(ax.get_yticks()) ax.set_xlabel("Hour of Day (CDT or CST)") ax.set_xticklabels(["Mid", "1AM", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "Noon", "1PM", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"]) ax.set_yticklabels(calendar.month_abbr[1:]) fig.savefig('test.png')
def add_tensor_2d(box, length = 10, feature_num = 5, notification_box = None, x_label = None, y_label = None): # # Generate a large random dataset # rs = np.random.RandomState(33) # d = pd.DataFrame(data=rs.normal(size=(length, feature_num)),) # # # Generate a mask for the upper triangle # mask = np.zeros_like(d, dtype=np.bool) # mask[np.triu_indices_from(mask)] = True uniform_data = np.random.randn(feature_num, length) # Draw the heatmap with the mask and correct aspect ratio region = calculate_region(box, length, feature_num) cur_ax = plt.gcf().add_axes(region) sns.heatmap(uniform_data, xticklabels = False, yticklabels = False, square=True, linewidths=.5, ax = cur_ax, cbar = False) cur_ax.add_patch(Rectangle((0,0), length, feature_num, fill=False, color="black", linewidth=2)) if not notification_box: notification_box = [] for box in notification_box: cur_ax.add_patch(Rectangle(box["box"][:2], box["box"][2], box["box"][3], fill=True, color=box["color"], alpha=box["alpha"], linewidth=5)) return cur_ax
def heatmap_discrete_levels(data, key2color, fill_color=(1, 1, 1), **heatmap_kwargs):
    """data can be a DataFrame (with multiple value levels), or a tuple, list,
    or dict of DataFrames with True-False-like values.

    In the former case, key2color[key] is the color for data[data==key].
    In the latter case, key2color[key] is the color for data[key].

    key2color is a dict. Values must be r,g,b tuples [0,1]
    """
    filled = False
    for key, rgb in key2color.items():
        if isinstance(data, pd.DataFrame):
            vals = (data == key)
        else:
            vals = data[key]
        if filled:
            if not vals.any().any():
                # We will get an error if we try to render this.
                continue
            color = monocolor(*rgb)
            vals[~vals.astype('bool')] = np.nan
            vals = vals.astype('float16')
        else:
            color = duocolor(fill_color, rgb)
            filled = True
        kwargs = {'cbar': False}
        kwargs.update(heatmap_kwargs)
        sns.heatmap(vals, cmap=color, **kwargs)
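# A toy usage sketch for heatmap_discrete_levels above. The level names and
# colors are made up, and it relies on the module's own monocolor/duocolor
# helpers referenced by the function.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Three-level categorical grid; one overlaid heatmap layer is drawn per level.
levels = pd.DataFrame(np.random.choice(['wt', 'mut', 'del'], size=(6, 8)))
key2color = {'wt': (0.8, 0.8, 0.8),   # grey
             'mut': (0.9, 0.2, 0.2),  # red
             'del': (0.2, 0.2, 0.9)}  # blue
heatmap_discrete_levels(levels, key2color, linewidths=0.5)
plt.show()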
def corrplots(ipfile, opname):
    opdir = '/'.join(opname.split('/')[:-1])
    if not os.path.isdir(opdir):
        print("### Making Directory ####")
        os.makedirs(opdir)
    corrmat = pd.read_csv(ipfile, index_col=0)
    cols = list([c.split('/')[-1] for c in corrmat.columns])
    indices = list([i.split('/')[-1] for i in corrmat.index])
    corrmat.columns = [c.split('/')[-1] for c in corrmat.columns]
    corrmat.index = [i.split('/')[-1] for i in corrmat.index]
    # natural sort of the sample names on their second, third and first fields
    cols = natsorted(cols, key=lambda s: s.split('-')[1])
    cols = natsorted(cols, key=lambda s: s.split('-')[2])
    cols = natsorted(cols, key=lambda s: s.split('-')[0])
    corrmat = corrmat[cols]
    indices = natsorted(indices, key=lambda s: s.split('-')[1])
    indices = natsorted(indices, key=lambda s: s.split('-')[2])
    indices = natsorted(indices, key=lambda s: s.split('-')[0])
    corrmat = corrmat.reindex(cols)
    print("###### Generating Heatmap ######")
    sns.heatmap(corrmat)
    # set the title and layout before saving, otherwise they are lost
    plt.title(opname.split('/')[-1].split('.')[0])
    plt.tight_layout()
    print("###### Saving fig to " + opname + " ######")
    plt.savefig(opname)
    plt.close()
    plt.cla()
    return corrmat
def plot_attention(sentence, Tx=20, Ty=25):
    """
    Visualize the attention layer

    @param sentence: the sentence to translate, as a str
    @param Tx: length of the input sequence
    @param Ty: length of the output sequence
    """
    X = np.array(text_to_int(sentence, source_vocab_to_int))
    f = K.function(model.inputs, [model.layers[9].get_output_at(t) for t in range(Ty)])

    s0 = np.zeros((1, n_s))
    c0 = np.zeros((1, n_s))
    out0 = np.zeros((1, len(target_vocab_to_int)))

    r = f([X.reshape(-1, 20), s0, c0, out0])

    attention_map = np.zeros((Ty, Tx))
    for t in range(Ty):
        for t_prime in range(Tx):
            attention_map[t][t_prime] = r[t][0, t_prime, 0]

    Y = make_prediction(sentence)
    source_list = sentence.split()
    target_list = Y.split()

    f, ax = plt.subplots(figsize=(20, 15))
    sns.heatmap(attention_map, xticklabels=source_list, yticklabels=target_list,
                cmap="YlGnBu")
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=15, rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=15)
def plot_heatmap(self, show=True, save=False, metric='growth rate', unit=None, vmin=None, vmax=None): if self.results['row'].max() > 8 or self.results['column'].max() > 12: results_arr = np.empty((16, 24)) # assume 384-well plate results_arr.fill(np.nan) indices = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P'] columns = range(1,25) else: results_arr = np.empty((8, 12)) # assume 96-well plate results_arr.fill(np.nan) indices = ['A','B','C','D','E','F','G','H'] columns = range(1,13) results_arr[(self.results['row']-1), (self.results['column']-1)] = self.results[metric] # only show if not saved? data = pd.DataFrame(results_arr, index=indices, columns=columns) if vmin is not None: fig = sns.heatmap(data, vmin=vmin, vmax=vmax, cmap='spring_r', linewidths=0.01) else: fig = sns.heatmap(data, cmap='spring_r', linewidths=0.01) plt.yticks(rotation=0) fig.xaxis.set_ticks_position('top') if unit: plt.title(metric + ' (' + unit +')', y=1.1) else: plt.title(metric, y=1.1) if save: heat_name = self.name +'_'+ metric.replace(' ', '_') +'_heatmap.png' heat_file = os.path.join(self.out_dir, heat_name) plt.savefig(heat_file, dpi=300, bbox_inches='tight') if show: return plt.show()
def get_legal_pairs():
    """
    Plots the legal pairs of detectors for GBM observations

    Returns
    -------

    """
    dlp = np.array([[0, 274, 39, 171, 12, 29, 0, 5, 1, 6, 1, 0],
                    [258, 0, 233, 55, 4, 100, 2, 1, 1, 12, 27, 0],
                    [55, 437, 0, 2, 2, 311, 0, 1, 1, 13, 235, 0],
                    [215, 80, 3, 0, 330, 107, 4, 8, 19, 2, 1, 0],
                    [13, 4, 8, 508, 0, 269, 2, 29, 236, 0, 1, 0],
                    [44, 188, 337, 166, 279, 0, 0, 0, 0, 0, 0, 0],
                    [0, 1, 1, 2, 2, 0, 0, 238, 46, 180, 12, 33],
                    [0, 2, 0, 18, 35, 0, 222, 0, 221, 61, 3, 109],
                    [0, 0, 1, 16, 215, 0, 51, 399, 0, 4, 2, 303],
                    [3, 18, 21, 4, 0, 0, 190, 82, 1, 0, 324, 110],
                    [1, 25, 191, 0, 0, 0, 16, 6, 4, 516, 0, 293],
                    [0, 0, 0, 0, 0, 0, 32, 147, 297, 138, 263, 0]])

    sns.heatmap(dlp, annot=True, fmt='d', cmap="YlGnBu")
    plt.ylabel("NaI")
    plt.xlabel("NaI")
def summary(self, stdout=True, plot=False):
    ''' Displays diagnostics to the user

    Args:
        stdout (bool): print results to the console
        plot (bool): use Seaborn to plot results
    '''
    if stdout:
        print('Collinearity summary:')
        print(pd.concat([self.results['Eigenvalues'],
                         self.results['ConditionIndices'],
                         self.results['VIFs'],
                         self.results['CorrelationMatrix']], axis=1))
        print('Outlier summary:')
        print(self.results['RowMahalanobisDistances'])
        print(self.results['ColumnMahalanobisDistances'])
        print('Validity summary:')
        print(self.results['Variances'])
    if plot:
        for key, result in self.results.items():
            if key == 'CorrelationMatrix':
                ax = plt.axes()
                sns.heatmap(result, cmap='Blues', ax=ax)
                ax.set_title(key)
                plt.show()
            else:
                result.plot(kind='bar', title=key)
                plt.show()
def _fill(data, colors, ax):
    if len(data) == 0:
        return

    data = data.copy()

    # Subset data and colors (.loc replaces the long-deprecated .ix).
    data = data.loc[data['alteration'].isin(colors)]
    colors = {k: v for k, v in colors.items() if k in set(data['alteration'])}

    # Build number/color maps.
    num_map, colors_ord = {}, []
    for i, (type_, color) in enumerate(colors.items()):
        num_map[type_] = i
        colors_ord.append(color)

    # Map to numeric and pivot.
    data['value'] = data['alteration'].map(num_map)
    mat = _pivot(data, values='value')

    # Mask na-values.
    mask = pd.isnull(mat)

    # Draw fills using heatmap.
    cmap = discrete_cmap(colors_ord)
    sns.heatmap(mat, cmap=cmap, ax=ax, mask=mask, cbar=False, linewidths=0.5)
def show_heatmap(filename):
    """Show the confusion matrix for a partis-generated tab-delimited db."""
    true_labels, estimated_labels = get_clones_real_estimated(filename)
    cm, rows, cols = confusion_matrix(true_labels, estimated_labels)
    df = pd.DataFrame(cm, index=rows, columns=cols)
    sns.heatmap(df)
    plt.show()
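# The confusion_matrix helper used above is project code that is not shown.
# Under the assumption that it simply cross-tabulates true vs. estimated clone
# labels into a count matrix plus its row/column keys, an equivalent pandas
# sketch would be:
import pandas as pd

def crosstab_confusion(true_labels, estimated_labels):
    """Cross-tabulate two label lists into a count matrix plus row/col keys."""
    ct = pd.crosstab(pd.Series(true_labels, name='true'),
                     pd.Series(estimated_labels, name='estimated'))
    return ct.values, ct.index.tolist(), ct.columns.tolist()

# cm, rows, cols = crosstab_confusion(['a', 'a', 'b'], [0, 0, 1])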
def plot_correlation_matrix(df, title='Correlation matrix', cmap=plt.cm.coolwarm):
    plt.figure(figsize=(12, 10))
    sns.heatmap(df.corr(), annot=False, cmap=cmap)
    plt.yticks(rotation=0, fontsize=8)
    plt.xticks(rotation=90, fontsize=8)
    plt.title(title)
    plt.savefig('corr_matrix.png')
def spectrograms(D, p_local, p_global):
    if p_local['eog_in']:
        D = D[p_global['eeg_chans'], :]
    C = D.shape[0]
    T = D.shape[1]
    for c in range(C)[:3]:
        f, t, Sxx = spectrogram(D[c, :], p_global['sample_freq'])
        sns.heatmap(np.log(Sxx[::-1, :]),
                    xticklabels=t.astype(int),
                    yticklabels=f.astype(int)[::-1])

        # There is probably a better way to do this: hide all tick labels, then
        # re-show every sixth one (integer division keeps the slice step an int)
        for label in plt.gca().get_xticklabels():
            label.set_visible(False)
        for label in plt.gca().get_xticklabels()[::Sxx.shape[1] // 6]:
            label.set_visible(True)
        for label in plt.gca().get_yticklabels():
            label.set_visible(False)
        for label in plt.gca().get_yticklabels()[::Sxx.shape[0] // 6]:
            label.set_visible(True)

        cbar = plt.gca().collections[0].colorbar
        plt.title('Spectrogram for channel ' + str(c + 1))
        plt.xlabel('Time in seconds')
        plt.ylabel('Frequency')
        cbar.set_label(r"$\log(\hat{f})$", labelpad=20, rotation=270)

        path = p_global['plot_folders']['spectrogram_dir'] \
            + '/' + 'spectrogram-%03d' % (c + 1)
        if p_global['plotting']['notebook']:
            show_and_close()
        else:
            save_and_close(path, p_local)
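# The label-thinning loops above can also be delegated to seaborn itself: a
# minimal sketch, assuming spectrogram arrays f, t, Sxx from
# scipy.signal.spectrogram, that wraps the data in a DataFrame so
# xticklabels/yticklabels can be given as integer steps ("plot every n-th label").
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import spectrogram

def plot_spectrogram(signal, fs):
    f, t, Sxx = spectrogram(signal, fs)
    # index/columns carry the tick labels; the integer arguments below tell
    # seaborn to draw only every n-th one
    frame = pd.DataFrame(np.log(Sxx[::-1, :]),
                         index=f.astype(int)[::-1],
                         columns=t.astype(int))
    ax = sns.heatmap(frame,
                     xticklabels=max(1, Sxx.shape[1] // 6),
                     yticklabels=max(1, Sxx.shape[0] // 6))
    return ax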
def plot_metrics_correlation(): data = load_data_to_correlate().rename(columns={ 'quit_score': 'quit score', 'survival_answers_10': 'survival (10 ans.)', 'survival_answers_100': 'survival (100 ans.)', 'survival_time_60': 'survival (1 min.)', 'survival_time_600': 'survival (10 min.)', 'learning_slope_5': 'learning (5)', 'learning_slope_10': 'learning (10)', 'learning_slope_20': 'learning (20)', }) data = data[~data['context'].apply(lambda c: 'region_cz' in c)] plt.title('Correlation of different metrics') sns.heatmap(data.corr().abs(), annot=True, fmt='.2f') output.savefig('abexp_metric_corr') g = sns.PairGrid( data[[ # 'quit score', 'survival (100 ans.)', 'survival (10 min.)', 'survival (10 ans.)', 'survival (1 min.)', # 'learning (10)', 'experiment', ]], hue='experiment') g = g.map_diag(plt.hist) g = g.map_offdiag(plt.scatter) g = g.add_legend() output.savefig('abexp_metrics', tight_layout=False)
def main(): activities = ["EDA", "Plots", "Model Building"] choice = st.sidebar.selectbox("Select Activities", activities) if choice == 'EDA': st.subheader("Exploratory Data Analysis") data = st.file_uploader("Upload a Dataset", type=["csv", "txt"]) if data is not None: df = pd.read_csv(data) st.dataframe(df.head()) if st.checkbox("Show Shape"): st.write(df.shape) if st.checkbox("Show Columns"): all_columns = df.columns.to_list() st.write(all_columns) if st.checkbox("Summary"): st.write(df.describe()) if st.checkbox("Show Selected Columns"): selected_columns = st.multiselect("Select Columns", all_columns) new_df = df[selected_columns] st.dataframe(new_df) if st.checkbox("Show Value Counts"): st.write(df.iloc[:, -1].value_counts()) if st.checkbox("Correlation Plot(Matplotlib)"): plt.matshow(df.corr()) st.pyplot() if st.checkbox("Correlation Plot(Seaborn)"): st.write(sns.heatmap(df.corr(), annot=True)) st.pyplot() if st.checkbox("Pie Plot"): all_columns = df.columns.to_list() column_to_plot = st.selectbox("Select 1 Column", all_columns) pie_plot = df[column_to_plot].value_counts().plot.pie( autopct="%1.1f%%") st.write(pie_plot) st.pyplot() elif choice == 'Plots': st.subheader("Data Visualization") data = st.file_uploader("Upload a Dataset", type=["csv", "txt"]) if data is not None: df = pd.read_csv(data) st.dataframe(df.head()) if st.checkbox("Show Value Counts"): st.write(df.iloc[:, -1].value_counts().plot(kind='bar')) st.pyplot() # Customizable Plot all_columns_names = df.columns.tolist() type_of_plot = st.selectbox( "Select Type of Plot", ["area", "bar", "line", "hist", "box", "kde"]) selected_columns_names = st.multiselect("Select Columns To Plot", all_columns_names) if st.button("Generate Plot"): st.success("Generating Customizable Plot of {} for {}".format( type_of_plot, selected_columns_names)) # Plot By Streamlit if type_of_plot == 'area': cust_data = df[selected_columns_names] st.area_chart(cust_data) elif type_of_plot == 'bar': cust_data = df[selected_columns_names] st.bar_chart(cust_data) elif type_of_plot == 'line': cust_data = df[selected_columns_names] st.line_chart(cust_data) # Custom Plot elif type_of_plot: cust_plot = df[selected_columns_names].plot( kind=type_of_plot) st.write(cust_plot) st.pyplot() elif choice == 'Model Building': st.subheader("Building ML Models") data = st.file_uploader("Upload a Dataset", type=["csv", "txt"]) if data is not None: df = pd.read_csv(data) st.dataframe(df.head()) # Model Building X = df.iloc[:, 0:-1] Y = df.iloc[:, -1] seed = 7 # prepare models models = [] models.append(('LR', LogisticRegression())) models.append(('LDA', LinearDiscriminantAnalysis())) models.append(('KNN', KNeighborsClassifier())) models.append(('CART', DecisionTreeClassifier())) models.append(('NB', GaussianNB())) models.append(('SVM', SVC())) # evaluate each model in turn model_names = [] model_mean = [] model_std = [] all_models = [] scoring = 'accuracy' for name, model in models: kfold = model_selection.KFold(n_splits=10, random_state=seed) cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring) model_names.append(name) model_mean.append(cv_results.mean()) model_std.append(cv_results.std()) accuracy_results = { "model name": name, "model_accuracy": cv_results.mean(), "standard deviation": cv_results.std() } all_models.append(accuracy_results) if st.checkbox("Metrics As Table"): st.dataframe( pd.DataFrame(zip(model_names, model_mean, model_std), columns=["Algo", "Mean of Accuracy", "Std"])) if st.checkbox("Metrics As JSON"): 
st.json(all_models)
# In[40]: # "convert" ungly state values in dict to 1d-array (only without usable ace) v_matrix = np.zeros(100) i = 0 for playersum in range(10): for dealercard in range (10): v_matrix[i] = state_values[(playersum, dealercard, 0)] i += 1 # In[41]: import seaborn as sns plt.figure(figsize=(4, 4)) sns.heatmap(v_matrix.reshape(10, 10), cmap="YlGnBu", annot=True, cbar=False, square=True); # In[ ]:
sns.distplot(stock2['Gain'], hist = False, color = 'r' ) sns.distplot(stock3['Gain'], hist = False, color = 'g' ) sns.distplot(stock4['Gain'], hist = False, color = 'y' ) # ## Correlation All_Stocks = pd.concat([stock1['Gain'],stock2['Gain'],stock3['Gain'],stock4['Gain']], axis=1) names = [symbol1, symbol2, symbol3, symbol4] All_Stocks.columns = names All_Stocks = All_Stocks.dropna() print (All_Stocks.corr()) #Heat map sns.set(rc={"figure.figsize": (6, 4)}); sns.heatmap( All_Stocks.corr()) # ### Monthly Returns Stock1_Monthly = stock1.asfreq('M').ffill() Stock2_Monthly = stock2.asfreq('M').ffill() Stock3_Monthly = stock3.asfreq('M').ffill() Stock4_Monthly = stock4.asfreq('M').ffill() print('Monthly Returns') print('Stock '+ symbol1 + ' Mean:', Stock1_Monthly["Gain"].mean()) print('Stock '+ symbol1 + ' Variances:', Stock1_Monthly["Gain"].var()) print('Monthly Returns') print('Stock '+ symbol2 + ' Mean:', Stock2_Monthly["Gain"].mean()) print('Stock '+ symbol2 + ' Variances:', Stock2_Monthly["Gain"].var())
"wt_bc==0").query("lib_type.isin(['sub','del'])") antibody_selection_mean_for_heatmap = antibody_selection_mean_for_heatmap.loc[:, a20_postions_subs] antibody_selection_mean_for_heatmap.index = antibody_selection_mean_for_heatmap.index.droplevel( [1, 2]) antibody_selection_mean_for_heatmap = antibody_selection_mean_for_heatmap.reindex( DESIRED_AA_ORD) antibody_selection_mean_for_heatmap.head() # In[31]: sns.set(**PAPER_PRESET) fig, ax = plt.subplots(figsize=[3.174 * 1.5, 1.625 * 1.5]) sns.heatmap(antibody_selection_mean_for_heatmap.apply(np.log2), cmap='RdBu_r', vmin=-10, vmax=10, ax=ax, yticklabels=True) ax.set_ylabel("Amino Acid") ax.set_xlabel("VP Position") # In[32]: all_residues = antibody_selection_mean.apply(np.log2).dropna() a20_residues = antibody_selection_mean.loc[a20_postions_subs].apply( np.log2).dropna() a20_fraction = a20[a20 > 2.5].count() / float(a20.count()) all_residues_fraction = all_residues[all_residues > 2.5].count() / float( all_residues.count()) fig = plt.figure(figsize=(1.2, 1.1))
# In[16]:

sns.pairplot(customers)

# In[30]:

customers.isna().sum()

# In[31]:

customers.corr()

# In[35]:

sns.heatmap(customers.corr(), annot=True)

# In[37]:

customers.columns

# In[69]:

customers['Avatar'].unique()

# In[94]:

customers_i1 = customers.copy(deep=True)

# In[95]:
#%% data_911_date_reason = data_911.groupby(['date', 'reason']) data_911_date_reason_count = data_911_date_reason.count() data_911_date_reason_count = data_911_date_reason_count.reset_index() fg = sns.FacetGrid(data=data_911_date_reason_count, col='reason') fg.map(sns.lineplot, 'date', 'timeStamp') #%% data_dayofweek_hour_group = data_911.groupby(['dayofweek_str', 'hour']) data_dayofweek_hour_group_count = data_dayofweek_hour_group.count() #data_dayofweek_hour_group_count data_dayofweek_hour_group_count = data_dayofweek_hour_group_count[ 'timeStamp'].unstack(level=-1) #%% sns.heatmap(data=data_dayofweek_hour_group_count) #%% sns.clustermap(data=data_dayofweek_hour_group_count) #%% data_dayofweek_month_group = data_911.groupby(['month', 'dayofweek_str']) data_dayofweek_month_group_count = data_dayofweek_month_group.count() #data_dayofweek_hour_group_count data_dayofweek_month_group_count = data_dayofweek_month_group_count[ 'timeStamp'].unstack(level=0) #%% sns.heatmap(data=data_dayofweek_month_group_count) #%%
# Compute the correlation matrix
corr = dataset1[["revenue", "budget", "popularity", "runtime", "num_of_cast",
                 "num_of_male_cast", "num_of_female_cast", "num_genres",
                 "num_of_production_countries", "day_of_week", "month", "year",
                 "week_of_year", "season", "title_len", "overview_len",
                 "tagline_len", "num_of_directors", "num_of_producers",
                 "num_of_editors", "num_of_art_crew", "num_of_sound_crew",
                 "num_of_costume_crew", "num_of_camera_crew",
                 "num_of_visual_effects_crew", "num_of_lighting_crew",
                 "num_of_other_crew"]].corr()

# Generate a mask for the upper triangle (np.bool is removed in recent NumPy)
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation between numerical features")

# Bivariate Analysis for log-transformed numerical features
sns.set(rc={'figure.figsize': (18, 20)})

# Compute the correlation matrix
corr = dataset1[["log_revenue", "log_budget", "log_popularity", "log_runtime",
                 "log_num_of_cast", "log_num_of_male_cast",
                 "log_num_of_female_cast", "num_genres",
                 "num_of_production_countries", "day_of_week", "month", "year",
                 "week_of_year", "season",
"""Inference : <br> Before Attrition, There were a total of 14,999 Employees working for the company. <br> And after the Attrition, 3571 employees left the company leaving behind 11428 employees. 1. Analysis for Existing Employees. Visualising and Dropping off the Completely Null Columns """ # to see how many values are missing in each column. df_exist.isnull().sum() # visualizing and observing the null elements in the dataset plt.figure(figsize=(10,10)) sns.heatmap(df_exist.isnull(), cbar = False, cmap = 'YlGnBu') # ploting missing data && # cbar, cmap = colour bar, colour map """Inference : <br> There are null entries in both the datasets. Checking for duplicate value columns """ x = set() # set() as to store only the unique values for i in range(df_exist.shape[1]): c1 = df_exist.iloc[:, i] for j in range(i + 1, df_exist.shape[1]): c2 = df_exist.iloc[:, j] if c1.equals(c2): x.add(df_exist.columns.values[j]) for col in x:
model.add(Dense(32, activation='relu', input_dim=8))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=200, verbose=False)

# Results - accuracy
scores = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: %.2f%%\n" % (scores[1] * 100))
scores = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: %.2f%%\n" % (scores[1] * 100))

# Results - confusion matrix
# predict_classes() was removed from recent Keras; threshold the sigmoid output instead
y_test_pred = (model.predict(X_test) > 0.5).astype(int)
c_matrix = confusion_matrix(y_test, y_test_pred)
ax = sns.heatmap(c_matrix, annot=True,
                 xticklabels=['No Diabetes', 'Diabetes'],
                 yticklabels=['No Diabetes', 'Diabetes'],
                 cbar=False, cmap='Blues')
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
plt.show()
plt.clf()

# Results - ROC curve
y_test_pred_probs = model.predict(X_test)
FPR, TPR, _ = roc_curve(y_test, y_test_pred_probs)
plt.plot(FPR, TPR)
plt.plot([0, 1], [0, 1], '--', color='black')  # diagonal line
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
plt.clf()
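# A single AUC number is often reported next to a ROC curve like the one
# above; a short sketch using scikit-learn's roc_auc_score on the same
# X_test/y_test used in the block.
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the sigmoid outputs plotted above.
auc = roc_auc_score(y_test, model.predict(X_test).ravel())
print("Testing AUC: %.3f" % auc)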
plt.title('Scatter plot between Age vs Estimated Salary') sns.set_style('whitegrid') sns.FacetGrid(ads,hue='Purchased',height=7).map(plt.scatter,'Age','EstimatedSalary').add_legend() plt.show() ads.drop(columns=['User ID'],inplace=True) sns.set_style('whitegrid') sns.pairplot(ads,hue='Purchased',height=5) plt.show() sns.boxplot(x='Purchased',y='EstimatedSalary',data=ads) sns.jointplot(x='Age',y='EstimatedSalary',data=ads) sns.regplot(x='Age',y='EstimatedSalary',data=ads,fit_reg=True,order =6) sns.lmplot(x='Age',y='EstimatedSalary',data=ads,hue='Purchased') sns.clustermap(ads.corr(),figsize=(7,7),annot=True) plt.show() sns.heatmap(ads.corr(),annot =True,cbar_kws={'orientation':'horizontal'}) plt.show() ads.isnull().sum() ads.info() ads.describe()
def associations(dataset, nominal_columns='auto', mark_columns=False, theil_u=False, plot=True, return_results=False, clustering=False, nan_strategy=REPLACE, nan_replace_value=DEFAULT_REPLACE_VALUE, ax=None, **kwargs): """ Calculate the correlation/strength-of-association of features in data-set with both categorical (eda_tools) and continuous features using: * Pearson's R for continuous-continuous cases * Correlation Ratio for categorical-continuous cases * Cramer's V or Theil's U for categorical-categorical cases **Returns:** a DataFrame of the correlation/strength-of-association between all features **Example:** see `associations_example` under `dython.examples` Parameters ---------- dataset : NumPy ndarray / Pandas DataFrame The data-set for which the features' correlation is computed nominal_columns : string / list / NumPy ndarray Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all columns are categorical, 'auto' (default) to try to identify nominal columns, or None to state none are categorical mark_columns : Boolean, default = False if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on there type (eda_tools or continuous), as provided by nominal_columns theil_u : Boolean, default = False In the case of categorical-categorical feaures, use Theil's U instead of Cramer's V plot : Boolean, default = True If True, plot a heat-map of the correlation matrix return_results : Boolean, default = False If True, the function will return a Pandas DataFrame of the computed associations clustering : Boolean, default = False If True, hierarchical clustering is applied in order to sort features into meaningful groups nan_strategy : string, default = 'replace' How to handle missing values: can be either 'drop_samples' to remove samples with missing values, 'drop_features' to remove features (columns) with missing values, or 'replace' to replace all missing values with the nan_replace_value. Missing values are None and np.nan. nan_replace_value : any, default = 0.0 The value used to replace missing values with. 
Only applicable when nan_strategy is set to 'replace' ax : matplotlib ax, default = None Matplotlib Axis on which the heat-map will be plotted kwargs : any key-value pairs Arguments to be passed to used function and methods """ dataset = convert(dataset, 'dataframe') if nan_strategy == REPLACE: dataset.fillna(nan_replace_value, inplace=True) elif nan_strategy == DROP_SAMPLES: dataset.dropna(axis=0, inplace=True) elif nan_strategy == DROP_FEATURES: dataset.dropna(axis=1, inplace=True) columns = dataset.columns if nominal_columns is None: nominal_columns = list() elif nominal_columns == 'all': nominal_columns = columns elif nominal_columns == 'auto': nominal_columns = identify_nominal_columns(dataset) corr = pd.DataFrame(index=columns, columns=columns) for i in range(0, len(columns)): for j in range(i, len(columns)): if i == j: corr[columns[i]][columns[j]] = 1.0 else: if columns[i] in nominal_columns: if columns[j] in nominal_columns: if theil_u: corr[columns[j]][columns[i]] = theils_u( dataset[columns[i]], dataset[columns[j]], nan_strategy=SKIP) corr[columns[i]][columns[j]] = theils_u( dataset[columns[j]], dataset[columns[i]], nan_strategy=SKIP) else: cell = cramers_v(dataset[columns[i]], dataset[columns[j]], nan_strategy=SKIP) corr[columns[i]][columns[j]] = cell corr[columns[j]][columns[i]] = cell else: cell = correlation_ratio(dataset[columns[i]], dataset[columns[j]], nan_strategy=SKIP) corr[columns[i]][columns[j]] = cell corr[columns[j]][columns[i]] = cell else: if columns[j] in nominal_columns: cell = correlation_ratio(dataset[columns[j]], dataset[columns[i]], nan_strategy=SKIP) corr[columns[i]][columns[j]] = cell corr[columns[j]][columns[i]] = cell else: cell, _ = ss.pearsonr(dataset[columns[i]], dataset[columns[j]]) corr[columns[i]][columns[j]] = cell corr[columns[j]][columns[i]] = cell corr.fillna(value=np.nan, inplace=True) if mark_columns: marked_columns = [ '{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col) for col in columns ] corr.columns = marked_columns corr.index = marked_columns if clustering: corr, _ = cluster_correlations(corr) if plot: if ax is None: plt.figure(figsize=kwargs.get('figsize', None)) sns.heatmap( corr, cmap=kwargs.get('cmap', None), annot=kwargs.get('annot', True), fmt=kwargs.get('fmt', '.2f'), ax=ax ) if ax is None: plt.show() if return_results: return corr
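# A hedged usage sketch for associations() above; the toy frame and column
# names are invented, and it relies only on the parameters documented in the
# function (nominal_columns, plot, return_results).
import numpy as np
import pandas as pd

# Toy mixed-type frame: one continuous, one categorical, one binary column.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'height': rng.normal(170, 10, 200),
    'color': rng.choice(['red', 'green', 'blue'], 200),
    'likes_tea': rng.choice([0, 1], 200),
})

# Cramer's V for color vs likes_tea, correlation ratio for color vs height,
# Pearson's R for height vs likes_tea; plot=False just returns the matrix.
assoc = associations(df, nominal_columns=['color', 'likes_tea'],
                     plot=False, return_results=True)
print(assoc)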
def run(argv): args = parseArgs(argv) # print(args.work_dir) # print(args.config_dir) global workpath global ntpath global threadsnum global positivepath global negativepath workpath = args.work_dir threadsnum = str(args.processes) if workpath[-1] != "/": workpath = workpath + "/" controlpath = args.config_dir try: #Create Work Directory os.mkdir(workpath) except FileExistsError: print("File exists: " + workpath) ntpath = args.database_path ''' Get Positive/Negative controls ''' print("Work Path:" + workpath) print("Config file path: " + controlpath) with open(controlpath, "r") as f: l = f.readlines() [positive, positive_paths, negative, negative_paths, whole, group] = getinfo(l) positivepath = workpath + "positive/" negativepath = workpath + "negative/" if detectdataexist(workpath): print("Found generated positive/negative controls, skipping data\ gathering process...") else: print("#############################################################") print("Taxon group: " + group) print("Positive control group:") for i in positive: print(i) print() print("Negative control group:") for i in negative: print(i) download_db(whole, group) #Download database from NCBI database os.system("mkdir " + positivepath) os.system("mkdir " + negativepath) for i in positive_paths: for j in os.listdir(i): os.system("cp " + i + "/" + j + " " + positivepath) for i in negative_paths: for j in os.listdir(i): os.system("cp " + i + "/" + j + " " + negativepath) ''' Generate and Print Tree ''' if args.tree_help: printTree() print("#############################################################") print("Concatenating Negative Control...") randrefindex = np.random.randint(len(os.listdir(positivepath))) randreddir = os.listdir(positivepath)[randrefindex] gatherdatapath = workpath + "gatherdata/" os.system("mkdir " + gatherdatapath) with open(gatherdatapath + "concatenatedneg.fasta", "w") as f: with open(positivepath + randreddir, "r") as k: f.writelines(k.readlines()) for i in os.listdir(negativepath): with open(negativepath + i, "r") as j: f.writelines(j.readlines()) os.system("cp " + positivepath + "*" + " " + gatherdatapath) if "MUMs" not in os.listdir(workpath): print("#############################################################") print("running parsnp") print("python ./parsnp/Parsnp.py -c -r ! 
-d " + gatherdatapath + " -o " + workpath + "MUMs/ -p " + threadsnum) os.system("python ./parsnp/Parsnp.py -c -r " + gatherdatapath + randreddir + " -d " + gatherdatapath + " -o " + workpath + "MUMs/ -p " + threadsnum) with open(workpath + "MUMs/parsnp.xmfa", "r") as f: readlist = f.readlines() clusters = generate_read_list(readlist, 40) # clusters = {} # for i in range(len(nlist)): # cluster = nlist[i].split(" ")[2] # if cluster not in clusters: #Exclude all LCBs # if ('A' in rlist[i]) or ('T' in rlist[i]) or ('C' in rlist[i])\ # or ('G' in rlist[i]) or ('-' in rlist[i]): # continue # clusters.update({cluster: rlist[i]}) os.system("mkdir " + workpath + "finalMUMs/") for i in clusters: with open(workpath + "finalMUMs/" + i + ".fasta", "w") as f: f.write(">" + i + "\n") f.write(clusters[i]) clusterl = os.listdir(workpath + "finalMUMs/") if args.MUMS_only: print('-m: not entering into Blastn step, exiting...') exit() print("Running Blastn agianst nt database") os.system("mkdir " + workpath + "blastresult/") for i in pbar(clusterl): if ntpath == None: os.system( "blastn -max_target_seqs 2000 -query -db nt " + workpath + "finalMUMs/" + i + " -out " + workpath + "blastresult/" + i + ".out -outfmt '6 qseqid sseqid pident evalue stitle' -num_threads " + threadsnum + " -remote") else: os.system( "blastn -max_target_seqs 2000 -db " + ntpath + " -query " + workpath + "finalMUMs/" + i + " -out " + workpath + "blastresult/" + i + ".out -outfmt '6 qseqid sseqid pident evalue stitle' -num_threads " + threadsnum) # os.system("mv *.out " + workpath + "blastresult/") os.system('find ' + workpath + 'blastresult/ -name "*" -type\ f -size 0c | xargs -n 1 rm -f') #Remove all damaged blast results blastresultfile = os.listdir(workpath + 'blastresult/') [percent2strain, strain] = percenttostrain(workpath + 'blastresult/',\ blastresultfile, clusters) positivelist = [] others = [] for i in positive: for j in strain: if set(j.split(" ")) > set(i.split(" ")): positivelist.append(j) else: others.append(j) whole = positivelist + others print("generating whole result...") res = pd.DataFrame(columns=["MUM"] + whole) for i in percent2strain: res = res.append(i, ignore_index=True) res = res.fillna(0) res.set_index(["MUM"], inplace=True) print("generating whole complete genome result...") rescg = res for i in strain: keys = i.split(" ") if keys[-1] == "genome" and keys[-2] == "complete": continue rescg = rescg.drop(i, axis=1) with open(workpath + "all_strains.csv", "w") as f: f.write(res.to_csv()) with open(workpath + "complete_genomes.csv", "w") as f: f.write(rescg.to_csv()) print("Generating heatmap with all over\ 95% alignment scores strains' complete genomes") newcg = pd.read_csv(workpath + "complete_genomes.csv").drop("MUM", axis=1) complete_genomes = dropless90(newcg) f, ax = plt.subplots(figsize=(200, 50)) sns_plot = sns.heatmap(complete_genomes, cmap=sns.color_palette("Blues", 500), linewidths=0.1, ax=ax) ax.set_title('Blast result for all MUMs') ax.set_xlabel('Strains') ax.set_ylabel('MUMs') sns_plot.figure.savefig(workpath + "competegenomes.png")
scaled_date_score.columns = [ s + '_s' for s in ['attr', 'sinc', 'intel', 'fun', 'amb', 'shar']] data = pd.concat([data, scaled_date_score], axis=1) data_algorithm = data.drop(['match', 'iid', "id","idg", "condtn", "wave", "round", "position", "partner", "pid", "career_c", "sports", "tvsports", 'exercise', 'dining', 'museums', 'art', 'hiking', 'gaming', 'clubbing','reading', 'tv', 'theater', 'movies','concerts', 'music', 'shopping', 'yoga'], axis=1) data1 = data_algorithm.drop(list(data_algorithm.filter(regex="field")), axis=1) data1 = data1.drop(list(data1.filter(regex="goal")), axis=1) data1 = data1.drop(list(data1.filter(regex="_o")), axis=1) data1 = data1.drop(list(data1.filter(regex="race")), axis=1) corr = data1.corr() corr_dec = corr['dec'].sort_values(ascending=False) plt.subplots(figsize=(20,15)) ax = plt.axes() ax.set_title("Correlation Heatmap") sns.heatmap(corr,xticklabels=corr.columns.values,yticklabels=corr.columns.values) #%% Check to see what the different genders value most on paper from Plots import PlotBarSeries male_rows = data[data['gender'] == 1] female_rows = data[data["gender"] == 0] male_avg = male_rows.mean() female_avg = female_rows.mean() self_look_for_before_average_male = male_avg[['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']] self_look_for_before_average_female = female_avg[['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']] dataframe = pd.concat([self_look_for_before_average_male, self_look_for_before_average_female],axis=1).T dataframe.index = ["male", "female"] PlotBarSeries(dataframe, "Mean value","Attribute value mean by gender (round 1_1)", [0,30]) #%% Mean values by attribute for dec = 1
# In[61]:

# Let's break it down and re-evaluate for closing prices
returns_fig = sns.PairGrid(closing_df)

# Scatter plot in the upper triangle
returns_fig.map_upper(plt.scatter, color='purple')

# KDE plot in the lower triangle
returns_fig.map_lower(sns.kdeplot, cmap='cool_d')

# Histograms on the diagonal
returns_fig.map_diag(plt.hist, bins=30)

# In[62]:

# To check the actual correlation values, draw a heatmap of the correlation matrix
sns.heatmap(tech_rets.dropna().corr(), annot=True)

# In[63]:

# Same for the closing prices
sns.heatmap(closing_df.corr(), annot=True)

# In[64]:

# Let's work with a cleaned version of the tech_rets DataFrame
rets = tech_rets.dropna()

# In[65]:

# Let's start by defining a new DataFrame as a cleaned version of the original tech_rets DataFrame
area = np.pi * 20
def show_heat_map(dataset):
    # fit a 2-component PCA on the feature matrix (assumes an sklearn-style
    # Bunch with 'data' and 'feature_names') and plot the component loadings;
    # the original built an empty frame and never used the fitted PCA
    pca = PCA(n_components=2)
    pca.fit(dataset['data'])
    comps = pd.DataFrame(pca.components_, columns=dataset['feature_names'])
    print(comps)
    sb.heatmap(comps, annot=False, linewidths=.5)
    plt.show()
def plot_cat(attr, labels=None):
    # factorplot/size were replaced by catplot/height in modern seaborn
    if attr == 'JobRole':
        sns.catplot(data=df, kind='count', height=5, aspect=3, x=attr)
        return
    sns.catplot(data=df, kind='count', height=5, aspect=1.5, x=attr)

# plot_cat('Attrition')

# correlation matrix
cor_mat = df.corr()
mask = np.array(cor_mat)
mask[np.tril_indices_from(mask)] = False
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

# df.drop(['BusinessTravel','DailyRate','EmployeeCount','EmployeeNumber','HourlyRate','MonthlyRate','NumCompaniesWorked','Over18','StandardHours','StockOptionLevel','TrainingTimesLastYear',],axis=1,inplace=True)
df.drop(['DailyRate', 'EmployeeCount', 'EmployeeNumber', 'HourlyRate',
         'MonthlyRate', 'Over18', 'StandardHours', 'StockOptionLevel',
         'TrainingTimesLastYear', 'EducationField', 'EnvironmentSatisfaction',
         'JobInvolvement', 'JobRole', 'JobSatisfaction', 'PercentSalaryHike',
         'PerformanceRating', 'RelationshipSatisfaction', 'YearsInCurrentRole',
         'YearsSinceLastPromotion', 'YearsWithCurrManager', 'WorkLifeBalance'],
        axis=1, inplace=True)

le = LabelEncoder()
axes[0, 1].set(xlabel='atemp', title='体感温度分布') axes[1, 0].set(xlabel='humidity', title='湿度分布') axes[1, 1].set(xlabel='windspeed', title='风速分布') plt.savefig('修正后分布分析.png') # 计算相关系数 correlation = train.corr() influence_order = correlation['count'].sort_values(ascending=False) influence_order_abs = abs(correlation['count']).sort_values(ascending=False) print(influence_order) print(influence_order_abs) # 作相关性分析的热力图 f, ax = plt.subplots(figsize=(16, 16)) cmap = sn.cubehelix_palette(light=1, as_cmap=True) sn.heatmap(correlation, center=1, annot=True, cmap=cmap, linewidths=2, ax=ax) plt.show() plt.savefig('相关性分析.png') # 每个特征对租赁量的影响 # 时间对租赁量的影响 # (1)时间维度——年份 sn.boxplot(train['year'], train['count']) plt.title("The influence of year") plt.show() plt.savefig('年份对租赁总数的影响.png') # (2)时间维度——月份 sn.pointplot(train['month'], train['count']) plt.title("The influence of month") plt.show() plt.savefig('月份对租赁总数的影响.png')
# **The above graph shows that the Indian restaurants have received more positive reviews**

# In[48]:

# calculating the mean values of the numerical columns, grouped by review_stars
stars = df.groupby('review_stars').mean()
stars

# In[49]:

# Visualising the correlation between the columns of the stars dataframe
sns.heatmap(stars.corr(), cmap='coolwarm', annot=True)

# # 5.2.3 Sentiment Detection

# In[37]:

# Classifying the dataset and splitting it into the reviews and stars.
# Here, we will classify the dataset into 3 types of rating:
# rating 1 = "negative", 3 = "average", and 5 = "positive".
data_class = df[(df.review_stars == 1) | (df.review_stars == 3) | (df.review_stars == 5)]

# In[38]:
def main(): activities=['Analysis and Statistics of Chat','Visualization of chat'] choice=st.sidebar.selectbox("Select Activity",activities) if choice=='Analysis and Statistics of Chat': st.header("ANALYSIS OF CHAT") data=st.file_uploader("Upload a file",type=['txt','csv']) if data is None: steps() else: df=process_data(data) ##Show processed data st.header("Processed Data") st.write(df) #media msg info st.header("Who Share more media msg ?") st.subheader(' media msgs info') mediamsg=df[df['Text']==' media shared'] st.write(mediamsg) no_mediamsg=mediamsg['Name'].value_counts() st.write(no_mediamsg) st.bar_chart(no_mediamsg) ##word and letter count st.header("Words and Letters used by Each person") df['Letter_Count']=df['Text'].apply(char_counter) df['Word_Count']=df['Text'].apply(word_counter) st.write(df[['Text','Letter_Count','Word_Count']]) ##most active user st.header("most active user :"******"Number of messages send by each user") st.write(df[['Name','Text']].groupby('Name').count()) #Words used by each person st.header("Words used by each person: ") name_value_count=df['Name'].value_counts().head(4) st.write(name_value_count) ##chat go time #st.header("how long did the chat go?") ##delete file # st.header("Delete File") # if st.button("Delete uploaded file"): # # os.remove("") # st.write("File Removed") else: st.header("CHAT VISUALIZATION") data=st.file_uploader("Upload a file",type=['txt','csv']) if data is None: steps() else: df=process_data(data) df['Letter_Count']=df['Text'].apply(char_counter) df['Word_Count']=df['Text'].apply(word_counter) all_columns_name=df.columns.tolist() #Seaborn plot if st.checkbox("Correlation plot[Seaborn]"): st.write(sns.heatmap(df.corr(),annot=True)) st.pyplot() #Count plot if st.checkbox("Plot of Value counts"): st.text("Value count by targets") all_columns_name=df.columns.tolist() primary_col=st.selectbox("Primary Column To GroupBy",all_columns_name) selected_columns_names=st.multiselect("Select Columns",all_columns_name) if st.button("Plot"): st.text("Generate a plot") if selected_columns_names: vc_plot=df.groupby(primary_col)[selected_columns_names].count() else: vc_plot=df.iloc[:,-1].value_counts() st.write(vc_plot.plot(kind="bar")) st.pyplot() #pie chart if st.checkbox("Pie plot"): all_columns_name=df.columns.tolist() if st.button("Generate pie plot"): st.write(df.iloc[:,-1].value_counts().plot.pie(autopct="%1.1f%%")) st.pyplot() #customizable plot type_of_plot=st.selectbox("Select type of plot",["area","bar","line","hist","box","kde"]) selected_columns_names=st.multiselect("Select Columns To Plot",all_columns_name) if st.button("Generate a Plot"): st.success("Generating a plot of {} for {}".format(type_of_plot,selected_columns_names)) if type_of_plot=='area': cust_data=df[selected_columns_names] st.area_chart(cust_data) if type_of_plot=='bar': cust_data=df[selected_columns_names] st.bar_chart(cust_data) if type_of_plot=='line': cust_data=df[selected_columns_names] st.line_chart(cust_data) # custom plot if type_of_plot: cust_plot=df[selected_columns_names].plot(kind=type_of_plot) st.pyplot() if st.button("It's Completed"): st.balloons()
def main():
    input_imgs = 'data/fs-20sbj-output/in_bin_img/'
    output_image_file = 'figures/brain_segmentation_mni.png'
    ref_image_file = 'data/fs-20sbj-output/mni_reference.nii'

    # Load the binarised images with nibabel and sum them voxel-wise
    img_sum = None
    for img in os.listdir(input_imgs):
        if os.path.splitext(os.path.basename(img))[1] in ['.nii', '.gz']:
            img1 = nibabel.load(os.path.join(input_imgs, img))
            data1 = img1.get_data()
            if img_sum is None:
                img_sum = data1
                continue
            img_sum = img_sum + data1

    im_ref = nibabel.load(ref_image_file)
    im_data_ref = im_ref.get_data()

    # Check that both images have the same dimensions
    # shape1 = im1.header.get_data_shape()
    # shape2 = im_ref.header.get_data_shape()

    hor_view = img_sum[129, :, :]
    hor_view_ref = im_data_ref[98, :, :]
    ver_view = img_sum[:, 155, :]
    ver_view_ref = im_data_ref[:, 116, :]
    axi_view = img_sum[:, :, 130]
    axi_view_ref = im_data_ref[:, :, 94]

    # Heatmap plots
    startcolor = '#990033'
    midcolor = '#ffff00'
    endcolor = '#FFFFFF'
    own_cmap1 = mpl.colors.LinearSegmentedColormap.from_list(
        'own2', [startcolor, midcolor, endcolor])

    fig = plt.figure(figsize=(25, 10), facecolor='white')
    ax1 = fig.add_subplot(131)
    ax2 = fig.add_subplot(132)
    ax3 = fig.add_subplot(133)
    plt.subplots_adjust(hspace=0.05, wspace=0.005)
    cbar_ax = fig.add_axes([.91, .18, .03, .65])
    cbar_ax.tick_params(labelsize=28, color='black', labelcolor='black')
    cbar_ax.yaxis.label.set_size(32)
    cbar_ax.yaxis.label.set_color('black')
    own_cmap1.set_under("0.5", alpha=0)

    hmax = sns.heatmap(np.rot90(ver_view), cbar_ax=cbar_ax, cmap=own_cmap1,
                       xticklabels='', yticklabels='',
                       cbar_kws={'label': 'Number of subjects'},
                       ax=ax3, vmin=1, vmax=7)
    hmax.imshow(np.rot90(ver_view_ref), cmap='gray', aspect='equal',
                extent=hmax.get_xlim() + hmax.get_ylim())

    hmax2 = sns.heatmap(np.rot90(hor_view), cmap=own_cmap1, xticklabels='',
                        yticklabels='', cbar=False, ax=ax2, vmin=1, vmax=8)
    hmax2.imshow(np.rot90(hor_view_ref), cmap='gray', aspect='equal',
                 extent=hmax2.get_xlim() + hmax2.get_ylim())

    hmax3 = sns.heatmap(np.rot90(axi_view), cmap=own_cmap1, xticklabels='',
                        yticklabels='', cbar=False, ax=ax1, vmin=1, vmax=8)
    hmax3.imshow(np.rot90(axi_view_ref), cmap='gray', aspect='equal',
                 extent=hmax3.get_xlim() + hmax3.get_ylim())

    plt.rcParams['axes.facecolor'] = 'black'
    plt.savefig(output_image_file, facecolor=fig.get_facecolor(), bbox_inches='tight')
weather = pd.read_csv('Weather.csv')
weather.head()
patientInfo.head()

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(25, 4))
sns.countplot(patientInfo['province'])                      # No. of cases per province of South Korea
sns.countplot(patientInfo['sex'])                           # No. of males and females affected in South Korea
sns.countplot(patientInfo['sex'], hue=patientInfo['age'])   # Males and females broken down by age

# Strip the trailing 's' from age bands such as '20s' so only the decade remains
for i in range(len(patientInfo)):
    patientInfo.loc[i, 'age'] = str(patientInfo.loc[i, 'age'])[:2]
for i in range(len(patientInfo)):
    if patientInfo.loc[i, 'age'][1] == 's':
        patientInfo.loc[i, 'age'] = str(patientInfo.loc[i, 'age'])[:1]

sns.heatmap(patientInfo.isna())

patientInfo.drop(['global_num', 'disease', 'infection_case', 'infection_order', 'infected_by',
                  'contact_number', 'symptom_onset_date', 'confirmed_date', 'released_date',
                  'deceased_date'], axis=1, inplace=True)
patientInfo.drop(['age'], inplace=True, axis=1)

# Encode sex numerically and attach the average temperature of each patient's province
patientInfo['avg_temp'] = None
for i in range(len(patientInfo)):
    patientInfo.loc[i, 'sex'] = 1 if patientInfo.loc[i, 'sex'] == 'male' else 0
for i in range(len(patientInfo)):
    for j in range(len(weather)):
        if patientInfo.loc[i, 'province'] == weather.loc[j, 'province']:
            patientInfo.loc[i, 'avg_temp'] = weather.loc[j, 'avg_temp']

patientInfo.drop(['country', 'province', 'city', 'state'], axis=1, inplace=True)
patientInfo.dropna(inplace=True)
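# The row-by-row loops above work but are slow. A vectorized sketch of the same two steps using
# pandas built-ins (same column names assumed). Note one assumption: each province is mapped to
# the mean of its weather records, whereas the loop above keeps the last matching row.
patientInfo['sex'] = (patientInfo['sex'] == 'male').astype(int)
province_temp = weather.groupby('province')['avg_temp'].mean()
patientInfo['avg_temp'] = patientInfo['province'].map(province_temp)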
data.region = le.transform(data.region)

# A few words about encoding "region". In general, categorical variables with many levels are best
# encoded with OneHotEncoder or similar. In this case it makes little difference for what follows,
# because there is no natural order among the regions anyway, so out of laziness I only used the
# LabelEncoder.

# In[ ]:

data.corr()['charges'].sort_values()

# In[ ]:

f, ax = pl.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(240, 10, as_cmap=True),
            square=True, ax=ax)

# A strong correlation is observed only with whether the patient smokes. To be honest, I expected a
# higher correlation with bmi. Well, let's investigate smoking in more detail.

# ![image](https://img-s2.onedio.com/id-5aa155e69065f7cf10132bc5/rev-0/w-500/s-5c6ec7366c0b35f7b310eae5c1ee17526982e700.gif)

# First, let's look at the distribution of charges. This tells us how much patients spend on
# treatment on average.

# We're importing another useful library that we'll need a few more times. It's not strictly
# necessary, but why not :D

# In[ ]:

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()
import scipy.special
from bokeh.layouts import gridplot
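# For reference, the one-hot alternative discussed above is a one-liner in pandas. This is only a
# sketch of the option the author decided against, and it would be applied to the raw 'region'
# strings *before* the label encoding at the top of this cell:
data_onehot = pd.get_dummies(data, columns=['region'], prefix='region')
data_onehot.head()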
print(unique_country)

# Let's check the number of records per country
alpha = 1.0
plt.figure(figsize=(10, 25))
sns.countplot(y='country', data=dataset, alpha=alpha)
plt.title('Data by country')
plt.show()

# Between genders: male vs female
plt.figure(figsize=(7, 7))
sex = sns.countplot(x='sex', data=dataset)

# Correlation between the numeric columns
plt.figure(figsize=(16, 7))
cor = sns.heatmap(dataset.corr(), annot=True)

g = sns.jointplot(dataset.year, dataset.suicides_no, kind="kde", color="#bfa9e0", size=7)
plt.savefig('graph.png')

# Visualizing which age group has the most suicides
plt.figure(figsize=(16, 7))
bar_age = sns.barplot(x='sex', y='suicides_no', hue='age', data=dataset)

# Visualizing which generation has the most suicides
plt.figure(figsize=(16, 7))
bar_gen = sns.barplot(x='sex', y='suicides_no', hue='generation', data=dataset)
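# dataset.corr() silently drops non-numeric columns (and raises in newer pandas when they are
# present), so being explicit about the numeric subset is safer. A small sketch of the same
# heatmap restricted to numeric columns:
numeric = dataset.select_dtypes(include='number')
plt.figure(figsize=(16, 7))
sns.heatmap(numeric.corr(), annot=True)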
#%%
# a sorted dataframe with the highest PnLs in the first rows
idx = pd.unique([i[1] for i in np.flip(t.values)])
col = pd.unique([i[0] for i in np.flip(t.values)])
sorted_df = data[col].loc[idx]

#%% [markdown]
# ### Taking the best pairs, plot the PnL
# A heatmap helps to visualize the results

#%%
# plot a heat map with the top 30 PnLs
# light colors indicate high values, dark colors low values
m = np.array(sorted_df)
figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')
ax = sns.heatmap(m, linewidth=0.01, cmap="RdYlGn")
ax.set_xticklabels(col, rotation=90)
ax.set_yticklabels(idx, rotation=45)
plt.show()

#%% [markdown]
# # Testing for different window sizes

#%% [markdown]
# ### So far the backtesting used only a single window size. It is also interesting to see how the strategy performs with different window sizes

#%%
# recompute the PnL matrix for the combinations found above, for window sizes from 1 to the number of days specified
number_of_days = 30
top_n = 20
t_10 = so_st.tail(n=top_n).index
empty = np.zeros([top_n, number_of_days])
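#%%
# The matrix 'empty' is then filled with one PnL value per (pair, window size). A minimal sketch
# of that loop, where run_backtest is a hypothetical helper standing in for whichever backtesting
# routine was used earlier in the notebook:
for row, pair in enumerate(t_10):
    for window in range(1, number_of_days + 1):
        empty[row, window - 1] = run_backtest(pair, window)  # hypothetical function

# The filled matrix can then be inspected the same way as before:
# sns.heatmap(empty, cmap="RdYlGn")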
# Flatten
axes = axes.flatten()

# Loop
for i, f in enumerate(FEATURES):
    # Show
    print("\n\n%s:" % f)
    print(b[f].T)

    # Draw a heatmap with the numeric values in each cell
    sns.heatmap(b[f].T, annot=True, fmt='.0f',
                annot_kws={'fontsize': 8}, linewidths=.5,
                ax=axes[i], cmap=sns.cm.rocket_r)

    # Configure axes
    axes[i].set(xlabel="", title='<%s>' % f)

# Adjust
plt.tight_layout()

##########################################################################
# Let's plot the ``normalized`` count of samples for each individual dataset
# and the corresponding day
#
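# A minimal sketch of the normalisation announced above, assuming b[f] is a table of raw counts
# (which axis holds datasets and which holds days is an assumption based on the transpose used
# in the plot above): divide each column by its total so it sums to 1, then reuse the heatmap.
for f in FEATURES:
    counts = b[f]
    norm = counts.div(counts.sum(axis=0), axis=1)   # each column now sums to 1
    ax = sns.heatmap(norm.T, annot=True, fmt='.2f', annot_kws={'fontsize': 8},
                     linewidths=.5, cmap=sns.cm.rocket_r)
    ax.set(xlabel="", title='<%s> (normalized)' % f)
    plt.tight_layout()
    plt.show()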
Outliers_to_drop = detect_outliers(train, 2, ["Age", "SibSp", "Parch", "Fare"])
train.loc[Outliers_to_drop]  # Show the outlier rows

train = train.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)
dataset = dataset.fillna(np.nan)
dataset.isnull().sum()

train.info()
train.isnull().sum()
train.head()
train.dtypes
train.describe()

g = sns.heatmap(
    train[["Survived", "SibSp", "Parch", "Age", "Fare"]].corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
)

g = sns.factorplot(x="SibSp", y="Survived", data=train, kind="bar", size=6, palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")

g = sns.factorplot(x="Parch", y="Survived", data=train, kind="bar", size=6, palette="muted")
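# detect_outliers is called above but not defined in this excerpt. The sketch below is an
# assumption about what it does: a common Tukey (IQR) rule that flags rows which are outliers
# in more than n of the given features.
from collections import Counter
import numpy as np


def detect_outliers(df, n, features):
    outlier_indices = []
    for col in features:
        Q1 = np.percentile(df[col].dropna(), 25)
        Q3 = np.percentile(df[col].dropna(), 75)
        step = 1.5 * (Q3 - Q1)
        # indices of rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for this feature
        outlier_list_col = df[(df[col] < Q1 - step) | (df[col] > Q3 + step)].index
        outlier_indices.extend(outlier_list_col)
    # keep only rows that are outliers in more than n features
    counts = Counter(outlier_indices)
    return [k for k, v in counts.items() if v > n]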
def divergence(file,docs,ref_string): documents_list = [] document_ids = [] for d in range(len(docs)-1): document_ids.append(d) string = docs[d] # print("Doc :",d) # print(string) tokens = regex.tokenize(string) doc = [] for j in tokens: if j.lower() not in stop: lem = lemmatizer.lemmatize(j.lower()) doc.append(lem) documents_list.append(doc) string = ref_string tokens = regex.tokenize(string) doc = [] for j in tokens: if j.lower() not in stop: lem = lemmatizer.lemmatize(j.lower()) doc.append(lem) documents_list.append(doc) common_dictionary = Dictionary(documents_list) common_corpus = [common_dictionary.doc2bow(text) for text in documents_list] n,a,e = parameter_tuning(common_corpus,common_dictionary,documents_list) final_lda = gensim.models.LdaMulticore(common_corpus,id2word = common_dictionary,alpha = a,eta = e, workers=5,num_topics=4,chunksize=100,passes=10,random_state=100) # ref_vector = final_lda[common_corpus[len(common_corpus)-1]] # for i in range(len(common_corpus)): # print(final_lda[common_corpus[i]]) print("Topic is :",file) print() print("topic distribution by words :") topic_words_dist = final_lda.show_topics(num_words=10, log=False, formatted=True) for i in range(len(topic_words_dist)): print(topic_words_dist[i]) print() lda_array = np.full((len(common_corpus),n),0.001) for i in range(lda_array.shape[0]): vector = final_lda[common_corpus[i]] for j in vector: col = j[0] lda_array[i,col] = j[1] print("topic array :") for i in range(lda_array.shape[0]): if i!=lda_array.shape[0]-1: print(document_ids[i],":",lda_array[i:i+1,:]) else: print("Reference summary :",lda_array[i:i+1,:]) print() # print(np.sum(lda_array[10:11,:])) relevance = [] for i in range(0,lda_array.shape[0]-1): document = lda_array[i:i+1,:] reference = lda_array[lda_array.shape[0]-1:lda_array.shape[0],:] cur_rel = find_rel(reference,document) relevance.append(cur_rel) redundancy = 0 ref_vector = lda_array[lda_array.shape[0]-1:lda_array.shape[0],:] for i in range(ref_vector.shape[1]): redundancy = redundancy + (ref_vector[0,i]*math.log2(ref_vector[0,i])) intra_topic_r = np.zeros((lda_array.shape[0]-1,lda_array.shape[0]-1)) r,c = intra_topic_r.shape for i in range(r): for j in range(c): if i==j: intra_topic_r[i,j] = np.inf else: doc_1 = lda_array[i:i+1,:] doc_2 = lda_array[j:j+1,:] intra_topic_r[i,j] = find_rel(doc_1,doc_2) redundancy_vector = [] for i in range(0,lda_array.shape[0]-1): red = 0 d_vector = lda_array[i:i+1,:] for j in range(d_vector.shape[1]): red = red + (d_vector[0,j]*math.log2(d_vector[0,j])) redundancy_vector.append(red) intra_topic_d = np.zeros((lda_array.shape[0]-1,lda_array.shape[0]-1)) r,c = intra_topic_d.shape for i in range(r): for j in range(c): if i==j: intra_topic_d[i,j] = np.inf else: intra_topic_d[i,j] = -(intra_topic_r[i,j] - redundancy_vector[i]) mx = maximum(intra_topic_r) mn = minimum(intra_topic_r) normalized_intra_topic_r = normalize_r(intra_topic_r,mn,mx) print("Per document relevance is :") perdoc_rel = expectation(normalized_intra_topic_r) print() print("Intra-topic relevance is :") sns.set(font_scale=1.5) ax = sns.heatmap(normalized_intra_topic_r,vmin=-1, vmax=0 ,cmap = "YlGnBu",annot=False,linewidth=2.5) plt.savefig(file[0:len(file)-4]+".svg") plt.show() print() mx = maximum(intra_topic_d) mn = minimum(intra_topic_d) normalized_intra_topic_d = normalize_d(intra_topic_d,mn,mx) print("Per document divergence is :") perdoc_div = expectation(normalized_intra_topic_d) print() print("Intra-topic divergence is :") ax = sns.heatmap(normalized_intra_topic_d,vmin=0, vmax=1 ,cmap = 
"YlGnBu",annot=True,linewidth=0.5) plt.show() print() print("Redundancy vector is :") print(redundancy_vector) print() redundancy_dataset.append(sum(redundancy_vector)/len(redundancy_vector)) relevance_dataset.append(sum(perdoc_rel)/len(perdoc_rel))
def periodAnalysisPlot(folder, name='PeriodicityAll5.npz'): npzfile = np.load(os.path.join(folder, name)) period_array = npzfile['arr_0'] QD_array = npzfile['arr_1'] width_array = npzfile['arr_2'] count_array = npzfile['arr_3'] period_list = period_array.tolist() period_list = [0 if x is None else x for x in period_list] period_array = np.array(period_list) #print(period_array.tolist()) #print(width_array[np.where(period_array==0)], QD_array[np.where(period_array==0)], count_array[np.where(period_array==0)]) fig, ax = plt.subplots(1, 1) total = period_array.size bin_number = np.ceil(np.sqrt(total)) // 2 * 2 + 1 bins = np.linspace(-.5, 15, num=17) print(period_array.tolist()) ax.hist(period_array, log=True, normed=True, bins=bins, alpha=1, label='Period of Memory', rwidth=0.8) ax.set_xlabel('Period') ax.set_ylabel('Probability of occurring') fig, ax = plt.subplots(1, 1) total = period_array.size bin_number = np.ceil(np.sqrt(total)) // 2 * 2 + 1 bins = np.linspace(-0.5, 15, num=7) print(period_array.tolist()) ax.hist(period_array, log=True, bins=bins, alpha=1, label='Period of Memory', rwidth=0.8) ax.set_xlabel('Period') ax.set_ylabel('Number of occurrences') data_dict = { 'version': count_array, 'QD': QD_array * 100, 'Width': width_array * 1e-9, 'Count': period_array } data_df = pd.DataFrame(data_dict) grouped = data_df.groupby('version') data = data_df.groupby(by=['Width', 'QD']).mean() piv = pd.pivot_table(data, values='Count', index=['QD'], columns=['Width'], fill_value=0) plt.figure() yticks = piv.index.values.round(2).tolist()[::4] ax = sns.heatmap(piv, vmin=0, vmax=6, square=True, yticklabels=yticks[::1], cbar_kws={'label': 'Average period of lattice'}, cmap='Blues', xticklabels=4) ax.set_yticks(np.array(yticks) * ax.get_ylim()[1] / 10.) ax.invert_yaxis() plt.tight_layout() ax.set_ylabel('Quenched Disorder (%)') ax.set_xlabel('Width (nm)') plt.setp(ax.get_xticklabels(), rotation=90, horizontalalignment='right') plt.setp(ax.get_yticklabels(), rotation=0, verticalalignment='top') for vers, group in grouped: print(vers) data = group.groupby(by=['Width', 'QD']).mean() piv = pd.pivot_table(data, values='Count', index=['QD'], columns=['Width'], fill_value=0) plt.figure() print(piv.index.values.round(2)) print(list(piv.index.values.round(2))[::4]) print(piv.index.values.round(2).tolist()[::4]) yticks = piv.index.values.round(2).tolist()[::4] ax = sns.heatmap(piv, vmin=0, vmax=6, square=True, yticklabels=yticks[::1], cbar_kws={'label': 'Average period of lattice'}, cmap='Blues', xticklabels=4) ax.set_yticks(np.array(yticks) * ax.get_ylim()[1] / 10.) ax.invert_yaxis() plt.tight_layout() ax.set_ylabel('Quenched Disorder (%)') ax.set_xlabel('Width (nm)') plt.setp(ax.get_xticklabels(), rotation=90, horizontalalignment='right') plt.setp(ax.get_yticklabels(), rotation=0, verticalalignment='top')
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

# get the data
dataset = pd.read_csv('Social_Network_Ads.csv')

# checking out the data
dataset.head()

# data info
dataset.info()

# checking for null values
sns.heatmap(dataset.isnull())

# dropping User ID
dataset.drop(['User ID'], axis=1, inplace=True)

# getting dummies for gender
gender = pd.get_dummies(dataset['Gender'], drop_first=True)

# concatenating the gender dataframe with the dataset
dataset = pd.concat([dataset, gender], axis=1)

# drop Gender
dataset.drop(['Gender'], inplace=True, axis=1)
dataset.head()
print("\ninfo :") print('#############################################################') print(ad_data.info) print('#############################################################') sns.pairplot(data=ad_data, x_vars=['TV', 'Radio', 'Newspaper'], y_vars='Sales') plt.show() print("\ncorrelation :") print('#############################################################') print(ad_data.corr()) print('#############################################################') plt.title('Correlation b/w Data Attributes') sns.heatmap(ad_data.corr(), annot=True) plt.show() #creating data X = ad_data['TV'] y = ad_data['Sales'] X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100) print("\nSample of train data :") print('#############################################################') print(X_train.head()) print('#############################################################')