def _plot_kernels(self, kernel_name, sequence_alphabet):
    """Write every kernel of one CNN layer to a CSV table and a logo image.

    Args:
        kernel_name: Name of the attribute on ``self.method.CNN`` holding the
            layer. ``"chain_1"`` / ``"chain_2"`` in this name are replaced by
            ``self.method.chain_names[0]`` / ``[1]`` for the output file names.
        sequence_alphabet: Column labels of the exported matrices; the
            transposed kernel weights are truncated to this many columns.

    Returns:
        ``(figure_outputs, table_outputs)``: two lists of ``ReportOutput``,
        each with one entry per kernel (PNG logos and CSV tables).
    """
    chain_names = self.method.chain_names
    friendly_kernel_name = (kernel_name
                            .replace("chain_1", chain_names[0])
                            .replace("chain_2", chain_names[1]))

    # The layer is the same for every kernel index, so look it up once.
    kernel = getattr(self.method.CNN, kernel_name)
    n_letters = len(sequence_alphabet)

    figure_outputs = []
    table_outputs = []
    for i in range(self.method.kernel_count):
        output_name = f"{friendly_kernel_name}_{i + 1}"

        # .cpu() is a no-op for CPU tensors and keeps .numpy() from failing
        # when the model lives on another device.
        weights = kernel.weight[i].detach().cpu().numpy().T[:, :n_letters]
        kernel_df = pd.DataFrame(weights, columns=sequence_alphabet)

        kernel_csv_path = self.result_path / f"{output_name}.csv"
        kernel_df.to_csv(kernel_csv_path, index=False)
        table_outputs.append(ReportOutput(kernel_csv_path, output_name))

        logo = logomaker.Logo(kernel_df,
                              shade_below=0.5,
                              fade_below=0.5,
                              font_name='Arial Rounded MT Bold',
                              vpad=0.05,
                              vsep=0.01)
        logo_path = self.result_path / f"{output_name}.png"

        # Hide all spines, then re-enable only the left and bottom ones.
        logo.style_spines(visible=False)
        logo.style_spines(spines=('left', 'bottom'), visible=True)
        logo.style_xticks(fmt='%d', anchor=0)

        logo.fig.savefig(str(logo_path))
        plt.close(logo.fig)  # release the figure before drawing the next one
        figure_outputs.append(ReportOutput(logo_path, output_name))

    return figure_outputs, table_outputs
def plot_sequence_logo(self, seq_logo_df, save_fig_path=None):
    """Draw a sequence logo and optionally save it to disk.

    Args:
        seq_logo_df: Matrix passed unchanged to ``logomaker.Logo``.
        save_fig_path: Destination file. When ``None`` nothing is written
            and the logo figure is left open.

    Side effects:
        When a figure is saved and ``self.save_dir`` does not exist yet, the
        directory of ``save_fig_path`` is stored in ``self.save_dir`` and
        printed. The logo figure is closed after saving.
    """
    # plot
    placeholder = plt.figure(dpi=300)
    logo = logomaker.Logo(seq_logo_df)
    if logo.fig is not placeholder:
        # logomaker drew on a figure of its own; do not leak the empty one.
        plt.close(placeholder)

    ax = logo.fig.gca()
    for side in ('right', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    # save
    if save_fig_path is None:
        return

    logo.fig.savefig(save_fig_path, transparent=True, dpi=600)
    if not hasattr(self, 'save_dir'):
        # first time we save: remember (and report) the output directory
        self.save_dir = os.path.dirname(save_fig_path)
        print('fig saved to', self.save_dir)
    plt.close(logo.fig)
def plot_logomaker_abs(samp):
    """Plot a logo of BH-adjusted -log10 p-values from a kpLogo table.

    The input and output paths are built from ``samp`` and the module-level
    ``rootpath``, ``genome_name``, ``experiment``, ``upstreamNts`` and
    ``downstreamNts``. The table's ``p-value`` column is treated as -log10
    p-values: it is converted back to raw p-values, corrected with
    Bonferroni and Benjamini-Hochberg, and the BH values (as -log10) are
    drawn as letter heights. Position 0 is repeated over three columns
    (0, 1, 2) and positive positions are shifted accordingly.

    Args:
        samp: Sample name used in the input and output file paths.

    Side effects:
        Prints intermediate tables and writes ``*_sccLogo_ABS.pdf``.
    """
    inpath = "%s/FPassignment/%s/%s/analysis/kpLogo/%s/%s_%supTO%sdown_kpLogo.pass.p.cutoff.txt" % (
        rootpath, genome_name, experiment, samp, samp, upstreamNts, downstreamNts)
    df = pd.read_csv(inpath, sep='\t')
    print(df)

    df = df.sort_values(by=['position', '#kmer'])
    df = df.reset_index(drop=True)

    ### get raw p-values from -log10 vals
    pvals = [10**(-neglog) for neglog in df['p-value']]
    df['pvals'] = pvals

    ### bonferroni correction
    _, bonf_pvals, _, _ = mtest.multipletests(
        pvals, alpha=0.1, method='bonferroni')
    df['bonf_pval'] = bonf_pvals

    ### FDR, Benjamini Hochberg adjusted pvalues
    _, bh_pvals, _, _ = mtest.multipletests(
        pvals, alpha=0.05, method='fdr_bh')
    df['bh_pval'] = bh_pvals
    df['bh_pval_neglog10'] = [-math.log10(p) for p in bh_pvals]

    ### extract desired values
    dp = df[[
        '#kmer', 'position', 'shift', 'statistics', 'pvals', 'p-value',
        'bh_pval', 'bh_pval_neglog10'
    ]].copy()
    dp['difflog10'] = dp['p-value'] * dp['bh_pval_neglog10']
    dp['log10p'] = dp['p-value'] * np.sign(dp['statistics'])
    dp['log10pBH'] = dp['bh_pval_neglog10'] * np.sign(dp['statistics'])
    print(dp['bh_pval_neglog10'])

    ### set up matrix for lm
    dl = dp[['#kmer', 'position', 'bh_pval_neglog10', 'log10pBH']].copy()
    dl.loc[dl['position'] > 0, 'position'] -= 1
    dl.loc[dl['#kmer'] == 'T', '#kmer'] = 'U'

    ### rep sc 3 positions
    print(dl)
    print("")

    dlm = pd.DataFrame(np.zeros((4, upstreamNts + 3 + downstreamNts)))
    dlm.index = ['A', 'C', 'G', 'U']
    dlm.columns = range(-upstreamNts, downstreamNts + 3)
    print(dlm)

    for nt, pos, val in zip(dl['#kmer'], dl['position'],
                            dl['bh_pval_neglog10']):
        if pos < 0:
            dlm.loc[nt, pos] = val
        elif pos == 0:
            if nt in ('A', 'C', 'G'):
                # position 0 is drawn across three columns
                dlm.loc[nt, [0, 1, 2]] = val
            else:
                print("stop not set")
        elif pos > 0:
            # make room for the two extra columns inserted at position 0
            dlm.loc[nt, pos + 2] = val

    print(dlm)
    dlm = dlm.T  # logomaker expects positions as rows, letters as columns

    fig, ax = plt.subplots(figsize=(12, 6))
    logo = lm.Logo(dlm, font_name='Arial', ax=ax)

    logo.draw_baseline(linewidth=1, color='black', linestyle="-")
    logo.highlight_position_range(0, 2, alpha=0.5, color='lightgray')
    ax.set_ylim(0, 15)

    outpath = '%s/FPassignment/%s/%s/analysis/kpLogo/%s/%s_%supTO%sdown_sccLogo_ABS.pdf' % (
        rootpath, genome_name, experiment, samp, samp, upstreamNts, downstreamNts)
    plt.savefig(outpath, format='pdf')
# read the wild-type ARS1 sequence, ignoring any line that contains '#'
with logomaker.open_example_datafile('ars_wt_sequence.txt',
                                     print_description=False) as f:
    lines = [row.strip() for row in f.readlines() if '#' not in row]
ars_seq = ''.join(lines)

# trim matrix and sequence to the same window
start, stop = 10, 100
ars_df = ars_df.iloc[start:stop, :].reset_index(drop=True)
ars_seq = ars_seq[start:stop]

# create Logo object
ars_logo = logomaker.Logo(ars_df,
                          color_scheme='dimgray',
                          font_name='Luxi Mono')

# color wild-type ARS1 sequence within logo
ars_logo.style_glyphs_in_sequence(sequence=ars_seq, color='darkorange')

# highlight functional regions of ARS1
for pmin, pmax, color in (
        (7, 22, 'lightcyan'),
        (33, 40, 'honeydew'),
        (64, 81, 'lavenderblush'),
):
    ars_logo.highlight_position_range(pmin=pmin, pmax=pmax, color=color)

# additional styling using Logo methods
ars_logo.style_spines(visible=False)

# style using Axes methods
ars_logo.ax.set_ylim([-4, 4])
delta_ax = axs[2]
motif_ax = axs[3]

# global extremes over the reference, alternate and difference matrices
y_min = min(ref.min().min(), alt.min().min(), ref_minus_alt.min().min())
y_max = max(ref.max().max(), alt.max().max(), ref_minus_alt.max().max())

# the three logos share one set of drawing options
logo_style = dict(baseline_width=0, show_spines=True, vsep=0, width=.95)

ref_logo = logomaker.Logo(ref, ax=ref_ax, **logo_style)
#ref_logo.highlight_position(50, color=(1, 0, 0, 0.5))
alt_logo = logomaker.Logo(alt, ax=alt_ax, **logo_style)
delta_logo = logomaker.Logo(ref_minus_alt, ax=delta_ax, **logo_style)
def plot_shift(results, config, out_folder, z_tail, z_head, mc_samples=None,
               seq_ref=None):
    """Plot shift in sequence space across latent vector z.

    Args:
        results: Mapping that provides ``'FactorMuE_variational'`` and
            ``'embed_mean'``.
        config: Configuration; ``config['train']['mc_samples']`` and
            ``config['data']`` entries are read here, the rest goes to
            ``_load_hyperparameters``.
        out_folder: Directory that receives all PDF and CSV outputs.
        z_tail: JSON-encoded latent vector at the tail of the arrow.
        z_head: JSON-encoded latent vector at the head of the arrow.
        mc_samples: Number of Monte Carlo samples; defaults to the value in
            ``config``.
        seq_ref: Optional reference sequence file. When given, the projection
            is conditioned on it and output names are prefixed ``aligned_``.

    Side effects:
        Writes ``z1m1_shift.pdf``, ``shift_magnitude.pdf`` and, for each of
        tail/head/shift, a ``*_logo.pdf`` and ``*_logo.csv`` in
        ``out_folder``.
    """
    # Setup.
    FactorMuE_variational = results['FactorMuE_variational']
    mc_samples = int(config['train']['mc_samples']
                     if mc_samples is None else mc_samples)

    # Load hyperparameters.
    (dtype, bt_scale, b0_scale, l_conc, u_conc, r_conc, z_distr, latent_dims,
     latent_alphabet_size, alphabet, alphabet_size,
     latent_length) = _load_hyperparameters(config)

    # Load reference file.
    if seq_ref is not None:
        data_ref = dataloader.load(seq_ref,
                                   filetype=config['data']['filetype'],
                                   alphabet=config['data']['alphabet'],
                                   dtype=dtype)
        # Keeps the first element of the last batch yielded.
        for x_batch, xlen_batch in data_ref.batch(1):
            x, xlen = x_batch[0], xlen_batch[0]
        prefix = 'aligned_'
    else:
        x, xlen = None, None
        prefix = ''

    # Load latent vector.
    z_head = tf.convert_to_tensor(json.loads(z_head), dtype=dtype)
    z_tail = tf.convert_to_tensor(json.loads(z_tail), dtype=dtype)
    zs = tf.concat((z_tail[None, :], z_head[None, :]), axis=0)

    # Plot embedding (first vs. last latent dimension) with the shift arrow.
    z = results['embed_mean']
    plt.figure(figsize=(8, 8))
    plt.scatter(z[:, 0], z[:, -1], s=5)
    plt.arrow(z_tail[0], z_tail[-1],
              z_head[0] - z_tail[0], z_head[-1] - z_tail[-1],
              length_includes_head=True, head_width=0.03, color='black')
    plt.xlabel(r'$z_1$', fontsize=18)
    plt.ylabel(r'$z_2$', fontsize=18)
    plt.savefig(os.path.join(out_folder, 'z1m1_shift.pdf'))
    plt.close()  # this figure was previously left open, unlike the others

    # Get projection.
    nus = FactorMuE.project_latent_to_sequence(
        zs, FactorMuE_variational, latent_dims, latent_length,
        latent_alphabet_size, alphabet_size, bt_scale, b0_scale,
        u_conc, r_conc, l_conc,
        x=x, xlen=xlen, mc_samples=mc_samples, z_distr=z_distr, dtype=dtype)

    # Plot shift magnitude.
    nu_shift = np.sqrt(np.sum((nus[1] - nus[0]).numpy()**2, axis=1))
    plt.figure(figsize=(8, 6))
    plt.plot(nu_shift, linewidth=2)
    plt.xlabel('conserved position', fontsize=18)
    plt.ylabel('preference shift magnitude', fontsize=18)
    plt.savefig(os.path.join(out_folder, prefix + 'shift_magnitude.pdf'))
    plt.close()

    # Plot tail, head and shift logos; each one is also exported as CSV.
    logo_matrices = (
        ('tail', nus[0].numpy()),
        ('head', nus[1].numpy()),
        ('shift', nus[1].numpy() - nus[0].numpy()),
    )
    for label, matrix in logo_matrices:
        df = pd.DataFrame(matrix, columns=alphabet)
        logomaker.Logo(df)
        stem = os.path.join(out_folder, prefix + label + '_logo')
        plt.savefig(stem + '.pdf')
        plt.close()
        df.to_csv(stem + '.csv')
def calculate_nodes(self):
    """Score every internal node of the tree and store the results.

    Builds a ``PhyloTree`` from ``self.tree_in`` / ``self.align_in``, roots
    it at the midpoint outgroup, and (when ``self.position_matrix`` is not
    set yet) retrieves the features and the position matrix. Leaves listed
    for deletion are removed. For every internal node the score and
    haplotype are computed; if ``self.compute_logos == "Y"`` the haplotype
    matrix and a logo figure are computed as well. Results are attached to
    the nodes as features and stored on the instance in dictionaries keyed
    by a running internal-node counter.

    Sets ``self.processed_tree``, ``self.node_scores``,
    ``self.node_haplotypes``, ``self.node_haplotype_matrices`` and
    ``self.node_haplotype_logos``.

    On any error a message is written to stderr and the process exits with
    status 1.
    """
    try:
        tree = PhyloTree(self.tree_in,
                         alignment=self.align_in,
                         alg_format="fasta")
        tree.set_outgroup(tree.get_midpoint_outgroup())

        leaf_deleting_list = set()
        # `is None` rather than `== None`: an equality test is ambiguous
        # if the matrix is an array-like object.
        if self.position_matrix is None:
            uniprot_hit_hash, leaf_deleting_list = fp.retrieve_features(
                self.study_features, self.table_info, self.min_eval,
                self.uniprot_info)
            # If we want to update the features, we have to delete the
            # position matrix (with update method)
            self.position_matrix = fp.get_positions_matrix(
                uniprot_hit_hash, tree)

        # Snapshot the leaves first so the tree is not modified while the
        # leaf generator is still walking it.
        for leaf in list(tree.iter_leaves()):
            if leaf.name in leaf_deleting_list:
                leaf.delete()

        node_number = 0
        node_scores = {}
        node_haplotypes = {}
        node_haplotype_matrices = {}
        node_haplotype_logos = {}

        for index, node in enumerate(tree.traverse("preorder")):
            node._nid = index
            if node.is_leaf():
                continue

            node_sequence_matrix = fp.annotated_sequence_extractor(
                node, self.position_matrix, self.differentiate_gaps)

            node_score = round(
                fp.calculate_node_score(node_sequence_matrix,
                                        self.calc_alg), 2)
            node.add_feature("node_score", node_score)
            node_scores[node_number] = node_score

            node_haplotype = fp.haplotype_parse(node_sequence_matrix)
            node.add_feature("node_haplotype", node_haplotype)
            node_haplotypes[node_number] = node_haplotype

            if self.compute_logos == "Y":
                node_haplotype_matrix = fp.haplotype_matrix_calculator(
                    node_sequence_matrix)
                node.add_feature("node_haplotype_matrix",
                                 node_haplotype_matrix)
                node_haplotype_matrices[node_number] = node_haplotype_matrix

                if node_haplotype_matrix is not None:
                    # Only the matplotlib figure is kept, not the Logo.
                    node_haplotype_logo = logomaker.Logo(
                        node_haplotype_matrix,
                        color_scheme="dmslogo_funcgroup",
                        show_spines=False).fig
                else:
                    node_haplotype_logo = None
                node.add_feature("node_haplotype_logo", node_haplotype_logo)
                node_haplotype_logos[node_number] = node_haplotype_logo

            node_number += 1

        self.processed_tree = tree
        self.node_scores = node_scores
        self.node_haplotypes = node_haplotypes
        self.node_haplotype_matrices = node_haplotype_matrices
        self.node_haplotype_logos = node_haplotype_logos
    except Exception as err:
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate;
        # the cause is reported instead of being silently discarded.
        sys.stderr.write("Error at calculating nodes.\n")
        sys.stderr.write("{}: {}\n".format(type(err).__name__, err))
        sys.exit(1)
    return
# Check if
matches_df = pd.DataFrame(matches_prot)
matches_df

# %% [markdown]
# Count the number of each type of amino acid at each position.

# %%
prot_aa = []
# DataFrame.iteritems() was removed in pandas 2.0; items() is the
# equivalent column-wise iterator.
for ind, pri_seq in matches_df.items():
    aa_abundance = pri_seq.value_counts()
    prot_aa.append(aa_abundance)
prot_df = pd.DataFrame(prot_aa).fillna(0)
prot_df

# %%
# Display Consensus Sequence
crp_logo = logomaker.Logo(prot_df, figsize=(10, 2), color_scheme='chemistry')

# %% [markdown]
# Identify the error.

# %%
# style and show figure
crp_logo.ax.set_xlabel('Percentage')
crp_logo.ax.set_title('Primary consensus sequence')
crp_logo.fig
crp_logo.fig.show()

# %%
def plotPromoters():
    """Score promoter sequences with DeepLIFT and plot contribution logos.

    Command-line entry point. Reads the promoter and background FASTA
    files, one-hot encodes them with ``vectorizeSequence``, computes
    DeepLIFT contributions of each promoter against ``--N`` different
    background references (the background is rolled by one sequence per
    round) and averages them. For every promoter it writes a logo image
    and a text file of per-position scores.

    Output paths are built as ``args.outdir + <fasta id> + extension``.
    NOTE(review): plain string concatenation, so ``--outdir`` presumably has
    to end with a path separator; confirm against how the script is
    invoked.
    """
    ########################
    #command line arguments#
    ########################

    parser = argparse.ArgumentParser()

    #PARAMETERS
    parser.add_argument(
        "--sequences",
        help="Full path to a fasta-file containing the promoter sequences.",
        type=str)
    parser.add_argument("--outdir",
                        help="Full path to the output directory.")
    parser.add_argument(
        "--N",
        help=
        "How many references are used for averaging single signal sequence contributions.",
        type=int,
        default=10)
    parser.add_argument("--model",
                        help="Full path to the trained keras model.",
                        type=str,
                        default=None)
    parser.add_argument(
        "--background",
        help="Full path to a fasta-file containing the background sequences.",
        type=str)
    parser.add_argument("--target_layer",
                        help="Target layer index for deeplift (default=-3).",
                        type=int,
                        default=-3)
    parser.add_argument("--ylim",
                        help="Limits for y-axis.",
                        type=float,
                        nargs=2,
                        default=None)
    parser.add_argument(
        "--labels",
        help=
        "Full path to a file containing labels used as figure titles. If not given, use fasta IDs.",
        type=str,
        default=None)
    parser.add_argument("--logoType",
                        help="Logo image file extension (default=pdf).",
                        type=str,
                        default='pdf',
                        choices=['png', 'pdf'])

    args = parser.parse_args()

    #reading in the promoter sequences
    ids = []
    signal_seq = []
    for seq in pyfastx.Fasta(args.sequences):
        ids.append(seq.name)
        signal_seq.append(str(seq.seq).upper())
    #and one-hot encoding
    signal = np.array([vectorizeSequence(s) for s in signal_seq])

    #reading in the background sequences and one-hot encoding
    bg = np.array([
        vectorizeSequence(str(seq.seq).upper())
        for seq in pyfastx.Fasta(args.background)
    ])

    #reading in labels if given
    if args.labels is not None:
        with open(args.labels, 'rt') as f:
            # drop the line terminator so it does not end up in the title
            labels = [row.rstrip('\r\n') for row in f]
    else:
        labels = ids

    #initialize the deeplift model
    deeplift_model = kc.convert_model_from_saved_files(
        args.model,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.
        DeepLIFT_GenomicsDefault)
    find_scores_layer_idx = 0  #computes importance scores for the input layer

    deeplift_contribs_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=find_scores_layer_idx,
        target_layer_idx=args.target_layer)

    #and then score each sequence against args.N different background sequences
    scores = np.zeros(shape=(args.N, signal.shape[0], signal.shape[1]))
    for i in range(args.N):
        scores[i, :, :] = np.sum(deeplift_contribs_func(
            task_idx=1,
            input_data_list=[signal],
            input_references_list=[bg[:signal.shape[0], :, :]],
            batch_size=10,
            progress_update=None),
                                 axis=2)
        # shift the background so the next round pairs each promoter with
        # a different reference
        bg = np.roll(bg, 1, axis=0)
    scores = np.mean(scores, axis=0)

    #now the contributions have been calculated, next plotting the
    #sequence logos weighted by the contributions
    for ind, seq in enumerate(signal_seq):
        matrix_df = lm.saliency_to_matrix(seq, scores[ind, :])
        logo = lm.Logo(df=matrix_df, color_scheme='classic')
        logo.ax.set_xlabel('position')
        logo.ax.set_ylabel('contribution')
        logo.ax.set_title(labels[ind])
        if args.ylim is not None:
            logo.ax.set_ylim(args.ylim)

        # Work on the logo's own figure and close it afterwards, so no
        # figure stays open per sequence.
        logo.fig.tight_layout()
        logo.fig.savefig(args.outdir + ids[ind] + '.' + args.logoType,
                         dpi=150,
                         bbox_inches='tight',
                         pad_inches=0)
        plt.close(logo.fig)

        #and then saving the importance scores to a file
        np.savetxt(args.outdir + ids[ind] + '.txt', scores[ind, :])
def plot_shift(results, config, out_folder, z_tail, z_head, mc_samples=None,
               covar_ref=None, seq_ref=None):
    """Plot shift in sequence space across latent vector z.

    ``z_tail`` and ``z_head`` are JSON-encoded latent vectors. The two
    vectors are projected to sequence space with the variational model in
    ``results['RegressMuE_variational']``; when ``seq_ref`` is given the
    projection also receives the reference loaded from ``covar_ref`` /
    ``seq_ref`` and output names are prefixed ``aligned_``. Writes the
    shift-magnitude plot plus tail, head and shift logos (PDF and CSV) to
    ``out_folder``.
    """
    # Setup.
    RegressMuE_variational = results['RegressMuE_variational']
    mc_samples = int(config['train']['mc_samples']
                     if mc_samples is None else mc_samples)

    # Load hyperparameters.
    (dtype, bt_scale, b0_scale, l_conc, u_conc, r_conc, latent_alphabet_size,
     alphabet, alphabet_size, latent_length) = _load_hyperparameters(config)

    # Load latent vector: row 0 is the tail, row 1 the head.
    z_head = tf.convert_to_tensor(json.loads(z_head), dtype=dtype)
    z_tail = tf.convert_to_tensor(json.loads(z_tail), dtype=dtype)
    zs = tf.concat((z_tail[None, :], z_head[None, :]), axis=0)
    latent_dims = zs.shape[1]

    # Load reference file.
    z_covar, x, xlen = None, None, None
    prefix = ''
    if seq_ref is not None:
        data_cfg = config['data']
        data_ref = dataloader.load_joint(
            covar_ref, seq_ref,
            cov_filetype=data_cfg['covariate_filetype'],
            cov_header=data_cfg['covariate_header'],
            seq_filetype=data_cfg['sequence_filetype'],
            alphabet=data_cfg['alphabet'],
            dtype=dtype)
        # Keeps the first element of the last batch yielded.
        for z_batch, x_batch, xlen_batch in data_ref.batch(1):
            z_covar, x, xlen = z_batch[0], x_batch[0], xlen_batch[0]
        prefix = 'aligned_'

    # Get projection.
    nus = RegressMuE.project_latent_to_sequence(
        zs, RegressMuE_variational, latent_dims, latent_length,
        latent_alphabet_size, alphabet_size, bt_scale, b0_scale,
        u_conc, r_conc, l_conc,
        z_covar=z_covar, x=x, xlen=xlen, mc_samples=mc_samples, dtype=dtype)

    def out_path(name):
        return os.path.join(out_folder, prefix + name)

    # Plot shift magnitude.
    nu_shift = np.sqrt(np.sum((nus[1] - nus[0]).numpy()**2, axis=1))
    plt.figure(figsize=(8, 6))
    plt.plot(nu_shift, linewidth=2)
    plt.xlabel('conserved position', fontsize=18)
    plt.ylabel('preference shift magnitude', fontsize=18)
    plt.savefig(out_path('shift_magnitude.pdf'))
    plt.close()

    # Plot tail, head and shift logos, each with a CSV of the matrix.
    for label, matrix in (
            ('tail', nus[0].numpy()),
            ('head', nus[1].numpy()),
            ('shift', nus[1].numpy() - nus[0].numpy()),
    ):
        df = pd.DataFrame(matrix, columns=alphabet)
        logomaker.Logo(df)
        plt.savefig(out_path(label + '_logo.pdf'))
        plt.close()
        df.to_csv(out_path(label + '_logo.csv'))
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GeneMarkS-2 and StartLink+ RBS motifs for each listed genome.

    For every genome in ``args.pf_genome_list`` the two motif models are
    drawn side by side as information logos, and one (or, with
    ``args.verified``, two) summary records are collected. The records are
    then written to CSV and plotted.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    work_dir = env["pd-work"]

    pd_figures = os_join(work_dir, "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all",
                                 figsize=(8, 4))

        # both panels: probability matrix -> information logo
        panels = (
            (axes[0], df_gms2, "GeneMarkS-2"),
            (axes[1], df_toolp, "StartLink+"),
        )
        for axis, pwm_df, panel_title in panels:
            info_mat = lm.transform_matrix(pwm_df,
                                           from_type="probability",
                                           to_type="information")
            lm.Logo(info_mat, color_scheme="classic", ax=axis)
            axis.set_ylim(0, 2)
            axis.set_title(panel_title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(
            os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if args.verified:
            # verified
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(
                env, gi, group=group)
            genome_label = fix_names(gi.name)
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            genome_label = fix_names(gi.name)
            list_run_info.append({
                "Genome": genome_label,
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })
            print(list_run_info[-2:])
        else:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })

    import sbsp_viz.sns as sns

    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(work_dir, ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(work_dir),
                         xlabel="Genome",
                         ylabel="Error"))
        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(work_dir),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))
        return

    # not verified: first all genomes ...
    df = pd.DataFrame(list_run_info)
    sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(work_dir),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0, 10],
                    ))
    df.to_csv(next_name(work_dir, ext="csv"))

    sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                    figure_options=FigureOptions(
                        save_fig=next_name(work_dir)
                    ))

    print("Average Error: {}".format(df["Accuracy"].mean()))

    # ... then only the genomes whose "Accuracy" value is below 2
    df = pd.DataFrame(list_run_info)
    df = df[df["Accuracy"] < 2].copy()

    sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(work_dir),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0, 10],
                    ))

    sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                    figure_options=FigureOptions(
                        save_fig=next_name(work_dir)
                    ))

    print("Average Error: {}".format(df["Accuracy"].mean()))

    df.to_csv(next_name(work_dir, ext="csv"))
# do imports
import matplotlib.pyplot as plt
import numpy as np
import logomaker

# load saliency matrix
nn_df = logomaker.get_example_matrix('nn_saliency_matrix',
                                     print_description=False)

# create Logo object
nn_logo = logomaker.Logo(nn_df)

# style using Logo methods: hide every spine, then show a shortened left one
nn_logo.style_spines(visible=False)
nn_logo.style_spines(spines=['left'], visible=True, bounds=[0, .75])

# style using Axes methods
nn_ax = nn_logo.ax
nn_ax.set_xlim([20, 115])
nn_ax.set_xticks([])
nn_ax.set_ylim([-.6, .75])
nn_ax.set_yticks([0, .75])
nn_ax.set_yticklabels(['0', '0.75'])
nn_ax.set_ylabel('                 saliency', labelpad=-1)

# set parameters for drawing gene
exon_start = 55 - .5
exon_stop = 90 + .5
y = -.2
xs = np.arange(-3, len(nn_df), 10)
ys = np.full(len(xs), y)
# do imports
import matplotlib.pyplot as plt
import logomaker as logomaker

# load crp energy matrix (sign flipped, so the logo shows -ddG)
crp_df = -logomaker.get_example_matrix('crp_energy_matrix',
                                       print_description=False)

# create Logo object
crp_logo = logomaker.Logo(crp_df,
                          shade_below=.5,
                          fade_below=.5,
                          font_name='Arial Rounded MT Bold')

# style using Logo methods
crp_logo.style_spines(visible=False)
crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
crp_logo.style_xticks(rotation=90, fmt='%d', anchor=0)

# style using Axes methods
# Raw string: "\D" is not a valid escape sequence in a normal string
# literal. The resulting label text is unchanged.
crp_logo.ax.set_ylabel(r"$-\Delta \Delta G$ (kcal/mol)", labelpad=-1)
crp_logo.ax.xaxis.set_ticks_position('none')
crp_logo.ax.xaxis.set_tick_params(pad=-1)

# style and show figure
crp_logo.fig.show()
# per-position information: log2(4) plus sum_a p*log2(p); the 1e-10 keeps
# log2 finite for zero entries
I = np.log2(4) + np.sum(
    kmer_motif * np.log2(kmer_motif + 1e-10), axis=1, keepdims=True)
# letter heights, floored at a tiny positive value
logo = np.maximum(I * kmer_motif, 1e-7)

# setup dataframe for logomaker
L = len(kmer_motif)
counts_df = pd.DataFrame(data=0.0,
                         columns=list(alphabet),
                         index=list(range(L)))
# fill the first four letter columns in one assignment
counts_df.iloc[:, :4] = logo[:, :4]

fig = plt.figure(figsize=(3, 2))
ax = plt.subplot(111)
logomaker.Logo(counts_df, ax=ax)

# bare logo: no spines, no ticks
for side in ('right', 'top', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
plt.xticks([])
plt.yticks([])

outfile = os.path.join(motif_path, experiment + '_kmer_motif_logo.png')
# Let matplotlib infer the format from the ".png" extension; forcing
# format='pdf' wrote PDF data into a file named .png.
fig.savefig(outfile, dpi=200, bbox_inches='tight')

#-----------------------------------------------------------------------------
# kmer mutagenesis

print("performing k-mer mutagenesis analysis")
def line_wrapped_logo(
    tidy_df,
    *,
    site_col='site',
    letter_col='letter',
    height_col='height',
    color_col='color',
    sitelabel_col=None,
    highlight_color_col=None,
    highlight_alpha_col=None,
    sites_per_line=100,
    scalewidth=1,
    scaleheight=1,
    fade_letters_by_height=None,
    logo_kwargs=None,
    ylims=None,
    all_letters=('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
                 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'),
    missing_letter='error',
    letters_to_drop=('*', ),
    style_xticks_kwargs=None,
    xlabel=None,
    ylabel=None,
    label_fontsize=16,
    xlabelpad=10,
    baseline_on_top=True,
):
    """Draw logo wrapping several lines with custom colors and overlays.

    Parameters
    -----------
    tidy_df : pandas.DataFrame
        Holds data in tidy format, one line per letter.
    site_col : str
        Column in `tidy_df` with site number.
    letter_col : str
        Column in `tidy_df` with letter identity (e.g., amino acid).
    height_col : str
        Column in `tidy_df` with letter height.
    color_col : str
        Column in `tidy_df` with letter color.
    sitelabel_col : None or str
        Column in `tidy_df` with labels for site ticks if different
        than `site_col`.
    highlight_color_col : None or str
        Column in `tidy_df` with background highlight color, or `None` or
        `NA` if site not highlighted. Only one color can be assigned per site.
    highlight_alpha_col : None or str
        Column in `tidy_df` with background color alpha (transparency) for
        site highlighting. If not present, defaults to 0.25.
    sites_per_line : int
        Number of sites per line.
    scalewidth : float
        Scale overall figure width by this much.
    scaleheight : float
        Scale overall figure height by this much.
    fade_letters_by_height : None or 2-tuple
        If not `None`, set alpha transparency of letters proportional to
        their height, going from `(min_alpha, max_alpha)`.
    logo_kwargs : None or dict
        Keyword arguments to ``logomaker.Logo``. Key ones include
        'width', 'vpad', and 'font_name'.
    ylims : 2-tuple or None
        Y-axis limits passed to ``ax.set_ylim`` for every line. If `None`,
        the limits chosen by the plotting library are left unchanged.
    all_letters : tuple or list
        All letters for which we plot heights.
    missing_letter : {'zero_height', 'error'}
        If letter is missing at a site, assign zero height or raise error?
    letters_to_drop : tuple or list
        Do not plot these letters.
    style_xticks_kwargs : None or dict
        Keyword arguments to pass to ``logomaker.Logo.style_xticks``.
        For instance, to change spacing between tick labels to every 10th
        site, use ``style_xticks_kwargs={'spacing': 10}``.
    xlabel : str or None
        Label for x-axis (shared over entire plot).
    ylabel : str or None
        Label for y-axis (shared over entire plot).
    label_fontsize : int
        Size of labels drawn for `xlabel`, `ylabel`.
    xlabelpad : float
        Padding above x-axis label.
    baseline_on_top : bool
        Draw baseline (horizontal line at 0 height) on top of letters.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding one axes per wrapped line.

    Raises
    ------
    ValueError
        If `tidy_df` lacks an expected column, `letters_to_drop` overlaps
        `all_letters`, a site has several labels / highlight values, or
        `fade_letters_by_height` is not a valid range.

    """
    # `None` (the default) means "no extra keyword arguments"; without this
    # the `**logo_kwargs` expansion below raises TypeError.
    if logo_kwargs is None:
        logo_kwargs = {}

    expect_cols = [site_col, letter_col, height_col, color_col]
    for col in [sitelabel_col, highlight_color_col, highlight_alpha_col]:
        if col is not None:
            expect_cols.append(col)
    for col in expect_cols:
        if col not in tidy_df.columns:
            raise ValueError(f"`tidy_df` lacks column {col}")

    if set(letters_to_drop).intersection(set(all_letters)):
        raise ValueError('overlap between `letters_to_drop` and `all_letters`')

    # drop any extra letters
    tidy_df = tidy_df.query(f"{letter_col} not in {letters_to_drop}")

    # make wide data frame for logomaker
    wide_df = tidy_to_wide_df(tidy_df, site_col, letter_col, height_col,
                              all_letters)

    # dict matching (site, letter) to color
    colors = tidy_df.set_index([site_col, letter_col])[color_col].to_dict()

    # dicts matching sites to labels, highlight color, and alpha
    sitelabels = collections.defaultdict(lambda: '')
    highlight_colors = {}
    highlight_alphas = collections.defaultdict(lambda: 0.25)
    for d, col in [
        (sitelabels, sitelabel_col),
        (highlight_colors, highlight_color_col),
        (highlight_alphas, highlight_alpha_col),
    ]:
        if col is not None:
            site_vals = (tidy_df[tidy_df[col].notnull()]
                         [[site_col, col]]
                         .drop_duplicates())
            # each site may carry at most one distinct value
            dup_site_vals = (site_vals
                             .groupby(site_col)
                             .aggregate(n=pd.NamedAgg(col, 'count'))
                             .query('n > 1'))
            if len(dup_site_vals):
                raise ValueError(f"multiple {col} for sites:\n{dup_site_vals}")
            for k, v in site_vals.set_index(site_col)[col].to_dict().items():
                d[k] = v

    # set up figure
    nsites = len(wide_df)
    nlines = math.ceil(nsites / sites_per_line)
    sites_per_line = min(sites_per_line, nsites)  # reduce if needed
    fig = plt.figure(figsize=(scalewidth * sites_per_line * 0.3,
                              scaleheight * nlines * 1.75), )

    # map letters to fading
    if fade_letters_by_height:
        letter_fading = {}
        min_alpha, max_alpha = fade_letters_by_height
        if not 0 <= min_alpha < max_alpha <= 1:
            raise ValueError('fade_letters_by_height must span non-zero '
                             'range between 0 and 1')
        min_height = wide_df.abs().min().min()
        max_height = wide_df.abs().max().max()
        for site, letter in itertools.product(wide_df.index, wide_df.columns):
            abs_height = abs(wide_df.at[site, letter])
            norm_fade = (abs_height - min_height) / (max_height - min_height)
            assert 0 <= norm_fade <= 1, norm_fade
            fade = norm_fade * (max_alpha - min_alpha) + min_alpha
            assert min_alpha <= fade <= max_alpha
            letter_fading[(site, letter)] = fade

    # auto-determine y-axis limits
    # NOTE(review): `ymin` / `ymax` computed here are never assigned to
    # `ylims`, so with `ylims=None` the call `ax.set_ylim(None)` below does
    # not apply them. Left as is; confirm the intended behavior before
    # wiring them in (in the mixed-sign branch `ymin` is also padded using
    # the already-enlarged `ymax`).
    ypad = 1.02
    if ylims is None:
        if all(tidy_df[height_col] >= 0):
            ymin = 0
            ymax = ypad * tidy_df[height_col].max()
        elif all(tidy_df[height_col] <= 0):
            ymax = 0
            ymin = ypad * tidy_df[height_col].min()
        else:
            ymax = tidy_df[height_col].max()
            ymin = tidy_df[height_col].min()
            ymax += ypad * (ymax - ymin)
            ymin -= ypad * (ymax - ymin)

    # arguments for xtick styling
    xticks_kwargs = {
        'spacing': 5,  # number every five sites
        'rotation': 90,  # rotated tick marks
        'fontdict': {
            'verticalalignment': 'top',
            'horizontalalignment': 'center',
            'fontsize': 10
        },
    }
    if style_xticks_kwargs is not None:
        for key, val in style_xticks_kwargs.items():
            xticks_kwargs[key] = val

    # draw logos for each line of figure
    for iline in range(nlines):  # loop over lines
        df = wide_df.iloc[iline * sites_per_line:(iline + 1) * sites_per_line]
        isites = df.index.tolist()  # sites being plotted on this axis

        ax = plt.subplot2grid(
            shape=(nlines, sites_per_line),
            loc=(iline, 0),
            colspan=len(df),  # number of sites for this line
            fig=fig,
        )

        logo = logomaker.Logo(
            df=df,
            ax=ax,
            **logo_kwargs,
        )

        # color letters
        for site, letter in itertools.product(isites, all_letters):
            style_kwargs = {}
            if (site, letter) in colors:
                style_kwargs['color'] = colors[(site, letter)]
            if fade_letters_by_height:
                style_kwargs['alpha'] = letter_fading[(site, letter)]
            if style_kwargs:
                logo.style_single_glyph(p=site, c=letter, **style_kwargs)

        # highlight sites
        for site, highlight_color in highlight_colors.items():
            if site in isites:
                logo.highlight_position(
                    p=site,
                    color=highlight_color,
                    alpha=highlight_alphas[site],
                )

        # format axes and ticks
        logo.style_spines(visible=False)
        if sitelabels:
            # custom labels: tick every site so each one can be relabeled
            xticks_kwargs['spacing'] = 1
            logo.style_xticks(**xticks_kwargs)
            ax.set_xticklabels([
                str(sitelabels[site])
                for site in range(min(isites), max(isites) + 1)
            ])
        else:
            logo.style_xticks(**xticks_kwargs)
        ax.tick_params(
            axis='x',
            length=0,  # no xtick lines
            pad=0,  # no padding between xtick labels and axis
        )
        ax.set_ylim(ylims)
        ax.set_yticks([])

        # draw baseline on top of letters?
        if baseline_on_top:
            logo.draw_baseline(zorder=1)

    # set figure-wide axis labels: https://stackoverflow.com/a/53172335
    if xlabel or ylabel:
        ax_fig = fig.add_subplot(111, facecolor='none', frameon=False)
        ax_fig.tick_params(labelcolor='none', top=False, bottom=False,
                           left=False, right=False)
        if xlabel:
            ax_fig.set_xlabel(xlabel, fontsize=label_fontsize,
                              labelpad=xlabelpad)
        if ylabel:
            ax_fig.set_ylabel(ylabel, fontsize=label_fontsize)

    fig.tight_layout(h_pad=1.5)

    return fig
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Plot per-letter motif probabilities by position, with summary panels.

    The figure is a 4x2 grid: one box/line plot per letter (first four
    letters of the motif), a bar plot of the shift counts, a line plot of the
    position distributions per shift, an information logo of the alignment
    and a monospace text dump of the reduced alignment. The figure is saved
    to ``next_name(env["pd-work"])`` and then shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read here
        (it is also forwarded to the motif-extension helper).
    :param df: data frame whose ``col`` column holds, per row, a mapping
        ``letter -> list of per-position values``.
    :param col: name of the motif-matrix column; the position-distribution
        column name is derived by replacing ``"_MAT"`` with ``"_POS_DISTR"``.
    :param title: text inserted into the figure's super-title.
    :raises ValueError: if a motif value lies outside ``[0, 1]``.
    """
    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)

    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    # Third axis of `array` is indexed by the alphabetical rank of the letter.
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    fig = plt.figure(figsize=(10, 12))
    shape = (4, 2)

    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))

    axes = [ax1, ax2, ax3, ax4]

    # One panel per letter; zip() silently limits this to four letters.
    ylim = [-0.1, 1.1]
    for l, ax in zip(letters, axes):
        letter_idx = letter_to_idx[l]
        positions = list()
        probs = list()

        for w_pos in range(array.shape[1]):
            for index in range(len(update_shifts)):
                shift = update_shifts[index]
                # Keep only positions inside the shifted window.
                # NOTE(review): the hard-coded 6 is presumably the width of
                # the un-extended motif -- confirm before changing.
                if w_pos < shift or w_pos >= shift + 6:
                    continue

                prob = array[index, w_pos, letter_idx]
                if prob < 0 or prob > 1:
                    raise ValueError("Something's up")

                positions.append(w_pos)
                probs.append(prob)

        ax.set_title(f"{l}")

        # A separate name keeps the `df` argument intact for later use.
        letter_df = pd.DataFrame({"Position": positions,
                                  "Probability": probs})
        letter_df.sort_values("Position", inplace=True)
        letter_mean = letter_df.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: seaborn >= 0.12 rejects them
        # positionally, and keywords work on older releases too.
        seaborn.boxplot(x="Position", y="Probability", data=letter_df,
                        ax=ax, color="red", fliersize=0)
        seaborn.lineplot(x=letter_mean["Position"],
                         y=letter_mean["Probability"], ax=ax, color="blue")
        ax.set_ylim(ylim)

    # add logo
    ax = ax_logo
    msa_t = collect["msa_t"]
    seqs = [x.seq._data for x in msa_t.list_alignment_sequences]

    counts_mat = lm.alignment_to_matrix(sequences=seqs, to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat, from_type='counts',
                                   to_type='information')

    lm.Logo(info_mat, ax=ax, color_scheme="classic")
    ax.set_ylim([0, 2])

    # add bar plot of starting positions (percentage of rows per shift)
    ax = ax_counts
    counter = Counter(update_shifts)
    total = sum(counter.values())
    # Shifts 0..3 that never occur are still drawn, as zero-height bars.
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / total] for x in counter] \
        + [[x, 0] for x in to_add]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0], y=normalized[:, 1], ax=ax,
                    color="blue")
    ax.set_ylim([0, 100])
    ax.set_ylabel("Probability")
    ax.set_xlabel("Shift in consensus")

    # Plot position distribution, one line per shift
    col_pos = col.replace("_MAT", "_POS_DISTR")
    ax = ax_pos_dist

    shift_to_pos_dist = get_position_distributions_by_shift(
        df, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        # Average the per-row distributions key by key.
        values = dict()
        for pos_dist in shift_to_pos_dist[s]:
            # Entries that are not mappings (no .keys()) are skipped, as
            # the previous blanket try/except did.
            if not hasattr(pos_dist, "keys"):
                continue
            for i in pos_dist.keys():
                values.setdefault(i, list()).append(pos_dist[i])

        for i in values.keys():
            values[i] = np.mean(values[i])

        # Normalise the averaged values so they sum to one.
        total = sum(values.values())
        for i in values.keys():
            values[i] /= total

        x = sorted(values.keys())
        y = [values[a] for a in x]

        seaborn.lineplot(x=x, y=y, label=s, ax=ax)

    ax.legend()

    # TEXT
    ax = ax_text
    from matplotlib.font_manager import FontProperties
    fp = FontProperties()
    fp.set_family("monospace")

    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax.text(0, 0, msa_text,
            horizontalalignment='left',
            verticalalignment='center',
            fontproperties=fp)
    ax.set_xlim([-0.2, 0.4])
    ax.set_ylim([-0.4, 0.4])
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))
    plt.tight_layout()
    # Leave head-room for the super-title, which tight_layout ignores.
    plt.subplots_adjust(top=0.9)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def plot_frequency(samp):
    """Draw a nucleotide-frequency logo for sample ``samp`` and save a PDF.

    Reads the kpLogo table for ``samp`` (path built from the module-level
    ``rootpath``, ``genome_name``, ``experiment``, ``upstreamNts`` and
    ``downstreamNts``), computes ``n1 / (n1 + n2)`` for every row, arranges
    the values into a position x nucleotide matrix and plots it with
    logomaker. The figure is written to ``<rootDir>/figures/Fig5B.pdf``.

    Intermediate tables are printed for inspection. The ``print`` calls take
    a single argument so the function runs on both Python 2 and Python 3.
    """
    inpath = "%s/FPassignment/%s/%s/analysis/kpLogo/%s/%s_%supTO%sdown_kpLogo.pass.p.cutoff.txt" % (
        rootpath, genome_name, experiment, samp, samp, upstreamNts,
        downstreamNts)
    df = pd.read_csv(inpath, sep='\t')
    print(df)

    df = df.sort_values(by=['position', '#kmer'])
    df = df.reset_index(drop=True)

    # Fraction of counts in the first group; float() guards against integer
    # division on Python 2.
    df['freq'] = [
        float(n1) / (float(n1) + float(n2))
        for n1, n2 in zip(df['n1'], df['n2'])
    ]

    # set up matrix for lm
    dl = df[['#kmer', 'position', 'freq']].copy()
    # Positive positions are shifted down by one, so input position 1
    # becomes 0.
    dl.loc[dl['position'] > 0, 'position'] -= 1
    dl.loc[dl['#kmer'] == 'T', '#kmer'] = 'U'
    print(dl)
    print("")

    # Position 0 is widened to three columns (0, 1, 2), hence the "+ 3".
    dlm = pd.DataFrame(np.zeros((4, upstreamNts + 3 + downstreamNts)))
    dlm.index = ['A', 'C', 'G', 'U']
    dlm.columns = range(-upstreamNts, downstreamNts + 3)
    print(dlm)

    for i in dl.index:
        nt = dl.loc[i, '#kmer']
        pos = dl.loc[i, 'position']
        val = dl.loc[i, 'freq']

        if pos < 0:
            dlm.loc[nt, pos] = val
        elif pos == 0:
            if nt in ('A', 'C', 'G'):
                # The same value fills all three columns of the widened
                # position.
                dlm.loc[nt, [0, 1, 2]] = val
            else:
                print("stop not set")
        elif pos > 0:
            # Shift past the two extra columns inserted at position 0.
            dlm.loc[nt, pos + 2] = val

    print(dlm)
    dlm = dlm.T  # logomaker expects positions as rows, letters as columns

    fig, ax = plt.subplots(figsize=(30, 6))
    logo = lm.Logo(dlm, font_name='Arial', ax=ax, vpad=0.05)
    logo.draw_baseline(linewidth=1, color='black', linestyle="-")
    ax.set_ylim(0, 1)

    # NOTE(review): the input path uses `rootpath` but the output uses
    # `rootDir`; both are defined outside this function -- confirm that is
    # intended.
    outpath = "%s/figures/Fig5B.pdf" % rootDir
    plt.savefig(outpath, format='pdf', bbox_inches="tight")
# Example script: probability logo for a splice-site matrix.
import matplotlib.pyplot as plt
import logomaker

# Fetch the bundled splice-site probability matrix.
ss_df = logomaker.get_example_matrix('ss_probability_matrix',
                                     print_description=False)

# Draw the logo; taller stacks are more opaque and small letters sit on top.
ss_logo = logomaker.Logo(
    ss_df,
    width=.8,
    vpad=.05,
    fade_probabilities=True,
    stack_order='small_on_top',
    color_scheme='dodgerblue',
    font_name='Rosewood Std',
)

# Hide the vertical spines via the Logo helper.
ss_logo.style_spines(spines=['left', 'right'], visible=False)

# Label each column with its signed offset (there is no position 0), then
# finish the axes directly through matplotlib.
ss_logo.ax.set_xticks(range(len(ss_df)))
ss_logo.ax.set_xticklabels(
    ['{:+d}'.format(offset) for offset in (-3, -2, -1, 1, 2, 3, 4, 5, 6)])
ss_logo.ax.set_yticks([0, .5, 1])
ss_logo.ax.axvline(2.5, color='k', linewidth=1, linestyle=':')
ss_logo.ax.set_ylabel('probability')

# Display the result.
ss_logo.fig.show()
def cluster_logo_plot(adata, obs_col, obs_val, lengths="all"):
    """Plot TRA/TRB CDR3 probability logos for one group of cells.

    Rows of ``adata.obs`` where ``obs_col == obs_val`` are selected. For each
    chain (TRA in the left column, TRB in the right) one logo is drawn per
    CDR3 length, built from the per-position letter frequencies of the
    sequences of that length.

    Parameters
    ----------
    adata : object with an ``obs`` data frame containing the columns
        ``TRA_cdr3``, ``TRB_cdr3``, ``TRA_cdr3_length``, ``TRB_cdr3_length``
        and ``obs_col``.
    obs_col : str
        Column of ``adata.obs`` used to select the group.
    obs_val : object
        Value of ``obs_col`` identifying the group.
    lengths : {"all", "dominant"}
        ``"all"`` draws every observed length; ``"dominant"`` draws only the
        most frequent length of each chain.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If ``lengths`` is not one of the accepted values.
    """
    length_args = ["all", "dominant"]
    if lengths not in length_args:
        raise ValueError("length argument must be one of %s" % length_args)

    logo_clust = adata.obs[adata.obs[obs_col] == obs_val][[
        "TRA_cdr3", "TRB_cdr3", "TRA_cdr3_length", "TRB_cdr3_length"
    ]]

    # Number of logo rows each chain needs.
    if lengths == "dominant":
        num_alpha_lengths = 1
        num_beta_lengths = 1
    else:
        num_beta_lengths = len(set(logo_clust["TRB_cdr3_length"]))
        num_alpha_lengths = len(set(logo_clust["TRA_cdr3_length"]))

    figRows = max([num_beta_lengths, num_alpha_lengths])

    # NEED TO FIGURE OUT HOW TO MAKE A GOOD FIGSIZE CALCULATION
    fig, ax = plt.subplots(nrows=figRows, ncols=2,
                           figsize=(10 * figRows, 3 * figRows))

    logo_style = dict(width=.8, vpad=.05, fade_probabilities=True,
                      stack_order='small_on_top', color_scheme='dodgerblue',
                      font_name='Rosewood Std')
    count_title_font = {"fontsize": 10, "fontweight": "bold"}
    chain_title_font = {"fontsize": 15, "fontweight": "bold"}

    for num, seqtype in enumerate(["TRA", "TRB"]):
        seq_col = "{seqtype}_cdr3".format(seqtype=seqtype)
        len_col = "{seqtype}_cdr3_length".format(seqtype=seqtype)

        if lengths == "dominant":
            chain_lengths = [logo_clust[len_col].value_counts().idxmax()]
        else:
            chain_lengths = sorted(set(logo_clust[len_col]))

        for row, seqlen in enumerate(chain_lengths):
            logo_seqs = logo_clust.loc[logo_clust[len_col] == seqlen, seq_col]
            n_seqs = len(logo_seqs)

            # Sorted so the column order is reproducible between runs
            # (iteration order of a set of strings is not).
            unique_AA = sorted(set("".join(seq for seq in logo_seqs)))

            # Probability matrix: rows are positions, columns are letters.
            prob_df = pd.DataFrame(0.0, index=range(int(seqlen)),
                                   columns=unique_AA)
            # .iloc[0] is positional; the previous logo_seqs[0] was a label
            # lookup that fails when the obs index has no label 0.
            for indx in range(len(logo_seqs.iloc[0])):
                letter_counts = Counter(seq[indx] for seq in logo_seqs)
                for letter, count in letter_counts.items():
                    prob_df.loc[indx, letter] = count / float(n_seqs)

            # plt.subplots returns a 1-D axes array when there is one row.
            single_row = figRows == 1
            target_ax = ax[num] if single_row else ax[row, num]

            logomaker.Logo(prob_df, ax=target_ax, **logo_style)
            target_ax.set_title(
                "Number of seqs: {seqlen}".format(seqlen=n_seqs),
                count_title_font)

            # Chain heading above the first logo of each column. The two
            # layouts use different placement constants.
            if single_row:
                center = seqlen / 1.75
                height = 1.1 + (figRows / 15.0)
            elif row == 0:
                center = (seqlen + .75) / 2
                height = 1 + (figRows / 15.0)
            else:
                continue
            target_ax.text(center, height,
                           "{seqtype} CDR3".format(seqtype=seqtype),
                           chain_title_font,
                           horizontalalignment="right")

    fig.tight_layout()

    # Blank the unused axes of whichever chain has fewer distinct lengths.
    if num_beta_lengths > num_alpha_lengths:
        for noLogo in range(num_alpha_lengths, num_beta_lengths):
            ax[noLogo, 0].axis("off")
    elif num_beta_lengths < num_alpha_lengths:
        for noLogo in range(num_beta_lengths, num_alpha_lengths):
            ax[noLogo, 1].axis("off")

    return (fig)
# get attribution scores attr_score = saliency(model, X, class_index=0, layer=-2, batch_size=256) attr_score = attr_score * X # plot attribution scores for sequences with top predictions N, L, A = attr_score.shape for i in range(len(X)): counts_df = pd.DataFrame(data=0.0, columns=list('ACGU'), index=list(range(L))) for a in range(A): for l in range(L): counts_df.iloc[l, a] = attr_score[i][l, a] logomaker.Logo(counts_df, figsize=(25, 2)) ax = plt.gca() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') plt.xticks([]) plt.yticks([]) fig = plt.gcf() fig.savefig(str(data_path) + '/Top_Predictions/' + str(i)) fig, W, logo = plot_filters(model, x_test, layer=3, threshold=0.5, window=20,
def test_Logo_highlight_position_range():
    """Check argument validation of ``Logo.highlight_position_range``.

    Each case names one parameter, the values that must be rejected, the
    values that must be accepted, and the remaining arguments held fixed
    while that parameter is varied. A fresh Logo is built for every case.
    """
    good_crp_df = logomaker.get_example_matrix('crp_energy_matrix',
                                               print_description=False)

    span = {'pmin': 5, 'pmax': 10}
    bad_colors = ['x', 1, True, 'wrong_color']

    # (parameter, must fail, must succeed, fixed keyword arguments)
    cases = [
        ('pmin', ['x', 20], [0, 1, 10], {'pmax': 15}),
        ('pmax', ['x', 1], [5.5, 6, 10], {'pmin': 5}),
        ('padding', ['x', -1], [-0.5, 0, 10], span),
        ('color', bad_colors, ['pink', 'red', [1, 1, 1]], span),
        ('edgecolor', bad_colors, [None, 'pink', 'red', [1, 1, 1]], span),
        ('floor', ['x', 10], [-1, 1, None], span),
        ('ceiling', ['x', -10], [-1, 1, None], span),
        # Note that a value of False passes for zorder. This should be fixed.
        ('zorder', ['x', None], [-1, 0.5, 1], span),
    ]

    for parameter, rejected, accepted, fixed in cases:
        test_parameter_values(
            func=logomaker.Logo(good_crp_df).highlight_position_range,
            var_name=parameter,
            fail_list=list(rejected),
            success_list=list(accepted),
            **fixed)
def weblogologomaker(request):
    """Build a sequence logo for posted sequences and return it.

    Expects a POST whose ``request.data`` carries ``seqs`` and, optionally,
    ``output`` (``"txt"`` or anything else for an image; default ``"png"``)
    and ``os`` (default ``"linux"``, forwarded to ``weblogo_aux``).

    Returns an ``HttpResponse`` with content type ``text/plain`` holding
    either the WebLogo text table (``output == "txt"``) or the base64-encoded
    PNG of the logomaker figure. Any other HTTP method raises ``Http404``.

    Side effects: reads ``aligned.fasta`` and writes ``weblogo.txt`` and
    ``weblogo.png`` in the current working directory.
    NOTE(review): these fixed file names are shared by all requests, so
    concurrent requests can overwrite each other's files -- confirm whether
    the deployment serialises calls.
    """
    if request.method != "POST":
        raise Http404

    data = request.data
    seqs = data['seqs']
    type_output = data.get('output', "png")
    type_os = data.get('os', "linux")

    # Called for its side effect; presumably this produces aligned.fasta,
    # which is read just below -- TODO confirm.
    weblogo_aux(seqs, type_os)

    with open("aligned.fasta", "r") as aligned_file:
        seqs = read_seq_data(aligned_file, alphabet="ACDEFGHIKLMNPQRSTVWY-")

    logodata = LogoData.from_seqs(seqs)
    logooptions = LogoOptions()
    logooptions.title = "VFP WEBSERVER"
    logoformat = LogoFormat(logodata, logooptions)
    weblogo_txt = txt_formatter(logodata, logoformat)

    # Decode instead of slicing the bytes repr and un-escaping it by hand.
    if isinstance(weblogo_txt, bytes):
        data_weblogo = weblogo_txt.decode("utf-8")
    else:
        data_weblogo = str(weblogo_txt)

    weblogo_file = "weblogo.txt"
    with open(weblogo_file, "w") as weblogo:
        weblogo.write(data_weblogo)

    # The table starts after 7 header lines and its last row is dropped.
    weblogoDf = pd.read_csv(weblogo_file, skiprows=7, sep='\t')
    weblogoDf = weblogoDf[:-1]
    weblogoDf.columns = [name.replace(' ', '') for name in weblogoDf.columns]

    # Letter columns are everything but the first and the last four columns;
    # each row is scaled by (log2(20) - Entropy) of its position.
    weblogo_entropyes = weblogoDf.loc[:, weblogoDf.columns[1:-4]]
    entropies = list((np.log2(20) - weblogoDf.loc[:, 'Entropy']))
    weblogo_entropyes = weblogo_entropyes.mul(entropies, axis=0)
    family_weblogo = weblogo_entropyes.drop(['-'], axis=1)

    if type_output == "txt":
        with open(weblogo_file) as weblogo:
            return HttpResponse(weblogo.read(), content_type="text/plain")

    data = logomaker.transform_matrix(family_weblogo)

    # create figure: one subplot per line of `line_size` positions
    height_per_row = 2
    width_per_col = 1.5
    line_size = 25
    num_rows = data.shape[0] // line_size + 1
    fig = plt.figure(figsize=[width_per_col * line_size,
                              height_per_row * num_rows])
    try:
        max_df = data.sum(axis=1).max()

        for row in range(num_rows):
            first = row * line_size
            last = (row + 1) * line_size

            # reindex pads positions past the end of the alignment with
            # zero-height columns, so the final (partial) line has the same
            # width as the others. Handling every line here also covers
            # alignments shorter than one line, which used to hit an
            # unbound loop variable.
            chunk = data.reindex(range(first, last), fill_value=0)

            ax = plt.subplot2grid((num_rows, 1), (row, 0))
            ax.spines['right'].set_visible(False)
            ax.spines['top'].set_visible(False)
            ax.set_ylim(bottom=0, top=max_df)
            ax.set_ylabel('Bits')

            logo = logomaker.Logo(chunk, ax=ax,
                                  color_scheme='NajafabadiEtAl2017')

            if row == num_rows - 1:
                logo.ax.set_xlim([first - 0.5, last - 0.5])
            logo.ax.set_ylim([0, max_df])

        image_path = "weblogo.png"
        fig.savefig(image_path)
    finally:
        # Release the figure; pyplot keeps every open figure alive, which
        # accumulates across requests in a long-running server.
        plt.close(fig)

    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode('utf-8')

    return HttpResponse(image_data, content_type="text/plain")
# NOTE(review): this fragment reads `all_df`, `char_set`, `j`, `colspans`,
# `num_rows`, `num_cols` and `fig`, none of which are defined in the visible
# code -- presumably it is the body of an enclosing loop over character sets;
# confirm against the full script.

# Keep only the colour schemes defined for this character set, ordered by
# name and renumbered from 0 so the row label can be used as the grid row.
df = all_df[all_df['characters'] == char_set].copy()
df.sort_values(by='color_scheme', inplace=True)
df.reset_index(inplace=True, drop=True)

# for each color scheme
for row_num, row in df.iterrows():

    # set axes: the column offset is the total span of the preceding
    # character sets
    col_num = sum(colspans[:j])
    col_span = colspans[j]
    ax = plt.subplot2grid((num_rows, num_cols), (row_num, col_num),
                          colspan=col_span)

    # get color scheme
    color_scheme = row['color_scheme']

    # make matrix for character set
    mat_df = logomaker.sequence_to_matrix(char_set)

    # make and style logo
    logomaker.Logo(mat_df, ax=ax, color_scheme=color_scheme,
                   show_spines=False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(repr(color_scheme))

# style and show figure
fig.tight_layout()
fig.show()
# Ten aligned DNA sequences of equal length.
li = [
    'atagccggtacggca', 'ttagctgcaaccgca', 'tcagccactagagca',
    'ataaccgcgaccgca', 'ttagccgctaaggta', 'taagcctcgtacgta',
    'ttagccgttacggcc', 'atatccggtacagta', 'atagcaggtaccgaa',
    'acatccgtgacggaa'
]

# Transpose: one string per alignment column.
new_li = [''.join(column) for column in zip(*li)]

# Rows follow `alphabets`, columns follow alignment position; each entry is
# the letter count in that column divided by 5.
alphabets = ['a', 'c', 'g', 't']
position_weight_matrix = np.array(
    [[column.count(letter) / 5 for column in new_li] for letter in alphabets],
    dtype=float)

# logomaker wants positions as rows; number them from 1.
df = pd.DataFrame(position_weight_matrix.T, columns=['A', 'C', 'G', 'T'])
df.index = np.arange(1, len(df) + 1)

logos = logomaker.Logo(df)
logos.ax.set_xticks(np.arange(1, 16))
logos.ax.set_yticks(np.arange(3))
logos.ax.set_ylabel('Bits')
logos.ax.set_xlabel('Sequence Position')

pml.savefig('seqlogo.pdf')
plt.show()
logo_df = logomaker.get_example_matrix('logomaker_logo_matrix', print_description=False) # create color scheme color_scheme = { 'L': [0, .5, 0], 'O': [1, 0, 0], 'G': [1, .65, 0], 'maker': 'gray' } # create Logo object logo_logo = logomaker.Logo(logo_df, ax=ax, color_scheme=color_scheme, baseline_width=0, font_name='Arial', show_spines=False, vsep=.005, width=.95) # color the 'O' at the end of the logo a different color logo_logo.style_single_glyph(c='O', p=3, color=[0, 0, 1]) # change the font of 'maker' and flip characters upright. logo_logo.style_glyphs_below(font_name='OCR A Std', flip=False, width=1.0) # remove tick marks ax.set_xticks([]) ax.set_yticks([]) # tighten layout
# Example script: information logo for the WW-domain matrix.
import matplotlib.pyplot as plt
import logomaker

# Fetch the bundled WW information matrix.
ww_df = logomaker.get_example_matrix('ww_information_matrix',
                                     print_description=False)

# Draw the logo.
ww_logo = logomaker.Logo(
    ww_df,
    font_name='Stencil Std',
    color_scheme='NajafabadiEtAl2017',
    vpad=.1,
    width=.8,
)

# Tick every fifth position and mark the two highlighted positions in gold.
ww_logo.style_xticks(anchor=0, spacing=5, rotation=45)
for highlighted in (4, 26):
    ww_logo.highlight_position(p=highlighted, color='gold', alpha=.5)

# Finish the axes through matplotlib: label, and one position of margin on
# either side.
ww_logo.ax.set_ylabel('information (bits)')
ww_logo.ax.set_xlim([-1, len(ww_df)])

# Display the result.
ww_logo.fig.show()
def plot_residue_data_logo(residue_index, logos, interactions, gap=1000,
                           letter_map=None, color_scheme="chemistry",
                           ylabel=None, title=None, fn=None, fig_close=False):
    """Draw per-residue values as letter logos with
    `logomaker.Logo <https://logomaker.readthedocs.io/en/latest/implementation.html#logo-class>`_.

    Parameters
    -----------
    residue_index : list
        Residue indices in ascending order. An index smaller than the one
        before it is treated as the start of a new chain, whose data go into
        a new figure. A jump in index smaller than ``gap`` is shaded gray; a
        larger jump starts a new figure.
    logos : list of str
        Residue names, in the same order as ``residue_index``. Single letters
        are used as given; any other name is looked up in the three-letter
        table of the 20 common amino acids, extended by ``letter_map``.
    interactions : list
        Values that set the letter heights, in the same order as
        ``residue_index``.
    gap : int, optional, default=1000
        Number of missing residues that starts a new figure; smaller jumps
        are shaded gray instead. The gap needs to be greater than 1000.
    letter_map : dict, optional, default=None
        Extra ``{"provided name": "single-letter logo"}`` entries for the
        name lookup.
    color_scheme : str, optional, default="chemistry"
        Colour scheme passed to logomaker.Logo(). See
        `Color Schemes <https://logomaker.readthedocs.io/en/latest/examples.html#color-schemes>`_.
    ylabel : str, optional, default=None
        y axis label; "Interactions" when not given.
    title : str, optional, default=None
        Title placed on the first axis of every figure.
    fn : str, optional, default=None
        Output file name; "Figure_interactions_logo.pdf" in the current
        working directory when not given. With several figures the page
        index is inserted before the extension.
    fig_close : bool, optional, default=False
        Call plt.close() after saving each figure to save memory.

    """
    # three-letter -> single-letter lookup, optionally extended by the caller
    single_letter = {
        'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
        'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
        'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
        'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'
    }
    if letter_map is not None:
        single_letter.update(letter_map)

    logos_checked = [
        name if len(name) == 1 else single_letter[name] for name in logos
    ]

    if ylabel is None:
        ylabel = "Interactions"
    if fn is None:
        fn = os.path.join(os.getcwd(), "Figure_interactions_logo.pdf")

    # work out chain breaks, gray areas and axis breaks
    length = 100
    axis_obj = AxisIndex(residue_index, logos_checked, interactions, length,
                         gap)
    axis_obj.sort()

    # one figure per page, one axis per segment of that page
    for page_idx in axis_obj.breaks.keys():
        page_segments = axis_obj.breaks[page_idx]
        n_rows = len(page_segments)
        fig, axes = plt.subplots(n_rows, 1, figsize=(4.5, 1.3 * n_rows),
                                 sharey=True)
        plt.subplots_adjust(hspace=0.5, left=0.2)

        ymax = []
        for ax_idx, ax in enumerate(np.atleast_1d(axes)):
            segment = axis_obj.breaks[page_idx][ax_idx]
            resi_selected = [entry[0] for entry in segment]
            logos_selected = [entry[1] for entry in segment]
            interaction_selected = [entry[2] for entry in segment]
            ymax.append(np.max(interaction_selected))

            # a logo is drawn only when the segment has something to show
            if np.sum(interaction_selected) > 0:
                df = pd.DataFrame({
                    "Resid": resi_selected,
                    "Resn": logos_selected,
                    "Data": interaction_selected
                })
                matrix = df.pivot(index="Resid", columns='Resn',
                                  values="Data").fillna(0)
                logomaker.Logo(matrix, color_scheme=color_scheme, ax=ax)

            is_bottom_axis = ax_idx == (n_rows - 1)
            if is_bottom_axis:
                ax.set_xlabel("Residue Index", fontsize=8, weight="bold")

            ax.xaxis.set_major_locator(MultipleLocator(20))
            ax.xaxis.set_minor_locator(MultipleLocator(1))
            ax.set_xlim(resi_selected[0] - 0.5, resi_selected[-1] + 0.5)
            ax.set_ylabel(ylabel, fontsize=8, weight="bold", va="center")
            tick_labels = ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels()
            for label in tick_labels:
                plt.setp(label, fontsize=8, weight="bold")

        # the y axis is shared, so setting it on the last axis sets them all
        np.atleast_1d(axes)[-1].set_ylim(0, np.max(ymax) * 1.05)

        # shade missing areas
        if page_idx in axis_obj.gray_areas.keys():
            for item in axis_obj.gray_areas[page_idx]:
                np.atleast_1d(axes)[item[0]].axvspan(item[1], item[2],
                                                     facecolor="#c0c0c0",
                                                     alpha=0.3)

        if title is not None:
            np.atleast_1d(axes)[0].set_title(title, fontsize=10,
                                             weight="bold")
        plt.tight_layout()

        # a single page keeps the plain file name; otherwise add the page index
        if len(axis_obj.breaks.keys()) == 1:
            fig.savefig(fn, dpi=300)
        else:
            name, ext = os.path.splitext(fn)
            fig.savefig("{}_{}{}".format(name, page_idx, ext), dpi=300)

        if fig_close:
            plt.close()

    return
def logo(file,
         limit=(),
         output_file=None,
         old_format=False,
         min_beta=.001,
         max_beta=100.,
         num_betas=1000):
    """Plot sequence logos using logomaker.

    The energy matrix is scaled by a factor found with
    ``information.get_beta_for_effect_df`` (target information equal to the
    number of positions), converted from weights to information and drawn.

    Parameters
    ----------
    file : str
        path to file containing energy matrix
    limit : Tuple, default ()
        first and last base of sequence that is converted into logo. When
        empty, the whole matrix is used and the tick labels come from the
        matrix index.
    output_file : str, default None
        path where plot is saved to, if not None
    old_format : boolean, default False
        If True, the file is read as whitespace-delimited instead of
        comma-delimited.
    min_beta : float, default 0.001
        minimal scaling factor
    max_beta : float, default 100
        maximal scaling factor
    num_betas : int, default 1000
        number of tested scaling factors

    Returns
    -------
    binding_logo : logomaker.src.Logo.Logo

    Raises
    ------
    RuntimeError
        If ``limit`` is neither empty nor of length 2.
    """
    if len(limit) not in (0, 2):
        raise RuntimeError("limit must have length 2.")
    has_limit = len(limit) == 2

    # Load in a binding site matrix. sep=r"\s+" is the documented
    # replacement for the deprecated delim_whitespace=True.
    arraydf = pd.read_csv(file,
                          index_col="pos",
                          sep=r"\s+" if old_format else ",")

    # Rename columns to be useable by the logomaker package
    arraydf = arraydf.rename(columns={
        'val_A': 'A',
        'val_C': 'C',
        'val_G': 'G',
        'val_T': 'T'
    })

    if has_limit:
        arraydf = arraydf.iloc[limit[0]:limit[1] + 1]

    # finding scaling factor
    target_info = len(arraydf.index)
    beta = information.get_beta_for_effect_df(arraydf,
                                              target_info,
                                              min_beta=min_beta,
                                              max_beta=max_beta,
                                              num_betas=num_betas)

    # use logomaker to convert energy matrix to information matrix
    binding_info = logomaker.transform_matrix(df=beta * arraydf,
                                              from_type='weight',
                                              to_type='information')

    binding_logo = logomaker.Logo(binding_info, vpad=.1, width=.8)

    # style using Logo methods
    binding_logo.style_spines(visible=False)
    binding_logo.style_spines(spines=['left', 'bottom'], visible=True)
    binding_logo.style_xticks(rotation=90, fmt='%d', anchor=0)

    # style using Axes methods
    binding_logo.ax.set_ylabel("Information (bits)", labelpad=-1)
    binding_logo.ax.xaxis.set_ticks_position('none')
    binding_logo.ax.xaxis.set_tick_params(pad=-1)
    binding_logo.ax.grid(False)

    # Relabel the ticks only when a window was requested; indexing an empty
    # `limit` here used to raise IndexError for the default arguments.
    if has_limit:
        binding_logo.ax.set_xticklabels(np.arange(limit[0], limit[1] + 1))

    if output_file is not None:
        plt.savefig(output_file)
    return binding_logo
def output_line_plot(arguments):
    """Write the signal table and a line plot with motif logo for one motif.

    ``arguments`` is a single tuple
    ``(mpbs_name, mpbs_num, signals, conditions, pwm, output_location,
    window_size, colors)``. Two files are written to ``output_location``:
    ``<name>.txt`` with one tab-separated column per condition, and
    ``<name>.pdf`` with the signals plotted over the window and an
    information-content logo of the motif underneath.
    """
    (mpbs_name, mpbs_num, signals, conditions, pwm, output_location,
     window_size, colors) = arguments
    mpbs_name = mpbs_name.replace("(", "_").replace(")", "")

    # Signal table: a header of condition names, then one row per position.
    output_filename = os.path.join(output_location,
                                   "{}.txt".format(mpbs_name))
    n_conditions = len(conditions)
    with open(output_filename, "w") as f:
        f.write("\t".join(conditions) + "\n")
        for position in range(window_size):
            row = [str(signals[k][position]) for k in range(n_conditions)]
            f.write("\t".join(row) + "\n")

    # The motif logo only uses A, C, G and T, with a pseudo-count of one.
    counts = pd.DataFrame(data={base: pwm[base] for base in "ACGT"}).add(1)
    pwm_prob = counts.div(counts.sum(axis=1), axis=0)
    # Per-position information: 2 + sum(p * log2(p)).
    info_content = np.log2(pwm_prob).mul(pwm_prob).sum(axis=1) + 2
    icm = pwm_prob.mul(info_content, axis=0)

    half_window = window_size // 2
    start = -half_window
    end = half_window - 1
    x = np.linspace(start, end, num=window_size)

    plt.close('all')
    fig, ax = plt.subplots()

    for k, condition in enumerate(conditions):
        ax.plot(x, signals[k], color=colors[k], label=condition)

    ax.text(0.15, 0.9, 'n = {}'.format(mpbs_num),
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes,
            fontweight='bold')

    # Only the bottom and left spines are kept, both pushed outwards.
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.spines['left'].set_position(('outward', 15))
    ax.tick_params(direction='out')

    ax.set_xticks([start, 0, end])
    ax.set_xticklabels([str(start), 0, str(end)])

    min_signal = np.min(signals)
    max_signal = np.max(signals)
    ax.set_yticks([min_signal, max_signal])
    ax.set_yticklabels(
        [str(round(min_signal, 2)), str(round(max_signal, 2))],
        rotation=90)

    ax.set_title(mpbs_name, fontweight='bold')
    ax.set_xlim(start, end)
    ax.set_ylim([min_signal, max_signal])
    ax.legend(loc="upper right", frameon=False)
    # Moving the bottom spine far outwards leaves room for the logo.
    ax.spines['bottom'].set_position(('outward', 70))

    # The logo gets its own axes placed in figure coordinates.
    logo_ax = plt.axes([0.105, 0.085, 0.85, .2])
    logomaker.Logo(icm, ax=logo_ax, show_spines=False, baseline_width=0)
    logo_ax.set_xticks([])
    logo_ax.set_yticks([])

    fig.tight_layout()

    output_filename = os.path.join(output_location,
                                   "{}.pdf".format(mpbs_name))
    plt.savefig(output_filename)
def protein_logo(positions):
    """Draw stacked sequence logos of the requested alignment positions,
    one row per ABCG family member, to show how the members differ.

    Arguments:
    positions -- a list of positions within the sequence alignment

    Returns the matplotlib figure.
    """
    family_names = ['ABCG1', 'ABCG2', 'ABCG4', 'ABCG5', 'ABCG8']
    family_sequences = [
        ABCG1_sequences, ABCG2_sequences, ABCG4_sequences, ABCG5_sequences,
        ABCG8_sequences
    ]

    # For each family, reduce every sequence to the requested positions.
    # The residues are read from element 1 of each sequence entry.
    selected = [
        [''.join(entry[1][i] for i in positions) for entry in sequences]
        for sequences in family_sequences
    ]

    fig = plt.figure(figsize=[0.5 * len(selected[0][0]), 5])

    bottom_row = len(family_names) - 1
    axes = []
    logos = []
    for row, residues in enumerate(selected):
        row_ax = plt.subplot2grid((5, 1), (row, 0))
        logos.append(lm.Logo(lm.alignment_to_matrix(residues), ax=row_ax,
                             color_scheme='black'))
        axes.append(row_ax)

        if row == 0:
            # Top row carries the alignment positions above the logo.
            row_ax.set_xticks(range(len(positions)))
            row_ax.set_xticklabels(positions)
            row_ax.xaxis.tick_top()
        elif row == bottom_row:
            row_ax.set_xticks(range(len(positions)))
        else:
            row_ax.set_xticks([])

    # plt.xticks acts on the current axes, i.e. the bottom row just created.
    plt.xticks(rotation=45, ha='right')
    bottom_ax = axes[bottom_row]
    bottom_ax.set_xticklabels([conservation_pattern[i] for i in positions])
    bottom_ax.tick_params(labelsize=8)

    for row_ax, family in zip(axes, family_names):
        row_ax.set_yticks([])
        row_ax.set_ylabel(family, rotation=0, ha='right', fontsize=20)

    # Background colour per position, one list of colours per family.
    conservation_colours = conserved_colours(positions)
    n_highlighted = len(conservation_colours[0])
    for row, family_logo in enumerate(logos):
        for pos in range(n_highlighted):
            family_logo.highlight_position(
                p=pos, color=conservation_colours[row][pos])

    return fig