Beispiel #1
0
    def _plot_kernels(self, kernel_name, sequence_alphabet):
        """Write each kernel of one CNN layer to a CSV table and a logo image.

        For every index in ``range(self.method.kernel_count)`` the weights of
        ``self.method.CNN.<kernel_name>`` are transposed, cut down to the
        first ``len(sequence_alphabet)`` columns and stored under
        ``self.result_path`` as ``<name>_<i + 1>.csv`` and
        ``<name>_<i + 1>.png``. ``<name>`` is ``kernel_name`` with "chain_1"
        and "chain_2" replaced by the two entries of
        ``self.method.chain_names``.

        Args:
            kernel_name: attribute name of the layer on ``self.method.CNN``.
            sequence_alphabet: column labels for the kernel matrix.

        Returns:
            A ``(figure_outputs, table_outputs)`` tuple of ``ReportOutput``
            lists, each holding one entry per kernel.
        """
        figure_outputs = []
        table_outputs = []
        friendly_kernel_name = kernel_name.replace(
            "chain_1", self.method.chain_names[0]).replace(
                "chain_2", self.method.chain_names[1])

        # The layer does not depend on the kernel index, so look it up once.
        kernel = getattr(self.method.CNN, kernel_name)

        for i in range(self.method.kernel_count):
            output_name = f"{friendly_kernel_name}_{i + 1}"

            # .cpu() is a no-op for CPU weights and keeps .numpy() from
            # failing when the model lives on another device.
            weights = kernel.weight[i].detach().cpu().numpy()
            kernel_df = pd.DataFrame(weights.T[:, :len(sequence_alphabet)],
                                     columns=sequence_alphabet)

            kernel_csv_path = self.result_path / f"{output_name}.csv"
            kernel_df.to_csv(kernel_csv_path, index=False)
            table_outputs.append(ReportOutput(kernel_csv_path, output_name))

            logo = logomaker.Logo(kernel_df,
                                  shade_below=0.5,
                                  fade_below=0.5,
                                  font_name='Arial Rounded MT Bold',
                                  vpad=0.05,
                                  vsep=0.01)
            # Hide every spine, then bring back only the left and bottom ones.
            logo.style_spines(visible=False)
            logo.style_spines(spines=('left', 'bottom'), visible=True)
            logo.style_xticks(fmt='%d', anchor=0)

            logo_path = self.result_path / f"{output_name}.png"
            logo.fig.savefig(str(logo_path))
            # Close the figure so repeated calls do not accumulate open figures.
            plt.close(logo.fig)
            figure_outputs.append(ReportOutput(logo_path, output_name))

        return figure_outputs, table_outputs
Beispiel #2
0
 def plot_sequence_logo(self, seq_logo_df,  save_fig_path=None):
     """Draw a sequence logo and optionally save it to disk.

     Args:
         seq_logo_df: matrix passed straight to ``logomaker.Logo``.
         save_fig_path: when given, the figure is written there
             (transparent background, 600 dpi) and then closed. When
             ``None`` the figure is left open and nothing is written.

     Side effects:
         The first time a figure is saved, ``self.save_dir`` is set to the
         directory of ``save_fig_path`` and that directory is printed.
     """
     # Logo creates its own figure when no ``ax`` is passed, so no separate
     # plt.figure() call is needed (it would only leave an empty figure open).
     logo = logomaker.Logo(seq_logo_df)
     ax = logo.fig.gca()
     for side in ('right', 'top', 'bottom'):
         ax.spines[side].set_visible(False)

     if save_fig_path is None:
         return

     logo.fig.savefig(save_fig_path, transparent=True, dpi=600)

     # Remember (and report) the output directory only on the first save.
     if not hasattr(self, 'save_dir'):
         self.save_dir = os.path.dirname(save_fig_path)
         print('fig saved to', self.save_dir)
     plt.close(logo.fig)
def plot_logomaker_abs(samp):
    """Plot BH-adjusted kpLogo scores of one sample as a sequence logo.

    Reads the sample's kpLogo ``*_kpLogo.pass.p.cutoff.txt`` table, converts
    its ``p-value`` column (treated as -log10 values) back to raw p-values,
    applies Bonferroni and Benjamini-Hochberg corrections and draws the
    -log10 BH-adjusted values as a logo. Position 0 is replicated over the
    three columns 0, 1 and 2, and positive positions are shifted accordingly.
    The figure is written as ``*_sccLogo_ABS.pdf`` in the same directory.

    Relies on the module-level names ``rootpath``, ``genome_name``,
    ``experiment``, ``upstreamNts`` and ``downstreamNts``.

    Args:
        samp: sample name used to build the input and output paths.
    """
    inpath = "%s/FPassignment/%s/%s/analysis/kpLogo/%s/%s_%supTO%sdown_kpLogo.pass.p.cutoff.txt" % (
        rootpath, genome_name, experiment, samp, samp, upstreamNts,
        downstreamNts)

    df = pd.read_csv(inpath, sep='\t')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(df)
    df = df.sort_values(by=['position', '#kmer']).reset_index(drop=True)

    ### get raw p-values from -log10 vals
    pvals = [10**(-neglog) for neglog in df['p-value']]
    df['pvals'] = pvals

    ### bonferroni correction
    _, pvals_cor, _, _ = mtest.multipletests(pvals,
                                             alpha=0.1,
                                             method='bonferroni')
    df['bonf_pval'] = pvals_cor

    ### FDR, Benjamini Hochberg adjusted pvalues
    _, pvals_cor, _, _ = mtest.multipletests(pvals,
                                             alpha=0.05,
                                             method='fdr_bh')
    pvals_cor_log10 = [-math.log10(p) for p in pvals_cor]
    df['bh_pval'] = pvals_cor
    df['bh_pval_neglog10'] = pvals_cor_log10

    ### extract desired values
    dp = df[[
        '#kmer', 'position', 'shift', 'statistics', 'pvals', 'p-value',
        'bh_pval', 'bh_pval_neglog10'
    ]].copy()
    dp['difflog10'] = dp['p-value'] * dp['bh_pval_neglog10']
    dp['log10p'] = dp['p-value'] * np.sign(dp['statistics'])
    dp['log10pBH'] = dp['bh_pval_neglog10'] * np.sign(dp['statistics'])

    print(dp['bh_pval_neglog10'])

    ### set up matrix for lm
    dl = dp[['#kmer', 'position', 'bh_pval_neglog10', 'log10pBH']].copy()
    # Positive positions are moved down by one so they directly follow 0.
    dl.loc[dl['position'] > 0, 'position'] -= 1
    dl.loc[dl['#kmer'] == 'T', '#kmer'] = 'U'

    print(dl)
    print("")

    ### position 0 is represented by 3 columns (0, 1, 2)
    dlm = pd.DataFrame(np.zeros((4, upstreamNts + 3 + downstreamNts)))
    dlm.index = ['A', 'C', 'G', 'U']
    dlm.columns = range(-upstreamNts, downstreamNts + 3)

    print(dlm)

    for i in dl.index:
        nt = dl.loc[i, '#kmer']
        pos = dl.loc[i, 'position']
        val = dl.loc[i, 'bh_pval_neglog10']

        if pos < 0:
            dlm.loc[nt, pos] = val
        elif pos == 0:
            if nt in ('A', 'C', 'G'):
                # Replicate the value over the three columns of position 0.
                for col in (0, 1, 2):
                    dlm.loc[nt, col] = val
            else:
                print("stop not set")
        elif pos > 0:
            # Skip past the two extra columns taken by position 0.
            dlm.loc[nt, pos + 2] = val

    print(dlm)

    # logomaker expects one row per position and one column per character.
    dlm = dlm.T

    fig, ax = plt.subplots(figsize=(12, 6))

    logo = lm.Logo(dlm, font_name='Arial', ax=ax)
    logo.draw_baseline(linewidth=1, color='black', linestyle="-")
    logo.highlight_position_range(0, 2, alpha=0.5, color='lightgray')

    ax.set_ylim(0, 15)

    outpath = '%s/FPassignment/%s/%s/analysis/kpLogo/%s/%s_%supTO%sdown_sccLogo_ABS.pdf' % (
        rootpath, genome_name, experiment, samp, samp, upstreamNts,
        downstreamNts)

    fig.savefig(outpath, format='pdf')
    # Close the figure so calling this once per sample does not leak figures.
    plt.close(fig)
Beispiel #4
0
# Script fragment: draw a styled logo of the ARS1 matrix with the wild-type
# sequence highlighted.
# NOTE(review): `ars_df` is read below but is not defined anywhere in the
# visible part of this script — it must be loaded before this point.

# Read the wild-type sequence: drop every line containing '#', strip the
# rest and join them into one string.
with logomaker.open_example_datafile('ars_wt_sequence.txt',
                                     print_description=False) as f:
    lines = f.readlines()
    lines = [l.strip() for l in lines if '#' not in l]
    ars_seq = ''.join(lines)

# trim matrix and sequence to the same [start, stop) window; the matrix index
# is reset so its positions start at 0 again, matching the trimmed sequence
start = 10
stop = 100
ars_df = ars_df.iloc[start:stop, :]
ars_df.reset_index(inplace=True, drop=True)
ars_seq = ars_seq[start:stop]

# create Logo object
ars_logo = logomaker.Logo(ars_df,
                          color_scheme='dimgray',
                          font_name='Luxi Mono')

# color wild-type ARS1 sequence within logo
ars_logo.style_glyphs_in_sequence(sequence=ars_seq, color='darkorange')

# highlight functional regions of ARS1 (positions refer to the trimmed,
# re-indexed matrix)
ars_logo.highlight_position_range(pmin=7, pmax=22, color='lightcyan')
ars_logo.highlight_position_range(pmin=33, pmax=40, color='honeydew')
ars_logo.highlight_position_range(pmin=64, pmax=81, color='lavenderblush')

# additional styling using Logo methods
ars_logo.style_spines(visible=False)

# style using Axes methods
ars_logo.ax.set_ylim([-4, 4])
        delta_ax = axs[2]
        motif_ax = axs[3]
        y_min = min([
            ref.min().min(),
            alt.min().min(),
            ref_minus_alt.min().min()
        ])
        y_max = max([
            ref.max().max(),
            alt.max().max(),
            ref_minus_alt.max().max()
        ])

        ref_logo = logomaker.Logo(ref,
                                   ax=ref_ax,
                                   baseline_width=0,
                                   show_spines=True,
                                   vsep=0,
                                   width=.95)
        #ref_logo.highlight_position(50, color=(1, 0, 0, 0.5))
        alt_logo = logomaker.Logo(alt,
                                   ax=alt_ax,
                                   baseline_width=0,
                                   show_spines=True,
                                   vsep=0,
                                   width=.95)
        delta_logo = logomaker.Logo(ref_minus_alt,
                                   ax=delta_ax,
                                   baseline_width=0,
                                   show_spines=True,
                                   vsep=0,
                                   width=.95)
def plot_shift(results,
               config,
               out_folder,
               z_tail,
               z_head,
               mc_samples=None,
               seq_ref=None):
    """Plot shift in sequence space across latent vector z.

    Projects the two latent points ``z_tail`` and ``z_head`` to sequence
    space and writes into ``out_folder``:

    * ``z1m1_shift.pdf``: scatter of ``results['embed_mean']`` (first vs.
      last latent dimension) with an arrow from tail to head;
    * ``shift_magnitude.pdf``: per-position Euclidean norm of the difference
      between the two projections;
    * ``tail_logo``, ``head_logo`` and ``shift_logo`` as ``.pdf`` and ``.csv``.

    All files except the embedding plot are prefixed with ``aligned_`` when
    ``seq_ref`` is given.

    Args:
        results: mapping providing 'FactorMuE_variational' and 'embed_mean'.
        config: configuration mapping; ``config['train']['mc_samples']`` and
            ``config['data']`` ('filetype', 'alphabet') are read here.
        out_folder: directory the output files are written to.
        z_tail: JSON string encoding the start latent vector.
        z_head: JSON string encoding the end latent vector.
        mc_samples: number of Monte Carlo samples; defaults to the value in
            ``config``.
        seq_ref: optional reference sequence file passed to
            ``dataloader.load``.
    """
    # Setup.
    FactorMuE_variational = results['FactorMuE_variational']
    if mc_samples is None:
        mc_samples = int(config['train']['mc_samples'])
    else:
        mc_samples = int(mc_samples)

    # Load hyperparameters.
    (dtype, bt_scale, b0_scale, l_conc, u_conc, r_conc, z_distr, latent_dims,
     latent_alphabet_size, alphabet, alphabet_size,
     latent_length) = _load_hyperparameters(config)

    # Load reference file.
    if seq_ref is not None:
        data_ref = dataloader.load(seq_ref,
                                   filetype=config['data']['filetype'],
                                   alphabet=config['data']['alphabet'],
                                   dtype=dtype)
        # Keeps the last (sequence, length) pair yielded by the dataset.
        for x_batch, xlen_batch in data_ref.batch(1):
            x, xlen = x_batch[0], xlen_batch[0]
        prefix = 'aligned_'
    else:
        x, xlen = None, None
        prefix = ''

    # Load latent vector.
    z_head = tf.convert_to_tensor(json.loads(z_head), dtype=dtype)
    z_tail = tf.convert_to_tensor(json.loads(z_tail), dtype=dtype)
    zs = tf.concat((z_tail[None, :], z_head[None, :]), axis=0)

    # Plot embedding.
    z = results['embed_mean']
    plt.figure(figsize=(8, 8))
    plt.scatter(z[:, 0], z[:, -1], s=5)
    plt.arrow(z_tail[0],
              z_tail[-1],
              z_head[0] - z_tail[0],
              z_head[-1] - z_tail[-1],
              length_includes_head=True,
              head_width=0.03,
              color='black')
    plt.xlabel(r'$z_1$', fontsize=18)
    plt.ylabel(r'$z_2$', fontsize=18)
    plt.savefig(os.path.join(out_folder, 'z1m1_shift.pdf'))
    # Close the embedding figure like every other figure in this function.
    plt.close()

    # Get projection.
    nus = FactorMuE.project_latent_to_sequence(zs,
                                               FactorMuE_variational,
                                               latent_dims,
                                               latent_length,
                                               latent_alphabet_size,
                                               alphabet_size,
                                               bt_scale,
                                               b0_scale,
                                               u_conc,
                                               r_conc,
                                               l_conc,
                                               x=x,
                                               xlen=xlen,
                                               mc_samples=mc_samples,
                                               z_distr=z_distr,
                                               dtype=dtype)

    # Plot shift magnitude.
    nu_shift = np.sqrt(np.sum((nus[1] - nus[0]).numpy()**2, axis=1))
    plt.figure(figsize=(8, 6))
    plt.plot(nu_shift, linewidth=2)
    plt.xlabel('conserved position', fontsize=18)
    plt.ylabel('preference shift magnitude', fontsize=18)
    plt.savefig(os.path.join(out_folder, prefix + 'shift_magnitude.pdf'))
    plt.close()

    def _logo_matrices():
        """Yield (name, matrix) for the tail, head and shift logos, in order."""
        yield 'tail', nus[0].numpy()
        yield 'head', nus[1].numpy()
        yield 'shift', nus[1].numpy() - nus[0].numpy()

    # Plot tail, head and shift logos; each is also saved as a CSV table.
    for name, matrix in _logo_matrices():
        df = pd.DataFrame(matrix, columns=alphabet)
        logomaker.Logo(df)
        plt.savefig(os.path.join(out_folder, prefix + name + '_logo.pdf'))
        plt.close()
        df.to_csv(os.path.join(out_folder, prefix + name + '_logo.csv'))
    def calculate_nodes(self):
        """Score every internal node of the tree and store the results.

        Builds a ``PhyloTree`` from ``self.tree_in`` / ``self.align_in``,
        roots it at its midpoint outgroup and, for each internal node,
        computes a score (rounded to 2 decimals), a haplotype and, when
        ``self.compute_logos == "Y"``, a haplotype matrix and its logo
        figure. The values are attached to the nodes as features and also
        stored on the instance as dictionaries keyed by the running
        internal-node number:

        ``self.processed_tree``, ``self.node_scores``,
        ``self.node_haplotypes``, ``self.node_haplotype_matrices``,
        ``self.node_haplotype_logos``.

        ``self.position_matrix`` is computed only when it is ``None``;
        reset it to ``None`` to force the features to be re-read.

        On any error a message is written to stderr and the process exits
        with status 1.
        """
        try:
            tree = PhyloTree(self.tree_in,
                             alignment=self.align_in,
                             alg_format="fasta")
            tree.set_outgroup(tree.get_midpoint_outgroup())

            leaf_deleting_list = set()
            # Identity test: an equality test against None is not reliable
            # for array-like position matrices.
            if self.position_matrix is None:
                uniprot_hit_hash, leaf_deleting_list = fp.retrieve_features(
                    self.study_features, self.table_info, self.min_eval,
                    self.uniprot_info)
                self.position_matrix = fp.get_positions_matrix(
                    uniprot_hit_hash, tree)

            # Snapshot the leaves first so deleting cannot disturb the
            # traversal that is producing them.
            for leaf in list(tree.iter_leaves()):
                if leaf.name in leaf_deleting_list:
                    leaf.delete()

            node_number = 0
            node_scores = {}
            node_haplotypes = {}
            node_haplotype_matrices = {}
            node_haplotype_logos = {}
            for index, node in enumerate(tree.traverse("preorder")):
                # Every node gets an id; only internal nodes are scored.
                node._nid = index
                if node.is_leaf():
                    continue

                node_sequence_matrix = fp.annotated_sequence_extractor(
                    node, self.position_matrix, self.differentiate_gaps)

                node_score = round(
                    fp.calculate_node_score(node_sequence_matrix,
                                            self.calc_alg), 2)
                node.add_feature("node_score", node_score)
                node_scores[node_number] = node_score

                node_haplotype = fp.haplotype_parse(node_sequence_matrix)
                node.add_feature("node_haplotype", node_haplotype)
                node_haplotypes[node_number] = node_haplotype

                if self.compute_logos == "Y":
                    node_haplotype_matrix = fp.haplotype_matrix_calculator(
                        node_sequence_matrix)
                    node.add_feature("node_haplotype_matrix",
                                     node_haplotype_matrix)
                    node_haplotype_matrices[
                        node_number] = node_haplotype_matrix

                    # The stored logo is the matplotlib figure, or None when
                    # no matrix could be computed for this node.
                    if node_haplotype_matrix is not None:
                        node_haplotype_logo = logomaker.Logo(
                            node_haplotype_matrix,
                            color_scheme="dmslogo_funcgroup",
                            show_spines=False).fig
                    else:
                        node_haplotype_logo = None
                    node.add_feature("node_haplotype_logo",
                                     node_haplotype_logo)
                    node_haplotype_logos[node_number] = node_haplotype_logo

                node_number += 1

            self.processed_tree = tree
            self.node_scores = node_scores
            self.node_haplotypes = node_haplotypes
            self.node_haplotype_matrices = node_haplotype_matrices
            self.node_haplotype_logos = node_haplotype_logos

        except Exception as err:
            # Ctrl-C and SystemExit are deliberately not intercepted; the
            # cause is reported so failures can be diagnosed.
            sys.stderr.write("Error at calculating nodes.\n")
            sys.stderr.write("%s: %s\n" % (type(err).__name__, err))
            sys.exit(1)
Beispiel #8
0
# Wrap the matched protein sequences in a DataFrame and display it.
matches_df = pd.DataFrame(matches_prot)
matches_df

# %% [markdown]
# Count the number of each type of amino acid at each position.

# %%
# One value_counts() Series per column of matches_df.
# `DataFrame.iteritems` was removed in pandas 2.0; `items` is the equivalent
# column iterator and is also available in older releases.
prot_aa = []
for ind, pri_seq in matches_df.items():
    aa_abundance = pri_seq.value_counts()
    prot_aa.append(aa_abundance)

# Residues absent at a position get a count of 0 instead of NaN.
prot_df = pd.DataFrame(prot_aa).fillna(0)
prot_df

# %%
# Display Consensus Sequence
crp_logo = logomaker.Logo(prot_df, figsize=(10, 2), color_scheme='chemistry')

# %% [markdown]
# Identify the error.

# %%
# style and show figure
# NOTE(review): the logo's x-axis runs over the rows of prot_df, so the
# 'Percentage' label looks wrong — possibly the intentional error referred to
# above; confirm before changing.
crp_logo.ax.set_xlabel('Percentage')
crp_logo.ax.set_title('Primary consensus sequence')
crp_logo.fig
crp_logo.fig.show()
# %%
def plotPromoters():
    """Plot DeepLIFT contribution logos for a set of promoter sequences.

    Command-line driven: parses ``sys.argv``, one-hot encodes the promoter
    and background sequences with ``vectorizeSequence``, scores every
    promoter against ``--N`` rolled selections of background references with
    DeepLIFT, averages the scores and, for every promoter, writes

    * ``<outdir><fasta id>.<logoType>``: a logo of the per-position
      contributions, and
    * ``<outdir><fasta id>.txt``: the averaged contribution scores.

    ``--outdir`` is used as a plain string prefix, so it should end with a
    path separator.
    """

    ########################
    #command line arguments#
    ########################

    parser = argparse.ArgumentParser()

    #PARAMETERS
    parser.add_argument(
        "--sequences",
        help="Full path to a fasta-file containing the promoter sequences.",
        type=str)
    parser.add_argument("--outdir", help="Full path to the output directory.")
    parser.add_argument(
        "--N",
        help=
        "How many references are used for averaging single signal sequence contributions.",
        type=int,
        default=10)
    parser.add_argument("--model",
                        help="Full path to the trained keras model.",
                        type=str,
                        default=None)
    parser.add_argument(
        "--background",
        help="Full path to a fasta-file containing the background sequences.",
        type=str)
    parser.add_argument("--target_layer",
                        help="Target layer index for deeplift (default=-3).",
                        type=int,
                        default=-3)
    parser.add_argument("--ylim",
                        help="Limits for y-axis.",
                        type=float,
                        nargs=2,
                        default=None)
    parser.add_argument(
        "--labels",
        help=
        "Full path to a file containing labels used as figure titles. If not given, use fasta IDs.",
        type=str,
        default=None)
    parser.add_argument("--logoType",
                        help="Logo image file extension (default=pdf).",
                        type=str,
                        default='pdf',
                        choices=['png', 'pdf'])

    args = parser.parse_args()

    #reading in the promoter sequences
    ids = []
    signal_seq = []
    for seq in pyfastx.Fasta(args.sequences):
        ids.append(seq.name)
        signal_seq.append(str(seq.seq).upper())
    #and one-hot encoding
    signal = np.array([vectorizeSequence(s) for s in signal_seq])

    #reading in the background sequences and one-hot encoding
    bg = np.array([
        vectorizeSequence(str(seq.seq).upper())
        for seq in pyfastx.Fasta(args.background)
    ])

    #reading in labels if given
    if args.labels is not None:
        # One label per line; the line terminator is dropped so it does not
        # end up inside the figure title.
        with open(args.labels, 'rt') as f:
            labels = [row.rstrip('\r\n') for row in f]
    else:
        labels = ids

    #initialize the deeplift model
    deeplift_model = kc.convert_model_from_saved_files(
        args.model,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.
        DeepLIFT_GenomicsDefault)
    find_scores_layer_idx = 0  #computes importance scores for the input layer
    deeplift_contribs_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=find_scores_layer_idx,
        target_layer_idx=args.target_layer)

    #and then score each sequence against args.N different background sequences
    scores = np.zeros(shape=(args.N, signal.shape[0], signal.shape[1]))

    for i in range(0, args.N):
        scores[i, :, :] = np.sum(deeplift_contribs_func(
            task_idx=1,
            input_data_list=[signal],
            input_references_list=[bg[:signal.shape[0], :, :]],
            batch_size=10,
            progress_update=None),
                                 axis=2)
        # Rotate the background so the next round pairs each signal with a
        # different reference.
        bg = np.roll(bg, 1, axis=0)

    scores = np.mean(scores, axis=0)

    #now the contributions have been calculated, next plotting the sequence logos weighted by the contributions
    for ind, seq in enumerate(signal_seq):
        #first plotting the sequence
        matrix_df = lm.saliency_to_matrix(seq, scores[ind, :])
        # Logo creates its own figure; work on logo.fig / logo.ax directly so
        # exactly that figure is saved and closed.
        logo = lm.Logo(df=matrix_df, color_scheme='classic')
        logo.ax.set_xlabel('position')
        logo.ax.set_ylabel('contribution')
        logo.ax.set_title(labels[ind])
        if args.ylim is not None:
            logo.ax.set_ylim(args.ylim)
        logo.fig.tight_layout()
        logo.fig.savefig(args.outdir + ids[ind] + '.' + args.logoType,
                         dpi=150,
                         bbox_inches='tight',
                         pad_inches=0)
        plt.close(logo.fig)

        #and then saving the importance scores to a file
        np.savetxt(args.outdir + ids[ind] + '.txt', scores[ind, :])
Beispiel #10
0
def plot_shift(results,
               config,
               out_folder,
               z_tail,
               z_head,
               mc_samples=None,
               covar_ref=None,
               seq_ref=None):
    """Plot shift in sequence space across latent vector z.

    Projects the latent points ``z_tail`` and ``z_head`` (JSON strings) to
    sequence space and writes into ``out_folder`` the per-position shift
    magnitude (``shift_magnitude.pdf``) plus the tail, head and shift logos,
    each as ``.pdf`` and ``.csv``. File names are prefixed with ``aligned_``
    when ``seq_ref`` is given, in which case the reference is loaded from
    ``covar_ref`` and ``seq_ref`` together.
    """
    # Setup.
    RegressMuE_variational = results['RegressMuE_variational']
    mc_samples = int(config['train']['mc_samples']
                     if mc_samples is None else mc_samples)

    # Load hyperparameters.
    (dtype, bt_scale, b0_scale, l_conc, u_conc, r_conc, latent_alphabet_size,
     alphabet, alphabet_size, latent_length) = _load_hyperparameters(config)

    # Load latent vector: row 0 is the tail, row 1 is the head.
    z_head = tf.convert_to_tensor(json.loads(z_head), dtype=dtype)
    z_tail = tf.convert_to_tensor(json.loads(z_tail), dtype=dtype)
    zs = tf.concat((z_tail[None, :], z_head[None, :]), axis=0)
    latent_dims = zs.shape[1]

    # Load reference file.
    if seq_ref is None:
        z_covar, x, xlen = None, None, None
        prefix = ''
    else:
        data_cfg = config['data']
        reference = dataloader.load_joint(
            covar_ref,
            seq_ref,
            cov_filetype=data_cfg['covariate_filetype'],
            cov_header=data_cfg['covariate_header'],
            seq_filetype=data_cfg['sequence_filetype'],
            alphabet=data_cfg['alphabet'],
            dtype=dtype)
        # Keeps the last entry yielded by the dataset.
        for covar_batch, seq_batch, len_batch in reference.batch(1):
            z_covar, x, xlen = covar_batch[0], seq_batch[0], len_batch[0]
        prefix = 'aligned_'

    def out_path(filename):
        """Return the prefixed output path for ``filename``."""
        return os.path.join(out_folder, prefix + filename)

    # Get projection.
    nus = RegressMuE.project_latent_to_sequence(zs,
                                                RegressMuE_variational,
                                                latent_dims,
                                                latent_length,
                                                latent_alphabet_size,
                                                alphabet_size,
                                                bt_scale,
                                                b0_scale,
                                                u_conc,
                                                r_conc,
                                                l_conc,
                                                z_covar=z_covar,
                                                x=x,
                                                xlen=xlen,
                                                mc_samples=mc_samples,
                                                dtype=dtype)

    # Plot shift magnitude.
    squared_diff = (nus[1] - nus[0]).numpy()**2
    shift_norm = np.sqrt(np.sum(squared_diff, axis=1))
    plt.figure(figsize=(8, 6))
    plt.plot(shift_norm, linewidth=2)
    plt.xlabel('conserved position', fontsize=18)
    plt.ylabel('preference shift magnitude', fontsize=18)
    plt.savefig(out_path('shift_magnitude.pdf'))
    plt.close()

    def logo_matrices():
        """Yield (name, matrix) for the tail, head and shift logos, in order."""
        yield 'tail', nus[0].numpy()
        yield 'head', nus[1].numpy()
        yield 'shift', nus[1].numpy() - nus[0].numpy()

    # Plot tail, head and shift logos; each is also saved as a CSV table.
    for name, matrix in logo_matrices():
        logo_df = pd.DataFrame(matrix, columns=alphabet)
        logomaker.Logo(logo_df)
        plt.savefig(out_path(name + '_logo.pdf'))
        plt.close()
        logo_df.to_csv(out_path(name + '_logo.csv'))
Beispiel #11
0
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and toolp RBS motif models over a list of genomes.

    For every genome in ``args.pf_genome_list`` the two motif models are
    drawn side by side as information-content logos (saved under
    ``<pd-work>/figures``), and their relative entropy against the GMS2
    non-coding model, the genome GC percentage and the start-prediction
    comparison are collected. The collected rows are then written to CSV and
    plotted: line plots per genome when ``args.verified`` is set, scatter
    plots otherwise.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # Both panels use the same probability -> information transform;
        # left is the GMS2 motif, right is the toolp motif.
        panels = (
            (df_gms2, "GeneMarkS-2"),
            (df_toolp, "StartLink+"),
        )
        for ax, (df_pwm, title) in zip(axes, panels):
            info_mat = lm.transform_matrix(df_pwm, from_type="probability", to_type="information")
            lm.Logo(info_mat, color_scheme="classic", ax=ax)
            ax.set_ylim(0, 2)
            ax.set_title(title)

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()
        # One figure is created per genome; close it so they do not pile up.
        plt.close(fig)

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified: one row per tool for this genome
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })

            print(list_run_info[-2:])

    import sbsp_viz.sns as sns
    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"]),
            xlabel="Genome",
            ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))

    else:

        # First pass: all genomes.
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        # Second pass: only genomes whose "Accuracy" value is below 2.
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
            save_fig=next_name(env["pd-work"])
        ))

        print("Average Error: {}".format(df["Accuracy"].mean()))

        df.to_csv(next_name(env["pd-work"], ext="csv"))
Beispiel #12
0
# do imports
import matplotlib.pyplot as plt
import numpy as np
import logomaker as logomaker

# Script fragment: draw logomaker's example saliency matrix as a logo and
# prepare coordinates for a gene annotation.

# load saliency matrix
nn_df = logomaker.get_example_matrix('nn_saliency_matrix',
                                     print_description=False)

# create Logo object
nn_logo = logomaker.Logo(nn_df)

# style using Logo methods: hide every spine, then re-enable only the left
# one, restricted to the range [0, .75]
nn_logo.style_spines(visible=False)
nn_logo.style_spines(spines=['left'], visible=True, bounds=[0, .75])

# style using Axes methods
nn_logo.ax.set_xlim([20, 115])
nn_logo.ax.set_xticks([])
nn_logo.ax.set_ylim([-.6, .75])
nn_logo.ax.set_yticks([0, .75])
nn_logo.ax.set_yticklabels(['0', '0.75'])
# NOTE(review): the leading spaces in the label presumably shift the text
# towards the visible part of the spine — confirm before trimming them.
nn_logo.ax.set_ylabel('                 saliency', labelpad=-1)

# set parameters for drawing gene
# (none of the names below are used within the visible part of this script)
exon_start = 55 - .5
exon_stop = 90 + .5
y = -.2
xs = np.arange(-3, len(nn_df), 10)
ys = y * np.ones(len(xs))
Beispiel #13
0
# do imports
import matplotlib.pyplot as plt
import logomaker as logomaker

# load crp energy matrix (negated, so the logo shows -ddG)
crp_df = -logomaker.get_example_matrix('crp_energy_matrix',
                                       print_description=False)

# create Logo object
crp_logo = logomaker.Logo(crp_df,
                          shade_below=.5,
                          fade_below=.5,
                          font_name='Arial Rounded MT Bold')

# style using Logo methods: hide every spine, then show left and bottom
crp_logo.style_spines(visible=False)
crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
crp_logo.style_xticks(rotation=90, fmt='%d', anchor=0)

# style using Axes methods
# Raw string: "\D" is not a valid Python escape, so the non-raw literal
# triggers an invalid-escape warning; the text handed to matplotlib's
# mathtext is unchanged.
crp_logo.ax.set_ylabel(r"$-\Delta \Delta G$ (kcal/mol)", labelpad=-1)
crp_logo.ax.xaxis.set_ticks_position('none')
crp_logo.ax.xaxis.set_tick_params(pad=-1)

# style and show figure
crp_logo.fig.show()
    I = np.log2(4) + np.sum(
        kmer_motif * np.log2(kmer_motif + 1e-10), axis=1, keepdims=True)
    logo = np.maximum(I * kmer_motif, 1e-7)

    # setup dataframe for logmaker
    L = len(kmer_motif)
    counts_df = pd.DataFrame(data=0.0,
                             columns=list(alphabet),
                             index=list(range(L)))
    for l in range(L):
        for a in range(4):
            counts_df.iloc[l, a] = logo[l, a]

    fig = plt.figure(figsize=(3, 2))
    ax = plt.subplot(111)
    logomaker.Logo(counts_df, ax=ax)
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    plt.xticks([])
    plt.yticks([])
    outfile = os.path.join(motif_path, experiment + '_kmer_motif_logo.png')
    fig.savefig(outfile, format='pdf', dpi=200, bbox_inches='tight')

    #-----------------------------------------------------------------------------
    # kmer mutagenesis
    print("performing k-mer mutagenesis analysis")
Beispiel #15
0
def line_wrapped_logo(
    tidy_df,
    *,
    site_col='site',
    letter_col='letter',
    height_col='height',
    color_col='color',
    sitelabel_col=None,
    highlight_color_col=None,
    highlight_alpha_col=None,
    sites_per_line=100,
    scalewidth=1,
    scaleheight=1,
    fade_letters_by_height=None,
    logo_kwargs=None,
    ylims=None,
    all_letters=('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
                 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'),
    missing_letter='error',
    letters_to_drop=('*', ),
    style_xticks_kwargs=None,
    xlabel=None,
    ylabel=None,
    label_fontsize=16,
    xlabelpad=10,
    baseline_on_top=True,
):
    """Draw logo wrapping several lines with custom colors and overlays.

    Parameters
    -----------
    tidy_df : pandas.DataFrame
        Holds data in tidy format, one line per letter.
    site_col : str
        Column in `tidy_df` with site number.
    letter_col : str
        Column in `tidy_df` with letter identity (e.g., amino acid).
    height_col : str
        Column in `tidy_df` with letter height.
    color_col : str
        Column in `tidy_df` with letter color.
    sitelabel_col : None or str
        Column in `tidy_df` with labels for site ticks if different than
        `site_col`.
    highlight_color_col : None or str
        Column in `tidy_df` with background highlight color, or `None`
        or `NA` if site not highlighted. Only one color can be assigned
        per site.
    highlight_alpha_col : None or str
        Column in `tidy_df` with background color alpha (transparency) for
        site highlighting. Sites without a value use 0.25.
    sites_per_line : int
        Number of sites per line.
    scalewidth : float
        Scale overall figure width by this much.
    scaleheight : float
        Scale overall figure height by this much.
    fade_letters_by_height : None or 2-tuple
        If not `None`, set alpha transparency of letters proportional to
        their absolute height, going from `(min_alpha, max_alpha)`. If every
        letter has the same absolute height, all letters get `max_alpha`.
    logo_kwargs : None or dict
        Keyword arguments to ``logomaker.Logo``. Key ones include 'width',
        'vpad', and 'font_name'. `None` means no extra arguments.
    ylims : 2-tuple or None
        Y-axis limits applied to every line. If `None`, no limits are set
        here and each line keeps the limits it has after the logo is drawn.
    all_letters : tuple or list
        All letters for which we plot heights.
    missing_letter : {'zero_height', 'error'}
        If letter is missing at a site, assign zero height or raise error?
        NOTE(review): this argument is not used anywhere in this function
        and is not forwarded to `tidy_to_wide_df` -- confirm intended use.
    letters_to_drop : tuple or list
        Do not plot these letters.
    style_xticks_kwargs : None or dict
        Keyword arguments to pass to ``logomaker.Logo.style_xticks``. For
        instance, to change spacing between tick labels to every 10th site,
        use ``style_xticks_kwargs={'spacing': 10}``. Note that 'spacing' is
        forced to 1 when site labels are in use.
    xlabel : str or None
        Label for x-axis (shared over entire plot).
    ylabel : str or None
        Label for y-axis (shared over entire plot).
    label_fontsize : int
        Size of labels drawn for `xlabel`, `ylabel`.
    xlabelpad : float
        Padding above x-axis label.
    baseline_on_top : bool
        Draw baseline (horizontal line at 0 height) on top of letters.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding one axes per wrapped line.

    Raises
    ------
    ValueError
        If `tidy_df` lacks a required column, `letters_to_drop` overlaps
        `all_letters`, `fade_letters_by_height` is not an increasing pair
        within [0, 1], or a site has more than one label / highlight value.

    """
    # ---- validate all arguments before any figure is created ----
    expect_cols = [site_col, letter_col, height_col, color_col]
    expect_cols.extend(
        col
        for col in (sitelabel_col, highlight_color_col, highlight_alpha_col)
        if col is not None)
    for col in expect_cols:
        if col not in tidy_df.columns:
            raise ValueError(f"`tidy_df` lacks column {col}")

    if set(letters_to_drop).intersection(all_letters):
        raise ValueError('overlap between `letters_to_drop` and `all_letters`')

    if fade_letters_by_height:
        min_alpha, max_alpha = fade_letters_by_height
        if not 0 <= min_alpha < max_alpha <= 1:
            raise ValueError('fade_letters_by_height must span non-zero '
                             'range between 0 and 1')

    # `None` means "no extra keyword arguments"; copy so the caller's dict
    # is never shared with logomaker.
    logo_kwargs = {} if logo_kwargs is None else dict(logo_kwargs)

    # drop any extra letters
    tidy_df = tidy_df.query(f"{letter_col} not in {letters_to_drop}")

    # make wide data frame for logomaker
    wide_df = tidy_to_wide_df(tidy_df, site_col, letter_col, height_col,
                              all_letters)

    # dict matching (site, letter) to color
    colors = tidy_df.set_index([site_col, letter_col])[color_col].to_dict()

    # dicts matching sites to labels, highlight color, and alpha
    sitelabels = collections.defaultdict(lambda: '')
    highlight_colors = {}
    highlight_alphas = collections.defaultdict(lambda: 0.25)
    for d, col in [
        (sitelabels, sitelabel_col),
        (highlight_colors, highlight_color_col),
        (highlight_alphas, highlight_alpha_col),
    ]:
        if col is not None:
            site_vals = (tidy_df[tidy_df[col].notnull()][[site_col, col
                                                          ]].drop_duplicates())
            dup_site_vals = (site_vals.groupby(site_col).aggregate(
                n=pd.NamedAgg(col, 'count')).query('n > 1'))
            if len(dup_site_vals):
                raise ValueError(f"multiple {col} for sites:\n{dup_site_vals}")
            d.update(site_vals.set_index(site_col)[col].to_dict())

    # map letters to fading (pure computation, still before the figure)
    if fade_letters_by_height:
        letter_fading = {}
        abs_heights = wide_df.abs()
        min_height = abs_heights.min().min()
        max_height = abs_heights.max().max()
        height_range = max_height - min_height
        for site, letter in itertools.product(wide_df.index, wide_df.columns):
            if height_range > 0:
                norm_fade = ((abs(wide_df.at[site, letter]) - min_height) /
                             height_range)
            else:
                # all letters equally tall: avoid 0/0 and show them fully
                norm_fade = 1.0
            fade = norm_fade * (max_alpha - min_alpha) + min_alpha
            # clamp so float rounding cannot leave the requested range
            letter_fading[(site, letter)] = min(max_alpha,
                                                max(min_alpha, fade))

    # NOTE(review): an earlier revision computed padded `ymin`/`ymax` from
    # the individual letter heights here when `ylims` was `None`, but never
    # applied them. They are not applied now either: per-letter extrema
    # ignore that letters stack within a site, so they would clip the logo.

    # arguments for xtick styling
    xticks_kwargs = {
        'spacing': 5,  # number every five sites
        'rotation': 90,  # rotated tick marks
        'fontdict': {
            'verticalalignment': 'top',
            'horizontalalignment': 'center',
            'fontsize': 10
        },
    }
    if style_xticks_kwargs is not None:
        xticks_kwargs.update(style_xticks_kwargs)
    # Decide once: looking labels up below inserts keys into the
    # defaultdict, so its truthiness must be read before the loop.
    use_sitelabels = bool(sitelabels)
    if use_sitelabels:
        xticks_kwargs['spacing'] = 1  # one tick per site so labels line up

    # set up figure
    nsites = len(wide_df)
    nlines = math.ceil(nsites / sites_per_line)
    sites_per_line = min(sites_per_line, nsites)  # reduce if needed
    fig = plt.figure(figsize=(scalewidth * sites_per_line * 0.3,
                              scaleheight * nlines * 1.75), )

    # draw logos for each line of figure
    for iline in range(nlines):  # loop over lines
        df = wide_df.iloc[iline * sites_per_line:(iline + 1) * sites_per_line]
        isites = df.index.tolist()  # sites being plotted on this axis
        isites_set = set(isites)
        ax = plt.subplot2grid(
            shape=(nlines, sites_per_line),
            loc=(iline, 0),
            colspan=len(df),  # number of sites for this line
            fig=fig,
        )
        logo = logomaker.Logo(
            df=df,
            ax=ax,
            **logo_kwargs,
        )

        # color letters
        for site, letter in itertools.product(isites, all_letters):
            style_kwargs = {}
            if (site, letter) in colors:
                style_kwargs['color'] = colors[(site, letter)]
            if fade_letters_by_height:
                style_kwargs['alpha'] = letter_fading[(site, letter)]
            if style_kwargs:
                logo.style_single_glyph(p=site, c=letter, **style_kwargs)

        # highlight sites
        for site, highlight_color in highlight_colors.items():
            if site in isites_set:
                logo.highlight_position(
                    p=site,
                    color=highlight_color,
                    alpha=highlight_alphas[site],
                )

        # format axes and ticks
        logo.style_spines(visible=False)
        logo.style_xticks(**xticks_kwargs)
        if use_sitelabels:
            # one label per integer site in this line's range; sites
            # without a label get '' from the defaultdict
            ax.set_xticklabels([
                str(sitelabels[site])
                for site in range(min(isites),
                                  max(isites) + 1)
            ])
        ax.tick_params(
            axis='x',
            length=0,  # no xtick lines
            pad=0,  # no padding between xtick labels and axis
        )
        if ylims is not None:
            ax.set_ylim(ylims)
        ax.set_yticks([])

        # draw baseline on top of letters?
        if baseline_on_top:
            logo.draw_baseline(zorder=1)

    # set figure-wide axis labels: https://stackoverflow.com/a/53172335
    if xlabel or ylabel:
        ax_fig = fig.add_subplot(111, facecolor='none', frameon=False)
        ax_fig.tick_params(labelcolor='none',
                           top=False,
                           bottom=False,
                           left=False,
                           right=False)
        if xlabel:
            ax_fig.set_xlabel(xlabel,
                              fontsize=label_fontsize,
                              labelpad=xlabelpad)
        if ylabel:
            ax_fig.set_ylabel(ylabel, fontsize=label_fontsize)

    fig.tight_layout(h_pad=1.5)

    return fig
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Plot a multi-panel summary figure for the motifs stored in ``df[col]``.

    The figure is a 4x2 grid containing:

    * one panel per letter (at most four): box plot of the per-position
      values with the per-position mean drawn as a line;
    * a sequence logo (information content) of the alignment returned in
      ``collect["msa_t"]`` by the motif-extension helper;
    * a bar chart with the percentage of each shift value;
    * the averaged, normalised position distribution for each shift,
      read from the column named by replacing ``"_MAT"`` with
      ``"_POS_DISTR"`` in `col`;
    * a text panel with the reduced alignment.

    The figure is saved to ``next_name(env["pd-work"])`` and then shown.

    :param env: environment mapping; only ``env["pd-work"]`` is read here
        (it is also forwarded to the motif-extension helper).
    :param df: data frame whose `col` entries are dicts mapping a letter to
        a list of per-position values.
    :param col: name of the motif column.
    :param title: text inserted into the figure's super-title.
    :raises ValueError: if a collected value lies outside [0, 1].
    """
    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)

    # The first row supplies the alphabet; letters are indexed into the last
    # axis of `array` in sorted order.
    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    fig = plt.figure(figsize=(10, 12))
    shape = (4, 2)

    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))

    axes = [ax1, ax2, ax3, ax4]

    # --- per-letter panels ---
    ylim = [-0.1, 1.1]
    for letter, ax in zip(letters, axes):
        letter_idx = letter_to_idx[letter]
        positions = list()
        probs = list()
        for w_pos in range(array.shape[1]):
            for index, shift in enumerate(update_shifts):
                # Keep only positions inside the 6-wide window that starts
                # at this row's shift.
                # NOTE(review): 6 is presumably the motif width -- confirm.
                if w_pos < shift or w_pos >= shift + 6:
                    continue

                prob = array[index, w_pos, letter_idx]
                if prob < 0 or prob > 1:
                    raise ValueError("Something's up")
                positions.append(w_pos)
                probs.append(prob)

        ax.set_title(f"{letter}")

        letter_df = pd.DataFrame({"Position": positions,
                                  "Probability": probs})
        letter_df.sort_values("Position", inplace=True)
        df_mean = letter_df.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: recent seaborn releases no longer
        # accept them positionally.
        seaborn.boxplot(x="Position",
                        y="Probability",
                        data=letter_df,
                        ax=ax,
                        color="red",
                        fliersize=0)
        seaborn.lineplot(x=df_mean["Position"],
                         y=df_mean["Probability"],
                         ax=ax,
                         color="blue")
        ax.set_ylim(ylim)

    # --- sequence logo ---
    ax = ax_logo
    msa_t = collect["msa_t"]
    # NOTE(review): `_data` is a private attribute of the sequence object;
    # a public accessor would be safer if one exists -- confirm.
    seqs = [x.seq._data for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')

    lm.Logo(info_mat, ax=ax, color_scheme="classic")
    ax.set_ylim([0, 2])

    # --- percentage of each shift ---
    ax = ax_counts
    counter = Counter(update_shifts)
    total = sum(counter.values())
    # shifts 0..3 that never occur are still shown, with a zero bar
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / total]
                  for x in counter] + [[x, 0] for x in to_add]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0],
                    y=normalized[:, 1],
                    ax=ax,
                    color="blue")
    ax.set_ylim([0, 100])
    ax.set_ylabel("Probability")
    ax.set_xlabel("Shift in consensus")

    # --- position distribution per shift ---
    col_pos = col.replace("_MAT", "_POS_DISTR")
    ax = ax_pos_dist
    shift_to_pos_dist = get_position_distributions_by_shift(
        df, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        # gather the values of every distribution, keyed by position
        values = dict()
        for pos_dist in shift_to_pos_dist[s]:
            try:
                for i in pos_dist.keys():
                    values.setdefault(i, list()).append(pos_dist[i])
            except (AttributeError, TypeError):
                # entries that are not mappings (e.g. missing values) are
                # skipped on purpose
                continue

        # average per position, then normalise so the values sum to one
        for i in values.keys():
            values[i] = np.mean(values[i])
        total = sum(values.values())
        for i in values.keys():
            values[i] /= total

        x = sorted(values.keys())
        y = [values[a] for a in x]

        seaborn.lineplot(x=x, y=y, label=s, ax=ax)

    ax.legend()

    # --- text panel with the reduced alignment ---
    ax = ax_text
    from matplotlib.font_manager import FontProperties
    fp = FontProperties()
    fp.set_family("monospace")
    # build the text once so the printed and plotted versions are the same
    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax.text(0,
            0,
            msa_text,
            horizontalalignment='left',
            verticalalignment='center',
            fontproperties=fp)
    ax.set_xlim([-0.2, 0.4])
    ax.set_ylim([-0.4, 0.4])
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)  # leave room for the super-title

    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def plot_frequency(samp):
    """Draw a frequency logo from a kpLogo result table and save it as PDF.

    Reads the tab-separated kpLogo output for sample `samp` (path built from
    the module-level ``rootpath``, ``genome_name``, ``experiment``,
    ``upstreamNts`` and ``downstreamNts``), computes ``n1 / (n1 + n2)`` for
    every row, arranges the values in a position x nucleotide matrix and
    plots it with logomaker. The figure is written to
    ``<rootDir>/figures/Fig5B.pdf``.

    Position handling, as implemented below:

    * positive positions are first shifted down by one;
    * a value at position 0 is copied to matrix columns 0, 1 and 2 for
      A, C and G (any other letter at 0 only prints "stop not set");
    * remaining positive positions are moved up by two columns.

    NOTE(review): this looks like a three-nucleotide site (e.g. a stop
    codon) being expanded at position 0 -- confirm against the caller.

    :param samp: sample name used to build the input path.
    """
    inpath = "%s/FPassignment/%s/%s/analysis/kpLogo/%s/%s_%supTO%sdown_kpLogo.pass.p.cutoff.txt" % (
        rootpath, genome_name, experiment, samp, samp, upstreamNts,
        downstreamNts)

    df = pd.read_csv(inpath, sep='\t')
    print(df)
    df = df.sort_values(by=['position', '#kmer']).reset_index(drop=True)

    # frequency of the k-mer at each position: n1 / (n1 + n2)
    count_one = df['n1'].astype(float)
    count_other = df['n2'].astype(float)
    df['freq'] = count_one / (count_one + count_other)

    # keep only what the logo needs; copy so the edits below do not touch df
    dl = df[['#kmer', 'position', 'freq']].copy()
    dl.loc[dl['position'] > 0, 'position'] -= 1
    dl.loc[dl['#kmer'] == 'T', '#kmer'] = 'U'  # display as RNA

    print(dl)
    print("")

    # nucleotide x position matrix; three extra columns hold the expanded
    # position 0
    dlm = pd.DataFrame(np.zeros((4, upstreamNts + 3 + downstreamNts)))
    dlm.index = ['A', 'C', 'G', 'U']
    dlm.columns = range(-upstreamNts, downstreamNts + 3)

    print(dlm)

    for nt, pos, val in zip(dl['#kmer'], dl['position'], dl['freq']):
        if pos < 0:
            dlm.loc[nt, pos] = val
        elif pos == 0:
            if nt in ('A', 'C', 'G'):
                # one value fills all three columns of the expanded site
                dlm.loc[nt, [0, 1, 2]] = val
            else:
                print("stop not set")
        elif pos > 0:
            dlm.loc[nt, pos + 2] = val

    print(dlm)

    # logomaker expects positions as rows and characters as columns
    dlm = dlm.T

    fig, ax = plt.subplots(figsize=(30, 6))

    logo = lm.Logo(dlm, font_name='Arial', ax=ax, vpad=0.05)
    logo.draw_baseline(linewidth=1, color='black', linestyle="-")

    ax.set_ylim(0, 1)

    outpath = "%s/figures/Fig5B.pdf" % rootDir

    plt.savefig(outpath, format='pdf', bbox_inches="tight")
Beispiel #18
0
# do imports
import matplotlib.pyplot as plt
import logomaker as logomaker

# Fetch the example splice-site probability matrix bundled with logomaker.
ss_df = logomaker.get_example_matrix(
    'ss_probability_matrix',
    print_description=False,
)

# Build the logo; letters are faded by probability and the smallest letter
# of each stack is placed on top.
ss_logo = logomaker.Logo(
    ss_df,
    width=.8,
    vpad=.05,
    fade_probabilities=True,
    stack_order='small_on_top',
    color_scheme='dodgerblue',
    font_name='Rosewood Std',
)

# Logo-level styling: hide the vertical spines.
ss_logo.style_spines(spines=['left', 'right'], visible=False)

# Axes-level styling: one tick per matrix row, labelled with signed
# positions that skip zero, plus a dotted divider drawn at x = 2.5.
ss_logo.ax.set_xticks(range(len(ss_df)))
ss_logo.ax.set_xticklabels(
    ['{:+d}'.format(pos) for pos in (-3, -2, -1, 1, 2, 3, 4, 5, 6)])
ss_logo.ax.set_yticks([0, .5, 1])
ss_logo.ax.axvline(2.5, color='k', linewidth=1, linestyle=':')
ss_logo.ax.set_ylabel('probability')

# Display the figure.
ss_logo.fig.show()
Beispiel #19
0
def _cdr3_probability_matrix(seqs, seqlen):
    """Return a position x residue frequency matrix for equal-length seqs.

    Rows are positions ``0 .. seqlen - 1``, columns are the residues that
    occur in `seqs` (sorted so the column order is deterministic) and each
    cell is the fraction of sequences carrying that residue at that position.
    """
    seqs = list(seqs)
    residues = sorted(set("".join(seqs)))
    prob_df = pd.DataFrame(0.0, index=range(int(seqlen)), columns=residues)
    # Take the first sequence by position: `seqs` comes from a filtered
    # Series whose labels need not include 0.
    for pos in range(len(seqs[0])):
        counts = Counter(seq[pos] for seq in seqs)
        for residue, count in counts.items():
            prob_df.loc[pos, residue] = count / len(seqs)
    return prob_df


def cluster_logo_plot(adata, obs_col, obs_val, lengths="all"):
    """Plot TRA and TRB CDR3 sequence logos for one group of observations.

    Rows of ``adata.obs`` with ``adata.obs[obs_col] == obs_val`` are
    selected. For each chain (TRA in the left column, TRB in the right
    column) the CDR3 sequences are grouped by length and one probability
    logo is drawn per length.

    :param adata: object exposing an ``obs`` data frame with the columns
        ``TRA_cdr3``, ``TRB_cdr3``, ``TRA_cdr3_length``, ``TRB_cdr3_length``.
    :param obs_col: column of ``adata.obs`` used to select the group.
    :param obs_val: value of `obs_col` identifying the group.
    :param lengths: ``"all"`` draws one logo per distinct CDR3 length;
        ``"dominant"`` draws only the most frequent length of each chain.
    :return: the matplotlib figure.
    :raises ValueError: if `lengths` is not a supported option or no
        observation matches the selection.
    """
    length_args = ["all", "dominant"]
    if lengths not in length_args:
        raise ValueError("length argument must be one of %s" % length_args)

    logo_clust = adata.obs[adata.obs[obs_col] == obs_val][[
        "TRA_cdr3", "TRB_cdr3", "TRA_cdr3_length", "TRB_cdr3_length"
    ]]
    if logo_clust.empty:
        raise ValueError("no observations with %s == %s" % (obs_col, obs_val))

    # Number of logo rows needed for each chain
    if lengths == "dominant":
        num_alpha_lengths = 1
        num_beta_lengths = 1
    else:
        num_beta_lengths = len(set(logo_clust["TRB_cdr3_length"]))
        num_alpha_lengths = len(set(logo_clust["TRA_cdr3_length"]))

    figRows = max([num_beta_lengths, num_alpha_lengths])

    # NEED TO FIGURE OUT HOW TO MAKE A GOOD FIGSIZE CALCULATION
    # squeeze=False keeps `ax` two-dimensional even for a single row, so
    # every panel is addressed as ax[row, column].
    fig, ax = plt.subplots(nrows=figRows,
                           ncols=2,
                           figsize=(10 * figRows, 3 * figRows),
                           squeeze=False)

    logo_style = dict(width=.8,
                      vpad=.05,
                      fade_probabilities=True,
                      stack_order='small_on_top',
                      color_scheme='dodgerblue',
                      font_name='Rosewood Std')
    title_font = {"fontsize": 10, "fontweight": "bold"}
    chain_font = {"fontsize": 15, "fontweight": "bold"}

    for num, seqtype in enumerate(["TRA", "TRB"]):
        seq_col = "{seqtype}_cdr3".format(seqtype=seqtype)
        len_col = "{seqtype}_cdr3_length".format(seqtype=seqtype)
        seq_df = logo_clust[[seq_col, len_col]]

        if lengths == "dominant":
            chain_lengths = [seq_df[len_col].value_counts().idxmax()]
        else:
            chain_lengths = sorted(set(seq_df[len_col]))

        for row, seqlen in enumerate(chain_lengths):
            logo_seqs = seq_df[seq_df[len_col] == seqlen][seq_col]
            prob_df = _cdr3_probability_matrix(logo_seqs, seqlen)

            axis = ax[row, num]
            logomaker.Logo(prob_df, ax=axis, **logo_style)
            axis.set_title(
                "Number of seqs: {seqlen}".format(seqlen=len(logo_seqs)),
                title_font)

            # The first logo of each chain also gets the chain name above
            # it; the placement differs between the one-row and multi-row
            # layouts.
            if row == 0:
                if figRows == 1:
                    center = seqlen / 1.75
                    height = 1.1 + (figRows / 15)
                else:
                    center = (seqlen + .75) / 2
                    height = 1 + (figRows / 15)
                axis.text(center,
                          height,
                          "{seqtype} CDR3".format(seqtype=seqtype),
                          chain_font,
                          horizontalalignment="right")
    fig.tight_layout()

    # Hide the unused panels of whichever chain has fewer lengths
    if num_beta_lengths > num_alpha_lengths:
        for noLogo in range(num_alpha_lengths, num_beta_lengths):
            ax[noLogo, 0].axis("off")
    elif num_beta_lengths < num_alpha_lengths:
        for noLogo in range(num_beta_lengths, num_alpha_lengths):
            ax[noLogo, 1].axis("off")

    return (fig)
Beispiel #20
0
# get attribution scores and keep only the entries selected by X
attr_score = saliency(model, X, class_index=0, layer=-2, batch_size=256)
attr_score = attr_score * X

# plot attribution scores for sequences with top predictions
N, L, A = attr_score.shape
for i in range(len(X)):
    # one row per position, one column per nucleotide; fill the first A
    # columns in a single assignment instead of cell by cell
    counts_df = pd.DataFrame(data=0.0,
                             columns=list('ACGU'),
                             index=list(range(L)))
    counts_df.iloc[:, :A] = attr_score[i]

    logomaker.Logo(counts_df, figsize=(25, 2))
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    plt.xticks([])
    plt.yticks([])
    fig = plt.gcf()
    fig.savefig(str(data_path) + '/Top_Predictions/' + str(i))
    # release the figure once it is on disk; otherwise one figure per
    # sequence stays open for the rest of the run
    plt.close(fig)

fig, W, logo = plot_filters(model,
                            x_test,
                            layer=3,
                            threshold=0.5,
                            window=20,
def test_Logo_highlight_position_range():
    """Check argument validation of ``Logo.highlight_position_range``.

    Every parameter is exercised in turn through `test_parameter_values`,
    each time on a freshly created Logo built from the example CRP energy
    matrix, with values expected to fail and values expected to succeed.
    """
    crp_matrix = logomaker.get_example_matrix('crp_energy_matrix',
                                              print_description=False)

    # (parameter under test, failing values, passing values,
    #  other arguments held fixed for the call)
    cases = [
        ('pmin', ['x', 20], [0, 1, 10], {'pmax': 15}),
        ('pmax', ['x', 1], [5.5, 6, 10], {'pmin': 5}),
        ('padding', ['x', -1], [-0.5, 0, 10], {'pmin': 5, 'pmax': 10}),
        ('color', ['x', 1, True, 'wrong_color'],
         ['pink', 'red', [1, 1, 1]], {'pmin': 5, 'pmax': 10}),
        ('edgecolor', ['x', 1, True, 'wrong_color'],
         [None, 'pink', 'red', [1, 1, 1]], {'pmin': 5, 'pmax': 10}),
        ('floor', ['x', 10], [-1, 1, None], {'pmin': 5, 'pmax': 10}),
        ('ceiling', ['x', -10], [-1, 1, None], {'pmin': 5, 'pmax': 10}),
        # Note that a value of False passes for zorder. This should be
        # fixed.
        ('zorder', ['x', None], [-1, 0.5, 1], {'pmin': 5, 'pmax': 10}),
    ]

    for parameter, failing, passing, fixed_args in cases:
        test_parameter_values(
            func=logomaker.Logo(crp_matrix).highlight_position_range,
            var_name=parameter,
            fail_list=failing,
            success_list=passing,
            **fixed_args)
Beispiel #22
0
def weblogologomaker(request):
    """Build a sequence logo for the posted sequences and return it.

    Only POST requests are served; any other method raises ``Http404``.

    The request payload (``request.data``) is read for:

    * ``seqs``   -- required; forwarded to ``weblogo_aux``.
    * ``output`` -- optional, defaults to ``"png"``. ``"txt"`` returns the
      WebLogo text table; anything else returns the rendered image.
    * ``os``     -- optional, defaults to ``"linux"``; forwarded to
      ``weblogo_aux``.

    Returns
    -------
    HttpResponse
        ``text/plain`` in both cases: either the content of ``weblogo.txt``
        or the base64-encoded bytes of ``weblogo.png``.

    Notes
    -----
    NOTE(review): the function reads ``aligned.fasta`` and writes
    ``weblogo.txt`` / ``weblogo.png`` under fixed names in the current working
    directory, so concurrent requests would overwrite each other's files --
    confirm whether that is acceptable for the deployment.
    """
    if request.method != "POST":
        raise Http404

    data = request.data
    seqs = data['seqs']
    # Optional fields fall back to defaults instead of a bare ``except``.
    type_output = data.get('output', "png")
    type_os = data.get('os', "linux")

    # Return value is unused; presumably this call produces "aligned.fasta"
    # as a side effect -- TODO confirm against weblogo_aux.
    weblogo_aux(seqs, type_os)

    out_file = "aligned.fasta"
    with open(out_file, "r") as aligned_handle:
        seqs = read_seq_data(aligned_handle, alphabet="ACDEFGHIKLMNPQRSTVWY-")
    logodata = LogoData.from_seqs(seqs)
    logooptions = LogoOptions()
    logooptions.title = "VFP WEBSERVER"
    logoformat = LogoFormat(logodata, logooptions)
    weblogo_txt = txt_formatter(logodata, logoformat)

    # The formatter output is turned into text via its repr: strip the
    # leading two and trailing one characters (presumably the b'...' wrapper
    # of a bytes repr) and un-escape newlines and tabs.
    weblogo_file = "weblogo.txt"
    data_weblogo = str(weblogo_txt)[2:-1].replace('\\n', '\n').replace('\\t', '\t')
    with open(weblogo_file, "w") as txt_handle:
        txt_handle.write(data_weblogo)

    # Re-read the table just written: skip the 7 leading lines, drop the
    # last row, and strip spaces from the column names.
    weblogoDf = pd.read_csv(weblogo_file, skiprows=7, sep='\t')
    weblogoDf = weblogoDf[:-1]
    weblogoDf.columns = [name.replace(' ', '') for name in weblogoDf.columns]

    # Per-symbol columns sit between the first column and the last four.
    weblogo_entropyes = weblogoDf.loc[:, weblogoDf.columns[1:len(weblogoDf.columns) - 4]]
    entropies = list(np.log2(20) - weblogoDf.loc[:, 'Entropy'])
    weblogo_entropyes = weblogo_entropyes.mul(entropies, axis=0)
    family_weblogo = weblogo_entropyes.drop(['-'], axis=1)

    if type_output == "txt":
        with open(weblogo_file) as txt_handle:
            return HttpResponse(txt_handle.read(), content_type="text/plain")

    # Positional 0..n-1 index so each figure row can be selected by range.
    data = logomaker.transform_matrix(family_weblogo).reset_index(drop=True)

    # create figure
    height_per_row = 2
    width_per_col = 1.5
    line_size = 25

    # Ceiling division: one figure row per started block of `line_size`
    # positions (at least one row so the grid is never empty).
    num_rows = max(1, (data.shape[0] + line_size - 1) // line_size)

    fig = plt.figure(figsize=[width_per_col * line_size,
                              height_per_row * num_rows])
    try:
        max_df = data.sum(axis=1).max()

        for row in range(num_rows):
            start = row * line_size
            stop = start + line_size

            ax = plt.subplot2grid((num_rows, 1), (row, 0))
            ax.spines['right'].set_visible(False)
            ax.spines['top'].set_visible(False)
            ax.set_ylim(bottom=0, top=max_df)
            ax.set_ylabel('Bits')

            # reindex pads a short final block with all-zero rows so every
            # figure row spans exactly `line_size` positions.
            chunk = data.reindex(range(start, stop), fill_value=0)
            logomaker.Logo(chunk,
                           ax=ax,
                           color_scheme='NajafabadiEtAl2017')

            ax.set_xlim([start - 0.5, stop - 0.5])
            ax.set_ylim([0, max_df])

        image_path = "weblogo.png"
        fig.savefig(image_path)
    finally:
        # Release the figure; pyplot otherwise keeps it alive across requests.
        plt.close(fig)

    with open(image_path, "rb") as image_file:
        image_data = base64.b64encode(image_file.read()).decode('utf-8')

    return HttpResponse(image_data, content_type="text/plain")
    df = all_df[all_df['characters'] == char_set].copy()
    df.sort_values(by='color_scheme', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # for each color scheme
    for row_num, row in df.iterrows():
        # set axes
        col_num = sum(colspans[:j])
        col_span = colspans[j]
        ax = plt.subplot2grid((num_rows, num_cols), (row_num, col_num),
                              colspan=col_span)

        # get color scheme
        color_scheme = row['color_scheme']

        # make matrix for character set
        mat_df = logomaker.sequence_to_matrix(char_set)

        # make and style logo
        logomaker.Logo(mat_df,
                       ax=ax,
                       color_scheme=color_scheme,
                       show_spines=False)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(repr(color_scheme))

# style and show figure
fig.tight_layout()
fig.show()
# Ten aligned DNA sequences, 15 bases each.
li = [
    'atagccggtacggca', 'ttagctgcaaccgca', 'tcagccactagagca', 'ataaccgcgaccgca',
    'ttagccgctaaggta', 'taagcctcgtacgta', 'ttagccgttacggcc', 'atatccggtacagta',
    'atagcaggtaccgaa', 'acatccgtgacggaa'
]

# Transpose the alignment: one string per column, holding that column's
# base from every sequence in order.
new_li = [''.join(column) for column in zip(*li)]

# Rows are bases (a, c, g, t), columns are alignment positions; each cell is
# the base's count in that column divided by 5.
alphabets = ['a', 'c', 'g', 't']
position_weight_matrix = np.zeros((4, 15))
for col_idx, column in enumerate(new_li):
    for row_idx, base in enumerate(alphabets):
        position_weight_matrix[row_idx][col_idx] = column.count(base) / 5

# Logomaker expects positions as rows and characters as columns; positions
# are numbered from 1.
df = pd.DataFrame(position_weight_matrix.T, columns=['A', 'C', 'G', 'T'])
df.index = np.arange(1, len(df) + 1)

# Draw the logo, label the axes, then save and display it.
logos = logomaker.Logo(df)
axis = logos.ax
axis.set_xticks(np.arange(1, 16))
axis.set_yticks(np.arange(3))
axis.set_ylabel('Bits')
axis.set_xlabel('Sequence Position')
pml.savefig('seqlogo.pdf')
plt.show()
Beispiel #25
0
# Draw the stylized "Logomaker" logo from the bundled example matrix.
# NOTE(review): `ax` is used below but not defined in this excerpt; it is
# presumably a matplotlib Axes created earlier in the script -- confirm.
logo_df = logomaker.get_example_matrix('logomaker_logo_matrix',
                                       print_description=False)

# create color scheme: RGB triples for 'L', 'O' and 'G', a named color for
# the 'maker' entry
color_scheme = {
    'L': [0, .5, 0],
    'O': [1, 0, 0],
    'G': [1, .65, 0],
    'maker': 'gray'
}

# create Logo object
logo_logo = logomaker.Logo(logo_df,
                           ax=ax,
                           color_scheme=color_scheme,
                           baseline_width=0,
                           font_name='Arial',
                           show_spines=False,
                           vsep=.005,
                           width=.95)

# color the 'O' at the end of the logo a different color
logo_logo.style_single_glyph(c='O', p=3, color=[0, 0, 1])

# change the font of 'maker' and flip characters upright.
logo_logo.style_glyphs_below(font_name='OCR A Std', flip=False, width=1.0)

# remove tick marks
ax.set_xticks([])
ax.set_yticks([])

# tighten layout (the statement that followed is missing from this excerpt)
Beispiel #26
0
# do imports
import matplotlib.pyplot as plt
import logomaker as logomaker

# Fetch the bundled WW-domain information matrix without printing its
# description.
ww_df = logomaker.get_example_matrix(
    'ww_information_matrix',
    print_description=False,
)

# Build the logo with a custom font, color scheme, padding and glyph width.
ww_logo = logomaker.Logo(
    ww_df,
    font_name='Stencil Std',
    color_scheme='NajafabadiEtAl2017',
    vpad=.1,
    width=.8,
)

# Logo-level styling: sparse rotated x ticks, then gold highlights on
# positions 4 and 26.
ww_logo.style_xticks(anchor=0, spacing=5, rotation=45)
for highlighted_pos in (4, 26):
    ww_logo.highlight_position(p=highlighted_pos, color='gold', alpha=.5)

# Axes-level styling: y label and one position of padding on either side.
ww_axes = ww_logo.ax
ww_axes.set_ylabel('information (bits)')
ww_axes.set_xlim([-1, len(ww_df)])

# Display the figure.
ww_logo.fig.show()
Beispiel #27
0
def plot_residue_data_logo(residue_index,
                           logos,
                           interactions,
                           gap=1000,
                           letter_map=None,
                           color_scheme="chemistry",
                           ylabel=None,
                           title=None,
                           fn=None,
                           fig_close=False):
    """Render per-residue values as sequence logos with `logomaker.Logo
    <https://logomaker.readthedocs.io/en/latest/implementation.html#logo-class>`_.

    The residue data is handed to ``AxisIndex`` (with a row length of 100),
    which decides how it is split into figures and rows and which spans are
    missing. One figure is saved per entry in ``AxisIndex.breaks``.

    Parameters
    -----------
    residue_index : list
            Residue indices, expected in ascending order; a drop in the index
            is treated as the start of a new chain and hence a new figure.
    logos : list of str
            Residue names in the same order as ``residue_index``. Single
            characters are used as-is; any longer name is looked up in the
            three-letter table of the 20 common amino acids, extended by
            ``letter_map`` (an unknown name raises ``KeyError``).
    interactions : list
            Values to plot, in the same order as ``residue_index``; they set
            the letter heights.
    gap : int, optional, default=1000
            Forwarded to ``AxisIndex``. Index gaps smaller than this are
            shaded gray as missing residues; larger gaps start a new figure.
            NOTE(review): earlier documentation said the gap must be greater
            than 1000, yet the default is exactly 1000 -- confirm against
            ``AxisIndex``.
    letter_map : dict, optional, default=None
            Extra ``{"provided name": "single-letter logo"}`` entries merged
            into the built-in table.
    color_scheme : str, optional, default="chemistry"
            Color scheme passed to ``logomaker.Logo``. See
            `Color Schemes <https://logomaker.readthedocs.io/en/latest/examples.html#color-schemes>`_.
    ylabel : str, optional, default=None
            y axis label; ``None`` means "Interactions".
    title : str, optional, default=None
            Title placed on the first row of every figure, if given.
    fn : str, optional, default=None
            Output path; ``None`` means "Figure_interactions_logo.pdf" in the
            current working directory. When more than one figure is produced
            the page key is inserted before the extension.
    fig_close : bool, optional, default=False
            Call ``plt.close()`` after saving each figure to free memory.

    """
    # three-letter -> one-letter table for the 20 common amino acids
    three_to_one = {
        'ALA': 'A',
        'ARG': 'R',
        'ASN': 'N',
        'ASP': 'D',
        'CYS': 'C',
        'GLN': 'Q',
        'GLU': 'E',
        'GLY': 'G',
        'HIS': 'H',
        'ILE': 'I',
        'LEU': 'L',
        'LYS': 'K',
        'MET': 'M',
        'PHE': 'F',
        'PRO': 'P',
        'SER': 'S',
        'THR': 'T',
        'TRP': 'W',
        'TYR': 'Y',
        'VAL': 'V'
    }
    if letter_map is not None:
        three_to_one.update(letter_map)

    letters = [
        name if len(name) == 1 else three_to_one[name] for name in logos
    ]
    ylabel = "Interactions" if ylabel is None else ylabel
    if fn is None:
        fn = os.path.join(os.getcwd(), "Figure_interactions_logo.pdf")

    # Let AxisIndex work out chain breaks, gray areas and axis breaks.
    row_length = 100
    axis_obj = AxisIndex(residue_index, letters, interactions, row_length,
                         gap)
    axis_obj.sort()

    for page_idx in axis_obj.breaks.keys():
        page_rows = axis_obj.breaks[page_idx]
        n_rows = len(page_rows)
        fig, axes = plt.subplots(n_rows,
                                 1,
                                 figsize=(4.5, 1.3 * n_rows),
                                 sharey=True)
        # plt.subplots returns a bare Axes for a single row; normalize it.
        axes_arr = np.atleast_1d(axes)
        plt.subplots_adjust(hspace=0.5, left=0.2)

        row_maxima = []
        for ax_idx, ax in enumerate(axes_arr):
            entries = page_rows[ax_idx]
            resi_selected = [entry[0] for entry in entries]
            logos_selected = [entry[1] for entry in entries]
            interaction_selected = [entry[2] for entry in entries]
            row_maxima.append(np.max(interaction_selected))

            # Only draw a logo when the row carries a positive total.
            if np.sum(interaction_selected) > 0:
                long_table = pd.DataFrame({
                    "Resid": resi_selected,
                    "Resn": logos_selected,
                    "Data": interaction_selected
                })
                wide_table = long_table.pivot(index="Resid",
                                              columns='Resn',
                                              values="Data").fillna(0)
                logomaker.Logo(wide_table, color_scheme=color_scheme, ax=ax)

            # Only the bottom row gets the x axis label.
            if ax_idx == (n_rows - 1):
                ax.set_xlabel("Residue Index", fontsize=8, weight="bold")
            ax.xaxis.set_major_locator(MultipleLocator(20))
            ax.xaxis.set_minor_locator(MultipleLocator(1))
            ax.set_xlim(resi_selected[0] - 0.5, resi_selected[-1] + 0.5)
            ax.set_ylabel(ylabel, fontsize=8, weight="bold", va="center")
            tick_labels = ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels()
            for label in tick_labels:
                plt.setp(label, fontsize=8, weight="bold")

        # The y axis is shared, so setting the limit on one row is enough.
        axes_arr[-1].set_ylim(0, np.max(row_maxima) * 1.05)

        # Shade the missing areas recorded for this page, if any.
        if page_idx in axis_obj.gray_areas.keys():
            for item in axis_obj.gray_areas[page_idx]:
                axes_arr[item[0]].axvspan(item[1],
                                          item[2],
                                          facecolor="#c0c0c0",
                                          alpha=0.3)

        if title is not None:
            axes_arr[0].set_title(title, fontsize=10, weight="bold")
        plt.tight_layout()

        # A single page keeps the plain file name; otherwise the page key is
        # inserted before the extension.
        if len(axis_obj.breaks.keys()) == 1:
            target = fn
        else:
            stem, ext = os.path.splitext(fn)
            target = "{}_{}{}".format(stem, page_idx, ext)
        fig.savefig(target, dpi=300)

        if fig_close:
            plt.close()
def logo(file,
         limit=(),
         output_file=None,
         old_format=False,
         min_beta=.001,
         max_beta=100.,
         num_betas=1000):
    """Plot sequence logos using logomaker.

    The energy matrix is read from ``file``, scaled by the factor returned
    from ``information.get_beta_for_effect_df``, converted from a weight
    matrix to an information matrix and drawn as a logo.

    Parameters
    ----------
    file : str
        path to file containing energy matrix. It must have a ``pos`` column
        (used as index); columns ``val_A``/``val_C``/``val_G``/``val_T`` are
        renamed to ``A``/``C``/``G``/``T``.
    limit : Tuple, default ()
        first and last row (positional, both inclusive) of the matrix that is
        converted into a logo. An empty tuple uses the whole matrix.
    output_file : str, default None
        path where plot is saved to, if not None
    old_format : boolean, default False
        If True, the file is parsed as whitespace-delimited instead of
        comma-delimited.
    min_beta : float, default 0.001
        minimal scaling factor
    max_beta : float, default 100
        maximal scaling factor
    num_betas : int, default 1000
        number of tested scaling factors

    Returns
    -------
    binding_logo : logomaker.src.Logo.Logo

    Raises
    ------
    RuntimeError
        If ``limit`` is non-empty and does not have exactly two entries.
    """

    # Load in a binding site matrix. ``sep=r"\s+"`` is the documented
    # equivalent of the deprecated ``delim_whitespace=True``.
    arraydf = pd.read_csv(file,
                          index_col="pos",
                          sep=r"\s+" if old_format else ",")

    # Rename columns to be useable by the logomaker package
    arraydf = arraydf.rename(columns={
        'val_A': 'A',
        'val_C': 'C',
        'val_G': 'G',
        'val_T': 'T'
    })

    has_limit = len(limit) != 0
    if has_limit:
        if len(limit) != 2:
            raise RuntimeError("limit must have length 2.")
        arraydf = arraydf.iloc[limit[0]:limit[1] + 1]

    # finding scaling factor; the target information equals the number of
    # positions in the (possibly restricted) matrix
    target_info = len(arraydf.index)
    beta = information.get_beta_for_effect_df(arraydf,
                                              target_info,
                                              min_beta=min_beta,
                                              max_beta=max_beta,
                                              num_betas=num_betas)

    # use logomaker to convert energy matrix to information matrix
    binding_info = logomaker.transform_matrix(df=beta * arraydf,
                                              from_type='weight',
                                              to_type='information')
    binding_logo = logomaker.Logo(binding_info, vpad=.1, width=.8)

    # style using Logo methods
    binding_logo.style_spines(visible=False)
    binding_logo.style_spines(spines=['left', 'bottom'], visible=True)
    binding_logo.style_xticks(rotation=90, fmt='%d', anchor=0)

    # style using Axes methods
    binding_logo.ax.set_ylabel("Information (bits)", labelpad=-1)
    binding_logo.ax.xaxis.set_ticks_position('none')
    binding_logo.ax.xaxis.set_tick_params(pad=-1)
    binding_logo.ax.grid(False)
    # Relabel ticks with the requested range only when a range was given;
    # indexing the default empty tuple used to raise IndexError here.
    if has_limit:
        binding_logo.ax.set_xticklabels(np.arange(limit[0], limit[1] + 1))

    if output_file is not None:
        # Save the logo's own figure rather than whatever pyplot considers
        # the current figure.
        binding_logo.fig.savefig(output_file)

    return binding_logo
Beispiel #29
0
def output_line_plot(arguments):
    """Write the signal table and a line-plot PDF (with motif logo) for one motif.

    ``arguments`` is a single 8-tuple so the function can be mapped over a
    list of jobs: ``(mpbs_name, mpbs_num, signals, conditions, pwm,
    output_location, window_size, colors)``.

    Two files are written into ``output_location``, both named after
    ``mpbs_name`` with "(" replaced by "_" and ")" removed:

    * ``<name>.txt`` -- tab-separated table, one column per condition and one
      row per window position.
    * ``<name>.pdf`` -- one line per condition over the window, with a
      sequence logo built from ``pwm`` placed underneath.
    """
    (mpbs_name, mpbs_num, signals, conditions, pwm, output_location,
     window_size, colors) = arguments
    mpbs_name = mpbs_name.replace("(", "_").replace(")", "")

    # dump the raw signals as a tab-separated table
    table_path = os.path.join(output_location, "{}.txt".format(mpbs_name))
    with open(table_path, "w") as f:
        f.write("\t".join(conditions) + "\n")
        for pos in range(window_size):
            row = [str(signals[cond_idx][pos])
                   for cond_idx, _ in enumerate(conditions)]
            f.write("\t".join(row) + "\n")

    # to create a motif logo, we only use A, C, G, T
    counts = pd.DataFrame(data={base: pwm[base] for base in ('A', 'C', 'G', 'T')})
    counts = counts.add(1)  # add 1 to every count before normalizing
    counts_t = counts.T
    pwm_prob = (counts_t / counts_t.sum()).T
    # sum of p * log2(p) per position, plus 2, gives the information content
    weighted_log = np.log2(pwm_prob).mul(pwm_prob)
    info_content = weighted_log.T.sum() + 2
    icm = pwm_prob.mul(info_content, axis=0)

    start = -(window_size // 2)
    end = (window_size // 2) - 1
    x = np.linspace(start, end, num=window_size)

    plt.close('all')
    fig, line_ax = plt.subplots()
    for cond_idx, condition in enumerate(conditions):
        line_ax.plot(x,
                     signals[cond_idx],
                     color=colors[cond_idx],
                     label=condition)

    line_ax.text(0.15,
                 0.9,
                 'n = {}'.format(mpbs_num),
                 verticalalignment='bottom',
                 horizontalalignment='right',
                 transform=line_ax.transAxes,
                 fontweight='bold')

    # ticks only at the bottom/left, no top/right spines
    line_ax.xaxis.set_ticks_position('bottom')
    line_ax.yaxis.set_ticks_position('left')
    for side in ('top', 'right'):
        line_ax.spines[side].set_visible(False)
    line_ax.spines['left'].set_position(('outward', 15))
    line_ax.tick_params(direction='out')
    line_ax.set_xticks([start, 0, end])
    line_ax.set_xticklabels([str(start), 0, str(end)])

    min_signal = np.min(signals)
    max_signal = np.max(signals)
    line_ax.set_yticks([min_signal, max_signal])
    line_ax.set_yticklabels(
        [str(round(min_signal, 2)),
         str(round(max_signal, 2))],
        rotation=90)

    line_ax.set_title(mpbs_name, fontweight='bold')
    line_ax.set_xlim(start, end)
    line_ax.set_ylim([min_signal, max_signal])
    line_ax.legend(loc="upper right", frameon=False)
    # push the x axis down to make room for the logo
    line_ax.spines['bottom'].set_position(('outward', 70))

    # draw the motif logo in its own axes below the line plot
    logo_ax = plt.axes([0.105, 0.085, 0.85, .2])
    logomaker.Logo(icm, ax=logo_ax, show_spines=False, baseline_width=0)
    logo_ax.set_xticks([])
    logo_ax.set_yticks([])
    fig.tight_layout()

    figure_path = os.path.join(output_location, "{}.pdf".format(mpbs_name))
    plt.savefig(figure_path)
def protein_logo(positions):
    """Draw stacked sequence logos comparing the five ABCG family members.

    For each of ABCG1, ABCG2, ABCG4, ABCG5 and ABCG8 the residues at the
    requested alignment columns are collected from the module-level
    ``ABCG*_sequences`` lists (element ``[1]`` of every entry is indexed by
    position) and drawn as one row of a five-row figure. The top row is
    labelled with the positions themselves, the bottom row with the matching
    entries of ``conservation_pattern``; every column is highlighted with the
    colours returned by ``conserved_colours(positions)``.

    Arguments:
    positions -- a list of positions within the sequence alignment

    Returns:
    the matplotlib figure holding the five logos
    """
    family_names = ('ABCG1', 'ABCG2', 'ABCG4', 'ABCG5', 'ABCG8')
    family_sequences = (ABCG1_sequences, ABCG2_sequences, ABCG4_sequences,
                        ABCG5_sequences, ABCG8_sequences)

    # One list of short strings per family: the selected columns of every
    # sequence, concatenated in the order given by `positions`.
    selected = [[''.join(seq[1][i] for i in positions) for seq in sequences]
                for sequences in family_sequences]

    # Figure width scales with the number of selected columns.
    fig = plt.figure(figsize=[0.5 * len(selected[0][0]), 5])

    axes = []
    family_logos = []
    last_row = len(family_names) - 1
    for row, aligned in enumerate(selected):
        axis = plt.subplot2grid((5, 1), (row, 0))
        family_logos.append(
            lm.Logo(lm.alignment_to_matrix(aligned),
                    ax=axis,
                    color_scheme='black'))
        if row == 0:
            # top row: position numbers along the top edge
            axis.set_xticks(range(len(positions)))
            axis.set_xticklabels(positions)
            axis.xaxis.tick_top()
        elif row == last_row:
            # bottom row: ticks kept, labels assigned further down
            axis.set_xticks(range(len(positions)))
        else:
            axis.set_xticks([])
        axes.append(axis)

    # Acts on the current axes, i.e. the bottom row created last.
    plt.xticks(rotation=45, ha='right')
    bottom_axis = axes[last_row]
    bottom_axis.set_xticklabels([conservation_pattern[i] for i in positions])
    bottom_axis.tick_params(labelsize=8)

    for axis in axes:
        axis.set_yticks([])

    for axis, family in zip(axes, family_names):
        axis.set_ylabel(family, rotation=0, ha='right', fontsize=20)

    conservation_colours = conserved_colours(positions)

    # Highlight every column of every row; the column count is taken from
    # the first family's colour list.
    n_columns = len(conservation_colours[0])
    for row, family_logo in enumerate(family_logos):
        for pos in range(n_columns):
            family_logo.highlight_position(
                p=pos, color=conservation_colours[row][pos])

    return fig