def gen_logo_and_plot(seqs, n, fig, ax, first, last):
    """Draw a probability sequence logo for ``seqs`` on ``ax[n]``.

    A counts matrix is built from ``seqs`` (the characters '.', '-' and 'X'
    are ignored). Positions where the summed counts do not exceed half the
    number of sequences are dropped, the remaining positions are renumbered
    from 0, and the counts are converted to probabilities before drawing.

    Args:
        seqs: Sequences handed to ``lm.alignment_to_matrix``.
        n: Index into ``ax`` selecting the axes to draw on.
        fig: Not used by this function; kept for caller compatibility.
        ax: Indexable collection of matplotlib axes.
        first: Value shown as the label of the left-most x tick.
        last: Value shown as the label of the right-most x tick.

    Returns:
        The styled ``lm.Logo`` object.
    """
    counts = lm.alignment_to_matrix(sequences=seqs,
                                    to_type='counts',
                                    characters_to_ignore='.-X')

    # Keep only positions covered by more than half of the sequences.
    coverage = counts.sum(axis=1)
    min_coverage = len(seqs) / 2
    counts = counts[coverage > min_coverage].reset_index(drop=True)

    probabilities = lm.transform_matrix(counts,
                                        from_type='counts',
                                        to_type='probability')

    logo = lm.Logo(df=probabilities,
                   ax=ax[n],
                   stack_order='small_on_top',
                   font_name='FreeSans',
                   color_scheme=color_dict,
                   vsep=0.0005,
                   vpad=0.005)

    # Three ticks: start, middle and end, labelled with the caller's range.
    # NOTE(review): the last tick is placed at len(probabilities), one past
    # the final row index of the matrix -- confirm this is intended.
    n_positions = len(probabilities)
    tick_labels = [
        str(first),
        str(int(last - (last - first) / 2)),
        str(last),
    ]
    logo.ax.set_xticks([0, n_positions // 2, n_positions])
    logo.ax.set_xticklabels(tick_labels)
    logo.ax.set_ylabel('Probability')

    # Hide every spine, then bring back only the left and bottom ones.
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True, linewidth=1)

    # Intended to hide glyphs below 10% probability (per the original note).
    logo.fade_glyphs_in_probability_logo(0.1, 0.1000000001)

    return logo
def secondary_structure_conservation(seqs2, pdb_id, frame, logger, lbls,
                                     input_path):
    """Render the secondary-structure alignment as logos inside ``frame``.

    A counts matrix is built from ``seqs2`` and split into
    ``ceil(len(seqs2[0]) / 40)`` parts. Each part is drawn as a LogoMaker
    logo, saved to ``<input_path>/<pdb_id>_aln_secondary.png`` (the same file
    is overwritten for every part) and loaded back into a Tk ``Label`` that
    is packed into ``frame``.

    Args:
        seqs2: Sequences handed to ``lm.alignment_to_matrix``; the length of
            the first one determines the number of parts.
        pdb_id: Identifier used as the prefix of the PNG file name.
        frame: Tk container that receives the labels.
        logger: Logger used for progress messages.
        lbls: List that every created ``Label`` is appended to.
        input_path: Directory the PNG file is written to.
    """
    counts_mat2 = lm.alignment_to_matrix(seqs2)
    # Aim for roughly 40 alignment columns per logo.
    n_parts = math.ceil(len(seqs2[0]) / 40)
    logger.info("Se divide las cadenas de las estructuras secundarias en " +
                str(n_parts) +
                " partes para la generacion de graficos con LogoMaker")
    logger.info("Se utiliza el color scheme skylign_protein")
    logger.info(
        "https://academic.oup.com/bioinformatics/article/36/7/2272/5671693")

    counts_mat_list2 = np.array_split(counts_mat2, n_parts)
    alignment_label2 = Label(frame,
                             text="Alineamiento de estructura secundaria: ")
    alignment_label2.config(font=("Verdana", 20))
    lbls.append(alignment_label2)
    alignment_label2.pack(pady=(0, 30))

    png_path = input_path + "/" + pdb_id + "_aln_secondary.png"
    for part in counts_mat_list2:
        crp_logo = lm.Logo(part, color_scheme='skylign_protein')
        # style using Axes methods
        crp_logo.ax.xaxis.set_ticks_position('none')
        crp_logo.ax.xaxis.set_tick_params(pad=-1)

        # Save through the logo's own figure and close it afterwards so the
        # figures do not accumulate in pyplot's registry across iterations.
        logo_figure = crp_logo.ax.figure
        logo_figure.savefig(png_path)
        plt.close(logo_figure)

        # PhotoImage copies the pixel data, so the file can be closed here.
        with Image.open(png_path) as load:
            render = ImageTk.PhotoImage(load)
        img = Label(frame, image=render)
        lbls.append(img)
        # Keep a reference on the widget so Tk's image is not garbage
        # collected while the label is displayed.
        img.image = render
        img.pack(pady=(30, 0))
def make_sequence_logo(sequence_list, figname):
    """Draw a sequence logo for ``sequence_list`` and save it to ``figname``.

    The logo uses the "weblogo_protein" colour scheme and the Arial font, and
    is drawn without spines or axis ticks. After the current pyplot figure
    has been saved, a confirmation line is printed to standard output.

    Args:
        sequence_list: Sequences handed to ``logomaker.alignment_to_matrix``.
        figname: Destination passed to ``plt.savefig``.
    """
    counts = logomaker.alignment_to_matrix(sequence_list)
    logo = logomaker.Logo(counts,
                          font_name="Arial",
                          color_scheme="weblogo_protein",
                          width=1)

    # Bare logo: no frame and no tick marks on either axis.
    logo.style_spines(visible=False)
    for set_ticks in (logo.ax.set_xticks, logo.ax.set_yticks):
        set_ticks([])

    plt.savefig(figname)
    print(f'Written {figname}', file=sys.stdout)
def build_consensus_from_consensus(env, df, col):
    # type: (Environment, pd.DataFrame, str) -> None
    """Align the consensus sequences found in ``df[col]`` and show their logo.

    Rows whose ``col`` value is NA are ignored. The consensus sequences
    gathered from the remaining rows are aligned with ``run_msa_on_sequences``
    (gap-open penalty 10000, tree-order output), the alignment is turned into
    an information matrix and displayed as a logo with ``plt.show()``.
    Finally the distinct consensus sequences are printed, most frequent
    first, together with their counts.

    Args:
        env: Environment forwarded to the helper functions.
        df: Data frame holding the column ``col``.
        col: Name of the column the consensus sequences are gathered from.
    """
    from collections import Counter

    df = df[~df[col].isna()].copy()  # we only need non-NA

    consensus_seqs = gather_consensus_sequences(env, df, col)

    msa_t = run_msa_on_sequences(env,
                                 consensus_seqs,
                                 SBSPOptions(env, gapopen=10000),
                                 outputorder="tree-order")

    # Convert through str() rather than the private ``Seq._data`` attribute,
    # whose type is an implementation detail (bytes in current Biopython).
    seqs = [str(x.seq) for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')

    lm.Logo(info_mat)
    plt.show()

    print("New set")
    sorted_counter = Counter(consensus_seqs).most_common()
    print("\n".join([str(x) for x in sorted_counter]))
# Example #5
# 0
def haplotype_matrix_calculator(node_sequence_matrix):
    """Build the logomaker matrix for the sequences of a given node.

    The node's sequence matrix is converted to a list of sequences, spaces
    are replaced by the gap character "-", and the result is passed to
    ``logomaker.alignment_to_matrix``.

    Args:
        node_sequence_matrix: Input handed to ``nseqmatrix_to_seqlist``.

    Returns:
        The matrix produced by ``logomaker.alignment_to_matrix``, or ``None``
        when every sequence consists of gaps only (this includes the case of
        an empty sequence list).

    Note:
        Any error during the computation is reported on stderr and the
        process exits with status 1.
    """
    try:
        sequence_list = [
            sequence.replace(" ", "-")
            for sequence in nseqmatrix_to_seqlist(node_sequence_matrix)
        ]

        if all(sequence_is_gapsonly(sequence) for sequence in sequence_list):
            frequency_matrix = None
        else:
            frequency_matrix = logomaker.alignment_to_matrix(sequence_list)
    except Exception as error:
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit are not swallowed; the cause is reported as well.
        sys.stderr.write(
            "Error at calculating haplotype matrix (feature_processing.haplotype_matrix_calculator).\n"
        )
        sys.stderr.write("%s: %s\n" % (type(error).__name__, error))
        sys.exit(1)

    return frequency_matrix
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Plot per-letter motif probabilities by position with summary panels.

    The figure is a 4x2 grid:

    * rows 0-1: one panel per letter (at most four) showing a box plot of
      the letter's probability at each position and a line through the
      per-position means;
    * row 2, left: bar plot of the percentage of entries per shift;
    * row 2, right: normalised position distribution, one line per shift;
    * row 3, left: information-content logo of the alignment stored by the
      helper in ``collect["msa_t"]``;
    * row 3, right: text rendering of the reduced alignment.

    The reduced alignment is also printed, the figure is saved to
    ``next_name(env["pd-work"])`` and then shown.

    Args:
        env: Environment; forwarded to helpers and read for ``"pd-work"``.
        df: Data frame holding ``col`` and the matching ``*_POS_DISTR``
            column (``col`` with ``"_MAT"`` replaced by ``"_POS_DISTR"``).
        col: Name of the column holding the motif dictionaries
            (letter -> list of per-position values).
        title: Text inserted into the figure title.

    Raises:
        ValueError: If a collected value lies outside ``[0, 1]``.
    """
    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)

    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    # The last axis of `array` is indexed by the letter's rank in sorted order.
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    plt.figure(figsize=(10, 12))
    shape = (4, 2)

    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))

    axes = [ax1, ax2, ax3, ax4]

    # --- per-letter panels -------------------------------------------------
    ylim = [-0.1, 1.1]
    for letter, ax in zip(letters, axes):
        letter_idx = letter_to_idx[letter]
        positions = list()
        probabilities = list()

        for w_pos in range(array.shape[1]):
            for index, shift in enumerate(update_shifts):
                # Only positions inside the 6-wide window starting at the
                # entry's shift are collected.
                # NOTE(review): 6 is presumably the motif width -- confirm.
                if w_pos < shift or w_pos >= shift + 6:
                    continue

                probability = array[index, w_pos, letter_idx]
                if probability < 0 or probability > 1:
                    raise ValueError("Something's up")

                positions.append(w_pos)
                probabilities.append(probability)

        ax.set_title(f"{letter}")

        # Separate name so the `df` argument stays available below.
        df_letter = pd.DataFrame({
            "Position": positions,
            "Probability": probabilities
        })
        df_letter.sort_values("Position", inplace=True)
        df_mean = df_letter.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: recent seaborn releases reject them as
        # positional arguments, while older releases accept both forms.
        seaborn.boxplot(x="Position",
                        y="Probability",
                        data=df_letter,
                        ax=ax,
                        color="red",
                        fliersize=0)
        seaborn.lineplot(x=df_mean["Position"],
                         y=df_mean["Probability"],
                         ax=ax,
                         color="blue")
        ax.set_ylim(ylim)

    # --- logo --------------------------------------------------------------
    ax = ax_logo
    msa_t = collect["msa_t"]
    # str() is the public conversion; the private ``Seq._data`` attribute is
    # an implementation detail (bytes in current Biopython).
    seqs = [str(x.seq) for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')

    lm.Logo(info_mat, ax=ax, color_scheme="classic")
    ax.set_ylim([0, 2])

    # --- distribution of shifts ---------------------------------------------
    ax = ax_counts
    counter = Counter(update_shifts)
    total = sum(counter.values())
    # Shifts 0..3 that never occur are still shown, with a zero-height bar.
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / total]
                  for x in counter] + [[x, 0] for x in to_add]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0],
                    y=normalized[:, 1],
                    ax=ax,
                    color="blue")
    ax.set_ylim([0, 100])
    ax.set_ylabel("Probability")
    ax.set_xlabel("Shift in consensus")

    # --- position distribution per shift -----------------------------------
    col_pos = col.replace("_MAT", "_POS_DISTR")
    ax = ax_pos_dist
    shift_to_pos_dist = get_position_distributions_by_shift(
        df, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        # Collect, per key, the values of every distribution of this shift.
        values = dict()
        for pos_dist in shift_to_pos_dist[s]:
            try:
                for i in pos_dist.keys():
                    values.setdefault(i, list()).append(pos_dist[i])
            except Exception:
                # Best effort, as before: entries that do not behave like a
                # dict are skipped.
                continue

        # Average per key, then normalise so the averages sum to one.
        values = {i: np.mean(v) for i, v in values.items()}
        total = sum(values.values())
        values = {i: v / total for i, v in values.items()}

        x = sorted(values.keys())
        y = [values[a] for a in x]

        seaborn.lineplot(x=x, y=y, label=s, ax=ax)

    ax.legend()

    # --- text panel --------------------------------------------------------
    ax = ax_text
    from matplotlib.font_manager import FontProperties
    fp = FontProperties()
    fp.set_family("monospace")
    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax.text(0,
            0,
            msa_text,
            horizontalalignment='left',
            verticalalignment='center',
            fontproperties=fp)
    ax.set_xlim([-0.2, 0.4])
    ax.set_ylim([-0.4, 0.4])
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))
    plt.tight_layout()
    # Leave room for the suptitle above the grid.
    plt.subplots_adjust(top=0.9)

    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def protein_logo(positions):
    """Draw stacked sequence logos comparing the ABCG family members.

    For each of ABCG1, ABCG2, ABCG4, ABCG5 and ABCG8 the characters at
    ``positions`` are extracted from every sequence of the corresponding
    module-level ``*_sequences`` collection and drawn as one logo row. The
    top row is labelled with the positions themselves, the bottom row with
    the matching ``conservation_pattern`` entries, and every position of
    every row is highlighted with the colour from ``conserved_colours``.

    Arguments:
    positions -- a list of positions within the sequence alignment

    Returns the matplotlib figure holding the five logos.
    """
    family_names = ('ABCG1', 'ABCG2', 'ABCG4', 'ABCG5', 'ABCG8')
    family_sequences = (ABCG1_sequences, ABCG2_sequences, ABCG4_sequences,
                        ABCG5_sequences, ABCG8_sequences)

    # One list of extracted strings per family; entry[1] of each record is
    # the object that is indexed by position.
    extracted = [[''.join(entry[1][i] for i in positions)
                  for entry in records]
                 for records in family_sequences]

    fig = plt.figure(figsize=[0.5 * len(extracted[0][0]), 5])

    n_rows = len(family_names)
    axes = []
    logos = []
    for row, family_columns in enumerate(extracted):
        row_ax = plt.subplot2grid((5, 1), (row, 0))
        logos.append(
            lm.Logo(lm.alignment_to_matrix(family_columns),
                    ax=row_ax,
                    color_scheme='black'))
        if row == 0:
            # Top row: position numbers shown above the logo.
            row_ax.set_xticks(range(len(positions)))
            row_ax.set_xticklabels(positions)
            row_ax.xaxis.tick_top()
        elif row == n_rows - 1:
            # Bottom row: ticks are labelled further down.
            row_ax.set_xticks(range(len(positions)))
        else:
            row_ax.set_xticks([])
        axes.append(row_ax)

    # Acts on pyplot's current axes (same point in the call sequence as
    # before: right after the bottom row has been set up).
    plt.xticks(rotation=45, ha='right')
    bottom_ax = axes[-1]
    bottom_ax.set_xticklabels([conservation_pattern[i] for i in positions])
    bottom_ax.tick_params(labelsize=8)

    for row_ax, family in zip(axes, family_names):
        row_ax.set_yticks([])
        row_ax.set_ylabel(family, rotation=0, ha='right', fontsize=20)

    conservation_colours = conserved_colours(positions)

    # The number of highlighted positions is taken from the first row's
    # colour list for every family.
    for row, logo in enumerate(logos):
        for pos in range(len(conservation_colours[0])):
            logo.highlight_position(p=pos,
                                    color=conservation_colours[row][pos])

    return fig
# Example #8
# 0
    motifs.sort()
 
    print ("Motifs")
    for motif in motifs:
        print (motif)
        
    if len(expected)>0:  
        differences = 0
        for e,m in zip(expected,motifs):
            if e!=m:
                if differences==0:
                    print ('Differences')
                print (e,m)
                differences+=1
        if differences==0:
            print ('No differences detected') 
            
    alignment_matrix    = lm.alignment_to_matrix(motifs)
    most_frequent_bases = alignment_matrix.idxmax(axis=1)
    consensus           = ''.join(most_frequent_bases.tolist())
    ax1.set_title(consensus)
    lm.Logo(alignment_matrix,ax=ax3)
    
    if len(expected)>0:
        ax4.set_title("Expected")
        lm.Logo(lm.alignment_to_matrix(expected),ax=ax4)
    plt.savefig(os.path.basename(__file__).split('.')[0] )
    
    plt.show()

# Make a histogram of the GC content and save it.
# sys.argv[3] is used as the prefix of every output file name.
plt.figure(1)
plt.hist(df_GC_frac['GC_content'], bins = 20, color='orange', edgecolor='black')
plt.xlabel('GC content (%)')
plt.ylabel('Frequency')
plt.savefig(sys.argv[3]+'_GC_content_histogram.png')

# Make the DNA propensity logo of each of the sites.
# The antisense flanks are merged into the sense lists with the 5'/3' ends
# swapped (antisense 3' -> sense 5', antisense 5' -> sense 3'). Note that
# this extends the sense lists in place.
list_CREsense_5prime_flank.extend(list_CREantisense_3prime_flank)
list_CREsense_3prime_flank.extend(list_CREantisense_5prime_flank)

counts_5prime_flank = lm.alignment_to_matrix(list_CREsense_5prime_flank)
counts_3prime_flank = lm.alignment_to_matrix(list_CREsense_3prime_flank)

# Counts logo of the 5' flank.
plt.figure(2)
logo_5prime = lm.Logo(counts_5prime_flank, stack_order='small_on_top')
logo_5prime.ax.set_xlabel('Position')
logo_5prime.ax.set_ylabel('Counts')
plt.savefig(sys.argv[3]+'_countslogo_5prime.png')

# Counts logo of the 3' flank.
plt.figure(3)
logo_3prime = lm.Logo(counts_3prime_flank, stack_order='small_on_top')
logo_3prime.ax.set_xlabel('Position')
# The y label belongs on the 3' logo (it was previously set on the 5' logo
# again, leaving this plot without a y label).
logo_3prime.ax.set_ylabel('Counts')
plt.savefig(sys.argv[3]+'_logo_3prime.png')