def gen_logo_and_plot(seqs, n, fig, ax, first, last):
    """Draw a probability logo for an alignment on ``ax[n]`` and return it.

    Alignment columns in which at most half of the sequences contribute a
    counted character ('.', '-' and 'X' are ignored while counting) are
    dropped before the logo is drawn.

    Args:
        seqs: Aligned sequences handed to ``lm.alignment_to_matrix``.
        n: Index into ``ax`` selecting the axes to draw on.
        fig: Not used by this function; kept for existing callers.
        ax: Indexable collection of matplotlib axes.
        first: Value used as the label of the leftmost x tick.
        last: Value used as the label of the rightmost x tick.

    Returns:
        The ``lm.Logo`` instance that was drawn.
    """
    # Count matrix of the alignment.
    counts = lm.alignment_to_matrix(sequences=seqs,
                                    to_type='counts',
                                    characters_to_ignore='.-X')

    # Drop gapped positions: keep columns covered by more than half the
    # sequences, then renumber the surviving columns from zero.
    occupancy = counts.sum(axis=1)
    counts = counts[occupancy > len(seqs) / 2].reset_index(drop=True)

    probabilities = lm.transform_matrix(counts,
                                        from_type='counts',
                                        to_type='probability')

    logo = lm.Logo(df=probabilities,
                   ax=ax[n],
                   stack_order='small_on_top',
                   font_name='FreeSans',
                   color_scheme=color_dict,
                   vsep=0.0005,
                   vpad=0.005)

    # Three x ticks (start, middle, end) labelled with the original
    # coordinates supplied by the caller.
    n_columns = len(probabilities)
    span = last - first
    tick_labels = [str(first), str(int(last - span / 2)), str(last)]
    axes = logo.ax
    axes.set_xticks([0, int(n_columns / 2), n_columns])
    axes.set_xticklabels(tick_labels)
    axes.set_ylabel('Probability')

    # Only the left and bottom spines remain visible.
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True, linewidth=1)

    # Hide glyphs whose probability is below 10%; the two thresholds are
    # practically equal, so the transition is effectively a hard cut-off.
    logo.fade_glyphs_in_probability_logo(0.1, 0.1000000001)
    return logo
def secondary_structure_conservation(seqs2, pdb_id, frame, logger, lbls, input_path):
    """Render the secondary-structure alignment as logos inside a Tk frame.

    The alignment count matrix is split into roughly 40-column chunks. Each
    chunk is drawn as a LogoMaker logo, written to
    ``<input_path>/<pdb_id>_aln_secondary.png`` (the same file is reused for
    every chunk) and then loaded back into a Tk ``Label`` packed into
    ``frame``.

    Args:
        seqs2: Aligned secondary-structure strings; ``seqs2[0]`` determines
            the alignment length.
        pdb_id: Identifier used to build the image file name.
        frame: Tk container that receives the labels.
        logger: Object exposing ``info(message)``.
        lbls: List to which every created label is appended.
        input_path: Directory in which the image file is written.
    """
    counts_mat2 = lm.alignment_to_matrix(seqs2)
    n_chunks = math.ceil(len(seqs2[0]) / 40)
    logger.info("Se divide las cadenas de las estructuras secundarias en " +
                str(n_chunks) +
                " partes para la generacion de graficos con LogoMaker")
    logger.info("Se utiliza el color scheme skylign_protein")
    logger.info(
        "https://academic.oup.com/bioinformatics/article/36/7/2272/5671693")
    counts_mat_list2 = np.array_split(counts_mat2, n_chunks)

    alignment_label2 = Label(frame,
                             text="Alineamiento de estructura secundaria: ")
    alignment_label2.config(font=("Verdana", 20))
    lbls.append(alignment_label2)
    alignment_label2.pack(pady=(0, 30))

    image_path = input_path + "/" + pdb_id + "_aln_secondary.png"
    for df in counts_mat_list2:
        crp_logo = lm.Logo(df, color_scheme='skylign_protein')
        # style using Axes methods
        crp_logo.ax.xaxis.set_ticks_position('none')
        crp_logo.ax.xaxis.set_tick_params(pad=-1)

        logo_figure = crp_logo.ax.figure
        logo_figure.savefig(image_path)
        # Release the figure once it is on disk; otherwise every chunk
        # leaves an open matplotlib figure behind for the GUI's lifetime.
        plt.close(logo_figure)

        # PhotoImage copies the pixel data, so the file can be closed
        # as soon as the Tk image has been built.
        with Image.open(image_path) as load:
            render = ImageTk.PhotoImage(load)
        img = Label(frame, image=render)
        lbls.append(img)
        # Keep a reference on the widget so Tk's image is not garbage
        # collected while the label is displayed.
        img.image = render
        img.pack(pady=(30, 0))
def make_sequence_logo(sequence_list, figname):
    """Draw a sequence logo for ``sequence_list`` and save it to ``figname``.

    The logo is drawn without spines and without x/y ticks. A confirmation
    line is printed to stdout after the file has been written.

    Args:
        sequence_list: Aligned sequences passed to
            ``logomaker.alignment_to_matrix``.
        figname: Output path handed to matplotlib's ``savefig``.
    """
    seqlogo_matrix = logomaker.alignment_to_matrix(sequence_list)
    seqlogo = logomaker.Logo(seqlogo_matrix,
                             font_name="Arial",
                             color_scheme="weblogo_protein",
                             width=1)
    seqlogo.style_spines(visible=False)
    seqlogo.ax.set_xticks([])
    seqlogo.ax.set_yticks([])

    # Save the figure that owns the logo's axes instead of relying on
    # pyplot's notion of the "current" figure.
    seqlogo.ax.figure.savefig(figname)
    # One-element tuple keeps %-formatting correct even if figname is a tuple.
    print('Written %s' % (figname,), file=sys.stdout)
def build_consensus_from_consensus(env, df, col):
    # type: (Environment, pd.DataFrame, str) -> None
    """Align the consensus sequences in ``df[col]`` and show them as a logo.

    Rows whose ``col`` value is NA are ignored. The remaining consensus
    sequences are aligned with ``run_msa_on_sequences`` (gap-open penalty
    10000, tree-ordered output), converted to an information matrix and
    displayed with LogoMaker. Finally the distinct consensus sequences are
    printed to stdout with their counts, most frequent first.

    Args:
        env: Environment object forwarded to the project helpers.
        df: Data frame holding the consensus column.
        col: Name of the column to read.
    """
    from collections import Counter

    df = df[~df[col].isna()].copy()  # we only need non-NA
    consensus_seqs = gather_consensus_sequences(env, df, col)
    msa_t = run_msa_on_sequences(env,
                                 consensus_seqs,
                                 SBSPOptions(env, gapopen=10000),
                                 outputorder="tree-order")

    # str(seq) is the public way to obtain the letters; the private
    # ``_data`` attribute holds bytes on recent Biopython releases.
    seqs = [str(x.seq) for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')

    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')
    lm.Logo(info_mat)
    plt.show()

    print("New set")
    sorted_counter = Counter(consensus_seqs).most_common()
    print("\n".join(str(x) for x in sorted_counter))
def haplotype_matrix_calculator(node_sequence_matrix):
    """Build the logomaker matrix for the sequences of one node.

    The node's sequences are obtained through ``nseqmatrix_to_seqlist`` and
    every space is replaced by a gap character ('-') before the matrix is
    computed.

    Args:
        node_sequence_matrix: Node data accepted by
            ``nseqmatrix_to_seqlist``.

    Returns:
        The result of ``logomaker.alignment_to_matrix`` for the node's
        sequences, or ``None`` when every sequence consists of gaps only
        (including the case of no sequences at all).

    Note:
        Any error raised while building the matrix is reported on stderr
        and terminates the process with exit status 1.
    """
    try:
        sequence_list = [
            sequence.replace(" ", "-")
            for sequence in nseqmatrix_to_seqlist(node_sequence_matrix)
        ]
        if all(sequence_is_gapsonly(sequence) for sequence in sequence_list):
            frequency_matrix = None
        else:
            frequency_matrix = logomaker.alignment_to_matrix(sequence_list)
    except Exception as error:
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate untouched.
        sys.stderr.write(
            "Error at calculating haplotype matrix (feature_processing.haplotype_matrix_calculator).\n"
        )
        # Report the underlying cause instead of discarding it.
        sys.stderr.write("%s: %s\n" % (type(error).__name__, error))
        sys.exit(1)
    return frequency_matrix
def plot_letter_over_position(env, df, col, title=""):
    # type: (Environment, pd.DataFrame, str, str) -> None
    """Plot per-letter motif probabilities by position plus summary panels.

    A 4x2 grid is drawn on a new figure:

    * rows 0-1: one panel per letter (at most four) with a box plot of the
      probabilities at each position and a line through the per-position
      means;
    * row 2: bar plot of the shift frequencies (left) and the averaged,
      normalised position distributions per shift (right);
    * row 3: information logo of the alignment stored in ``collect["msa_t"]``
      (left) and a monospace text rendering of the reduced alignment (right).

    The figure is saved to ``next_name(env["pd-work"])`` and then shown.

    Args:
        env: Environment; ``env["pd-work"]`` is passed to ``next_name``.
        df: Data frame. ``df[col]`` holds, per row, a mapping from letter to
            a list of values; the column named like ``col`` with ``"_MAT"``
            replaced by ``"_POS_DISTR"`` is read by
            ``get_position_distributions_by_shift``.
        col: Name of the motif column.
        title: Text inserted into the figure title.

    Raises:
        ValueError: If a collected probability lies outside ``[0, 1]``.
    """
    from matplotlib.font_manager import FontProperties

    collect = dict()
    array, update_shifts = create_numpy_for_column_with_extended_motif(
        env, df, col, collect)

    example = df.at[df.index[0], col]  # type: Dict[str, List[float]]
    letters = example.keys()
    # Index of each letter along the last axis of ``array``.
    letter_to_idx = {x: x_pos for x_pos, x in enumerate(sorted(letters))}

    plt.figure(figsize=(10, 12))
    shape = (4, 2)
    ax1 = plt.subplot2grid(shape, (0, 0))
    ax2 = plt.subplot2grid(shape, (0, 1))
    ax3 = plt.subplot2grid(shape, (1, 0))
    ax4 = plt.subplot2grid(shape, (1, 1))
    ax_logo = plt.subplot2grid(shape, (3, 0))
    ax_counts = plt.subplot2grid(shape, (2, 0))
    ax_pos_dist = plt.subplot2grid(shape, (2, 1))
    ax_text = plt.subplot2grid(shape, (3, 1))
    axes = [ax1, ax2, ax3, ax4]

    # ---- per-letter probability panels ----
    ylim = [-0.1, 1.1]
    # Each row only contributes the 6 positions starting at its shift.
    # NOTE(review): 6 is presumably the unshifted motif width - confirm.
    motif_width = 6
    for letter, ax in zip(letters, axes):
        letter_idx = letter_to_idx[letter]
        positions = []
        probabilities = []
        for w_pos in range(array.shape[1]):
            for index, shift in enumerate(update_shifts):
                if w_pos < shift or w_pos >= shift + motif_width:
                    continue
                value = array[index, w_pos, letter_idx]
                if value < 0 or value > 1:
                    raise ValueError("Something's up")
                positions.append(w_pos)
                probabilities.append(value)

        ax.set_title(f"{letter}")
        df_letter = pd.DataFrame({
            "Position": positions,
            "Probability": probabilities
        })
        df_letter.sort_values("Position", inplace=True)
        df_mean = df_letter.groupby("Position", as_index=False).mean()

        # x/y are passed by keyword: seaborn >= 0.12 rejects them as
        # positional arguments.
        seaborn.boxplot(x="Position",
                        y="Probability",
                        data=df_letter,
                        ax=ax,
                        color="red",
                        fliersize=0)
        seaborn.lineplot(x=df_mean["Position"],
                         y=df_mean["Probability"],
                         ax=ax,
                         color="blue")
        ax.set_ylim(ylim)

    # ---- information logo ----
    msa_t = collect["msa_t"]
    # str(seq) is the public accessor; the private ``_data`` attribute holds
    # bytes on recent Biopython releases.
    seqs = [str(x.seq) for x in msa_t.list_alignment_sequences]
    counts_mat = lm.alignment_to_matrix(sequences=seqs,
                                        to_type='counts',
                                        characters_to_ignore='.-X')
    # Counts matrix -> Information matrix
    info_mat = lm.transform_matrix(counts_mat,
                                   from_type='counts',
                                   to_type='information')
    lm.Logo(info_mat, ax=ax_logo, color_scheme="classic")
    ax_logo.set_ylim([0, 2])

    # ---- distribution of shifts ----
    counter = Counter(update_shifts)
    n_shifts = sum(counter.values())
    # Shifts 0..3 that never occur still get a zero-height bar.
    to_add = sorted(set(range(4)).difference(counter.keys()))
    normalized = [[x, 100 * counter[x] / n_shifts] for x in counter]
    normalized += [[x, 0] for x in to_add]
    normalized = np.array(normalized)
    seaborn.barplot(x=normalized[:, 0],
                    y=normalized[:, 1],
                    ax=ax_counts,
                    color="blue")
    ax_counts.set_ylim([0, 100])
    ax_counts.set_ylabel("Probability")
    ax_counts.set_xlabel("Shift in consensus")

    # ---- position distribution per shift ----
    col_pos = col.replace("_MAT", "_POS_DISTR")
    shift_to_pos_dist = get_position_distributions_by_shift(
        df, col_pos, update_shifts)
    for s in sorted(shift_to_pos_dist.keys()):
        # Gather, per position key, the values of every distribution.
        values = dict()
        for pos_dist in shift_to_pos_dist[s]:
            try:
                for i in pos_dist.keys():
                    values.setdefault(i, list()).append(pos_dist[i])
            except (AttributeError, TypeError, KeyError):
                # Best effort: skip entries that are not mappings.
                continue

        # Average each position, then normalise so the means sum to one.
        means = {i: np.mean(collected) for i, collected in values.items()}
        mean_total = sum(means.values())
        x = sorted(means.keys())
        y = [means[a] / mean_total for a in x]
        seaborn.lineplot(x=x, y=y, label=s, ax=ax_pos_dist)
    ax_pos_dist.legend()

    # ---- text panel with the reduced alignment ----
    fp = FontProperties()
    fp.set_family("monospace")
    msa_text = print_reduced_msa(msa_t, True, n=10)
    print(msa_text)
    ax_text.text(0,
                 0,
                 msa_text,
                 horizontalalignment='left',
                 verticalalignment='center',
                 fontproperties=fp)
    ax_text.set_xlim([-0.2, 0.4])
    ax_text.set_ylim([-0.4, 0.4])
    ax_text.get_xaxis().set_visible(False)
    ax_text.get_yaxis().set_visible(False)

    plt.suptitle("Gc range: {}. Num Data points: {}".format(
        title, msa_t.number_of_sequences()))
    plt.tight_layout()
    # Leave room for the suptitle, which tight_layout does not account for.
    plt.subplots_adjust(top=0.9)
    plt.savefig(next_name(env["pd-work"]))
    plt.show()
def protein_logo(positions):
    """Draws a sequence logo of the positions requested showing differences
    between ABCG family members.

    One logo row is drawn per family (ABCG1, ABCG2, ABCG4, ABCG5, ABCG8),
    stacked top to bottom. The top row is labelled with the alignment
    positions, the bottom row with the matching entries of
    ``conservation_pattern``, and every column of every row is highlighted
    with the colour returned by ``conserved_colours``.

    Arguments:
    positions -- a list of positions within the sequence alignment

    Returns:
    The matplotlib figure holding the five logos.
    """
    families = (
        ('ABCG1', ABCG1_sequences),
        ('ABCG2', ABCG2_sequences),
        ('ABCG4', ABCG4_sequences),
        ('ABCG5', ABCG5_sequences),
        ('ABCG8', ABCG8_sequences),
    )
    n_rows = len(families)

    # Reduce every sequence (second element of each entry) to the requested
    # alignment columns.
    family_columns = [[''.join(seq[1][i] for i in positions)
                       for seq in sequences]
                      for _, sequences in families]

    fig = plt.figure(figsize=[0.5 * len(family_columns[0][0]), 5])

    # One axes and one black logo per family, stacked vertically.
    axes = []
    logos = []
    for row, columns in enumerate(family_columns):
        axis = plt.subplot2grid((n_rows, 1), (row, 0))
        logos.append(
            lm.Logo(lm.alignment_to_matrix(columns),
                    ax=axis,
                    color_scheme='black'))
        axes.append(axis)

    top_axis = axes[0]
    bottom_axis = axes[-1]
    tick_locations = range(len(positions))

    # Top row: alignment positions shown above the logo.
    top_axis.set_xticks(tick_locations)
    top_axis.set_xticklabels(positions)
    top_axis.xaxis.tick_top()

    # Middle rows carry no x ticks.
    for axis in axes[1:-1]:
        axis.set_xticks([])

    # Bottom row: conservation pattern as tick labels. plt.xticks acts on
    # pyplot's current axes - presumably the bottom row, being the axes
    # created last.
    bottom_axis.set_xticks(tick_locations)
    plt.xticks(rotation=45, ha='right')
    bottom_axis.set_xticklabels([conservation_pattern[i] for i in positions])
    bottom_axis.tick_params(labelsize=8)

    # Family name as a horizontal y label; no y ticks anywhere.
    for axis, (family_name, _) in zip(axes, families):
        axis.set_yticks([])
        axis.set_ylabel(family_name, rotation=0, ha='right', fontsize=20)

    # Background colour per column; the first row's length determines how
    # many columns are highlighted in every row.
    conservation_colours = conserved_colours(positions)
    n_highlighted = len(conservation_colours[0])
    for row, logo in enumerate(logos):
        row_colours = conservation_colours[row]
        for pos in range(n_highlighted):
            logo.highlight_position(p=pos, color=row_colours[pos])

    return fig
from itertools import zip_longest

# Print the motifs in sorted order.
motifs.sort()
print("Motifs")
for motif in motifs:
    print(motif)

# Compare against the expected motifs, pair by pair.
if len(expected) > 0:
    differences = 0
    # zip_longest pads the shorter list with None, so a missing or surplus
    # motif is reported as a difference instead of being silently dropped
    # (plain zip stops at the shorter list).
    for e, m in zip_longest(expected, motifs):
        if e != m:
            if differences == 0:
                print('Differences')
            print(e, m)
            differences += 1
    if differences == 0:
        print('No differences detected')

# Consensus = most frequent character at every alignment position.
alignment_matrix = lm.alignment_to_matrix(motifs)
most_frequent_bases = alignment_matrix.idxmax(axis=1)
consensus = ''.join(most_frequent_bases.tolist())
ax1.set_title(consensus)
lm.Logo(alignment_matrix, ax=ax3)

if len(expected) > 0:
    ax4.set_title("Expected")
    lm.Logo(lm.alignment_to_matrix(expected), ax=ax4)

# The figure is named after this script: its basename up to the first dot.
plt.savefig(os.path.basename(__file__).split('.')[0])
plt.show()
# Make a histogram of the GC content and save it.
plt.figure(1)
plt.hist(df_GC_frac['GC_content'],
         bins=20,
         color='orange',
         edgecolor='black')
plt.xlabel('GC content (%)')
plt.ylabel('Frequency')
plt.savefig(sys.argv[3] + '_GC_content_histogram.png')

# Make the DNA propensity logo of each flank.
# The antisense 3' flanks are appended to the sense 5' flanks, and the
# antisense 5' flanks to the sense 3' flanks, so that each logo is built
# from one combined list (the sense lists are extended in place).
list_CREsense_5prime_flank.extend(list_CREantisense_3prime_flank)
list_CREsense_3prime_flank.extend(list_CREantisense_5prime_flank)

counts_5prime_flank = lm.alignment_to_matrix(list_CREsense_5prime_flank)
counts_3prime_flank = lm.alignment_to_matrix(list_CREsense_3prime_flank)

plt.figure(2)
logo_5prime = lm.Logo(counts_5prime_flank, stack_order='small_on_top')
logo_5prime.ax.set_xlabel('Position')
logo_5prime.ax.set_ylabel('Counts')
plt.savefig(sys.argv[3] + '_countslogo_5prime.png')

plt.figure(3)
logo_3prime = lm.Logo(counts_3prime_flank, stack_order='small_on_top')
logo_3prime.ax.set_xlabel('Position')
# Label the 3' logo itself; this previously re-labelled the 5' logo and
# left the 3' logo without a y-axis label.
logo_3prime.ax.set_ylabel('Counts')
plt.savefig(sys.argv[3] + '_logo_3prime.png')