def visualize_nucl_substitution_matrix(shms_df, output_fname, log):
    """Draw a heatmap of nucleotide substitution frequencies.

    Args:
        shms_df: container that yields keys when iterated and returns an
            iterable of SHM objects when indexed by such a key.
        output_fname: passed through to ``utils.output_figure``.
        log: logger; ``log.info`` is called when the plot is skipped.

    Only SHMs for which ``is_substitution()`` is true and whose read and
    gene nucleotides both pass ``nucl_is_valid`` are counted. Counts are
    normalised by the total number of counted SHMs. If nothing was counted
    the plot is skipped instead of dividing by zero.
    """
    nucl_list = ['A', 'C', 'G', 'T']
    nucl_matrix = [[0] * len(nucl_list) for _ in nucl_list]
    num_shms = 0
    for it in shms_df:
        for shm in shms_df[it]:
            if not shm.is_substitution():
                continue
            if nucl_is_valid(shm.read_nucl) and nucl_is_valid(shm.gene_nucl):
                # Rows are indexed by the read nucleotide, columns by the
                # gene nucleotide.
                row = nucl_list.index(shm.read_nucl)
                col = nucl_list.index(shm.gene_nucl)
                nucl_matrix[row][col] += 1
                num_shms += 1
    if num_shms == 0:
        log.info("Output contains no valid substitution SHMs. "
                 "Plot drawing was skipped")
        return
    nucl_matrix = [[float(count) / float(num_shms) for count in row]
                   for row in nucl_matrix]
    fig, ax = plt.subplots()
    sns.heatmap(nucl_matrix, cmap=plt.cm.Blues, xticklabels=nucl_list,
                yticklabels=nucl_list, square=True, ax=ax)
    ax.tick_params(labelsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12, rotation='horizontal')
    # NOTE(review): rows (y axis, labelled "From") hold the read nucleotide
    # and columns (x axis, labelled "To") the gene nucleotide -- confirm the
    # labels match the intended direction.
    plt.xlabel("To", fontsize=14)
    plt.ylabel("From", fontsize=14, rotation='horizontal')
    utils.output_figure(output_fname, "Nucleotide substitution heatmap", log)
def visualize_largest_group_aa_variability(labeling_df, region, region_name,
                                           output_fname, log):
    """Plot per-position hydrophobicity for the largest group of a region.

    Args:
        labeling_df: container indexed by ``region`` to get the sequences.
        region: key of the region column in ``labeling_df``.
        region_name: human-readable region name used in labels and messages.
        output_fname: passed through to ``utils.output_figure``.
        log: logger; ``log.info`` is called when the plot is skipped.

    Nothing is drawn when ``get_region_largest_group`` returns an empty
    group or when the length of the group's first sequence is not a
    multiple of three.
    """
    region_seq = list(labeling_df[region])
    max_group = get_region_largest_group(region_seq)
    if len(max_group) == 0:
        return
    group_len = len(max_group[0])
    if group_len % 3 != 0:
        # A length that is not a multiple of 3 means the region IS
        # out-of-frame; the previous message claimed the opposite.
        log.info("Largest " + region_name +
                 " is out-of-frame. Plot drawing was skipped")
        return
    aa_len = group_len // 3
    aa_dict = {'Position': [], 'Hydrophobicity': []}
    for cdr in max_group:
        # Translation stops at the first stop codon, so a translated
        # sequence may be shorter than aa_len.
        aa_seq = Seq(cdr).translate(to_stop=True)
        aa_row = [utils.hydrophoby_dict[aa] for aa in aa_seq]
        for pos, value in enumerate(aa_row, start=1):
            aa_dict['Position'].append(pos)
            aa_dict['Hydrophobicity'].append(value)
    plt.figure()
    sns.barplot(x='Position', y='Hydrophobicity', data=aa_dict,
                order=range(1, aa_len + 1), color='blue')
    plt.xlabel('Position (aa)', fontsize=14)
    plt.ylabel('Hydrophobicity', fontsize=14)
    # Pad the y range by 10 on both sides of the extreme scale values.
    plt.ylim(min(utils.hydrophoby_dict.values()) - 10,
             max(utils.hydrophoby_dict.values()) + 10)
    utils.output_figure(output_fname, region_name + " aa variability", log)
def visualize_largest_region_nucls(labeling_df, region, region_name,
                                   output_fname, log):
    """Draw overlaid nucleotide bars for the largest group of a region.

    The sequences under ``labeling_df[region]`` are reduced to their largest
    group by ``get_region_largest_group``; nothing is drawn if that group is
    empty. Four bar series are drawn on top of each other (A+C+G+T, C+G+T,
    G+T, T) so the visible part of each colour corresponds to one nucleotide.
    """
    sequences = list(labeling_df[region])
    largest_group = get_region_largest_group(sequences)
    if len(largest_group) == 0:
        return
    per_nucl = get_nucls_lists(largest_group)
    positions = np.array(range(0, len(largest_group[0])))
    position_labels = [str(i) for i in range(1, len(largest_group[0]) + 1)]
    # NOTE(review): the sums below presumably rely on element-wise "+"
    # (e.g. numpy arrays) -- confirm against get_nucls_lists.
    layers = [
        ("A", 'b',
         per_nucl['A'] + per_nucl['C'] + per_nucl['G'] + per_nucl['T']),
        ("C", 'g', per_nucl['C'] + per_nucl['G'] + per_nucl['T']),
        ("G", 'r', per_nucl['G'] + per_nucl['T']),
        ("T", 'orange', per_nucl['T']),
    ]
    sns.set_color_codes("muted")
    f, ax = plt.subplots(figsize=(15, 6))
    # Tallest series first so that later, shorter series stay visible.
    for layer_label, layer_color, heights in layers:
        sns.barplot(x=positions, y=heights, label=layer_label,
                    color=layer_color)
    plt.ylim(0, 115)
    ax.legend(ncol=4, loc="upper center", frameon=True, fontsize=16)
    plt.xlabel(region_name + ' position (nt)', fontsize=16)
    plt.ylabel('Nucleotide %', fontsize=16)
    plt.xticks(positions, position_labels, fontsize=14)
    plt.yticks(fontsize=14)
    utils.output_figure(output_fname,
                        region_name + " nucleotide distribution", log)
def output_shm_stats_for_isotype(num_shms, shm_pos, isotype, output_prefix,
                                 log):
    """Draw two stacked histograms for one isotype and output the figure.

    The upper panel is a histogram of ``shm_pos`` and the lower panel a
    histogram of ``num_shms``; both use 50 bins and the colour stored in
    ``isotype_colors[isotype]``. The figure goes to ``utils.output_figure``
    under the name ``output_prefix + "_" + isotype + "V"``.
    """
    plt.figure(1)

    # Upper panel: SHM positions.
    plt.subplot(211)
    hist_color = isotype_colors[isotype]
    plt.hist(shm_pos, color=hist_color, alpha=.75, bins=50)
    gene_label = isotype + "V"
    plt.xlabel("Relative position of " + gene_label + " SHM in read",
               fontsize=16)
    plt.ylabel("# SHMs", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title("SHMs in " + gene_label, fontsize=18)

    # Lower panel: number of SHMs per sequence.
    plt.subplot(212)
    plt.hist(num_shms, color=hist_color, bins=50, alpha=.75)
    plt.xlabel("# SHMs in " + gene_label, fontsize=16)
    plt.ylabel("# sequences", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    figure_name = output_prefix + "_" + gene_label
    figure_title = "Distribution of # SHMs in " + gene_label + " segments"
    utils.output_figure(figure_name, figure_title, log)
def visualize_length_abundance_dist(labeling_df, region, region_name,
                                    output_fname, log):
    """Draw a joint plot of abundance vs. length for repeated sequences.

    Args:
        labeling_df: container indexed by ``region`` to get the sequences.
        region: key of the region column in ``labeling_df``.
        region_name: human-readable region name used in the title/messages.
        output_fname: passed through to ``utils.output_figure``.
        log: logger; ``log.info`` is called when the plot is skipped.

    Sequences that occur exactly once are ignored. If no sequence occurs
    more than once there is nothing to plot and the function returns after
    logging a message.
    """
    region_counts = {}
    for seq in labeling_df[region]:
        region_counts[seq] = region_counts.get(seq, 0) + 1
    repeated = [(count, len(seq)) for seq, count in region_counts.items()
                if count != 1]
    if not repeated:
        log.info("No " + region_name + " occurs more than once. "
                 "Plot drawing was skipped")
        return
    abun = np.asarray([count for count, _ in repeated])
    lens = np.asarray([length for _, length in repeated])
    # jointplot creates its own figure, so no plt.subplots() call is made
    # here; the one made previously only left an unused empty figure behind.
    # NOTE(review): `size=` and positional x/y are the legacy seaborn API;
    # newer releases expect `height=` and keyword arguments -- confirm the
    # seaborn version this project pins before changing the call.
    sns.jointplot(abun, lens, size=6)
    utils.output_figure(
        output_fname,
        region_name + " joint distribution of abundances & lengths", log)
def visualize_indel_shm_lengths(shm_df, output_fname, log):
    """Draw a histogram of lengths of consecutive insertion/deletion SHMs.

    Deletions are merged into one run while their ``gene_pos`` grows by
    exactly one, insertions while their ``read_pos`` grows by exactly one.
    A series is plotted only if it has more than 10 runs; if neither series
    qualifies a message is logged and nothing is drawn.
    """
    deletion_runs, insertion_runs = [], []
    cur_del, cur_ins = 0, 0
    # Last seen positions; they are not reset between reads.
    last_gene_pos, last_read_pos = -1, -1
    for read_key in shm_df:
        for shm in shm_df[read_key]:
            if shm.is_deletion():
                if shm.gene_pos - last_gene_pos == 1:
                    cur_del += 1
                else:
                    # A gap closes the current run and starts a new one.
                    if cur_del > 0:
                        deletion_runs.append(cur_del)
                    cur_del = 1
                last_gene_pos = shm.gene_pos
            if shm.is_insertion():
                if shm.read_pos - last_read_pos == 1:
                    cur_ins += 1
                else:
                    if cur_ins > 0:
                        insertion_runs.append(cur_ins)
                    cur_ins = 1
                last_read_pos = shm.read_pos
    # Flush the runs still open when the data ends.
    if cur_ins != 0:
        insertion_runs.append(cur_ins)
    if cur_del != 0:
        deletion_runs.append(cur_del)

    series, series_labels = [], []
    if len(deletion_runs) > 10:
        series.append(deletion_runs)
        series_labels.append("Deletions")
    if len(insertion_runs) > 10:
        series.append(insertion_runs)
        series_labels.append("Insertions")
    if not series:
        log.info(
            "Output contains very low number of indel SHMs. Plot drawing was skipped"
        )
        return

    plt.hist(series, label=series_labels, bins=50)
    plt.legend(loc='upper center', ncol=len(series), fontsize=14)
    plt.xlabel("Insertion / deletion SHM length", fontsize=16)
    plt.ylabel("# insertion / deletion SHMs", fontsize=16)
    # The x range covers the longest run of either kind, plotted or not.
    longest_run = max([0] + deletion_runs + insertion_runs)
    plt.xlim(.5, longest_run + .5)
    plt.xticks(range(0, longest_run + 1), fontsize=14)
    plt.yticks(fontsize=14)
    utils.output_figure(output_fname,
                        "Distribution of insertion/deletion SHM lengths", log)
def visualize_special_shm_positions(shm_df, syn_output_fname,
                                    special_output_fname, log):
    """Plot relative read positions of synonymous and indel SHMs.

    Args:
        shm_df: container that yields keys when iterated and returns an
            iterable of SHM objects when indexed by such a key. Keys must
            provide ``is_variable()`` and ``read_len``.
        syn_output_fname: output name for the synonymous SHM histogram.
        special_output_fname: output name for the indel SHM histogram.
        log: logger; ``log.info`` is called when the indel plot is skipped.

    Only keys for which ``is_variable()`` is true contribute. Each SHM is
    put into the first matching class: synonymous, stop codon, deletion,
    insertion. Synonymous positions go to ``output_synonymous_shms``;
    deletions and insertions are plotted together, each only if it has more
    than 10 entries.
    """
    synonymous_pos = []
    stop_codon_pos = []  # collected but currently not plotted
    deletion_pos = []
    insertion_pos = []
    for it in shm_df:
        # The check depends only on the key, so it is done once per read.
        if not it.is_variable():
            continue
        for shm in shm_df[it]:
            relative_pos = float(shm.read_pos) / float(it.read_len)
            if shm.synonymous:
                synonymous_pos.append(relative_pos)
            elif shm.to_stop_codon:
                stop_codon_pos.append(relative_pos)
            elif shm.is_deletion():
                deletion_pos.append(relative_pos)
            elif shm.is_insertion():
                insertion_pos.append(relative_pos)
    output_synonymous_shms(synonymous_pos, syn_output_fname, log)

    pos = []
    labels = []
    colors = []
    if len(deletion_pos) > 10:
        pos.append(deletion_pos)
        labels.append('Deletions')
        colors.append('b')
    if len(insertion_pos) > 10:
        pos.append(insertion_pos)
        labels.append('Insertions')
        colors.append('g')
    if not pos:
        log.info(
            "Output contains very low number of special SHMs. Plot drawing will be skipped"
        )
        return
    # The figure is created only once there is something to draw, so the
    # early return above does not leave an empty figure open.
    plt.figure(figsize=(12, 9))
    # Floor division keeps the bin count an integer on Python 3 as well.
    plt.hist(pos, color=colors, label=labels, bins=100 // len(pos))
    plt.xlim(0, .75)
    plt.legend(loc='upper center', ncol=len(pos), fontsize=12,
               bbox_to_anchor=(0.5, -0.07))
    plt.xlabel("Relative position of V SHM in read", fontsize=14)
    plt.ylabel("# SHMs", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    utils.output_figure(special_output_fname,
                        "Distribution of indel V SHM positions in read", log)
def visualize_region_lengths(labeling_df, region, region_name, output_fname,
                             log):
    """Draw a histogram of region lengths.

    Sequences under ``labeling_df[region]`` whose length is 0 or 1 are left
    out of the histogram.
    """
    lengths = []
    for seq in list(labeling_df[region]):
        seq_len = len(seq)
        if seq_len > 1:
            lengths.append(seq_len)
    plt.figure()
    plt.hist(lengths)
    x_label = region_name + ' length (nt)'
    y_label = '# ' + region_name + 's'
    plt.xlabel(x_label, fontsize=14)
    plt.ylabel(y_label, fontsize=14)
    utils.output_figure(output_fname, region_name + " length distribution",
                        log)
def output_shm_stats_for_isotype(num_shms, locus, output_fname, log):
    """Draw a 50-bin histogram of ``num_shms`` for one locus.

    The bar colour is ``isotype_colors[locus]``; the figure is handed to
    ``utils.output_figure`` under ``output_fname``.
    """
    plt.figure()
    hist_color = isotype_colors[locus]
    plt.hist(num_shms, color=hist_color, bins=50, alpha=.75)
    gene_label = locus + "V"
    plt.xlabel("# SHMs in " + gene_label, fontsize=16)
    plt.ylabel("# sequences", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title('# SHMs in ' + gene_label + ' sequences', fontsize=14)
    figure_title = "Distribution of # SHMs in " + gene_label + " segments"
    utils.output_figure(output_fname, figure_title, log)
def OutputHeatmap(self, output_fname, log):
    """Draw ``self.abundant_vj_matrix`` as a heatmap and output the figure.

    Columns are labelled with ``self.sorted_js`` and rows with
    ``self.used_vs``.
    """
    plt.figure(figsize=(10, 15))
    matrix = np.array(self.abundant_vj_matrix)
    sns.heatmap(matrix,
                xticklabels=self.sorted_js,
                yticklabels=self.used_vs,
                cmap='jet')
    plt.yticks(rotation=0, fontsize=10)
    plt.xticks(rotation=90, fontsize=10)
    figure_title = "VJ heatmap for the most abundant VJ combinations"
    utils.output_figure(output_fname, figure_title, log)
def output_synonymous_shms(synonymous_pos, output_fname, log):
    """Draw a 100-bin histogram of synonymous SHM positions.

    Nothing is drawn when fewer than 100 positions are given. The histogram
    is drawn on the current matplotlib figure; no new figure is created.
    """
    enough_data = len(synonymous_pos) >= 100
    if not enough_data:
        return
    plt.hist(synonymous_pos, color='r', bins=100)
    plt.xlabel("Relative position of V SHM in read", fontsize=14)
    plt.ylabel("#SHMs", fontsize=14)
    plt.xlim(0, .75)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    figure_title = "Distribution of synonymous SHM positions in V segment"
    utils.output_figure(output_fname, figure_title, log)
def OutputJUsage(self, output_fname, log):
    """Draw a bar chart with the percentage of sequences per J gene.

    For every gene in ``self.sorted_js`` the bar height is
    ``self.j_dict[gene]`` divided by ``len(self.vj_df)``, times 100.
    """
    plt.figure(figsize=(10, 8))
    percentages = []
    for j_gene in self.sorted_js:
        share = float(self.j_dict[j_gene]) / len(self.vj_df) * 100
        percentages.append(share)
    tick_positions = range(len(self.sorted_js))
    plt.bar(tick_positions, percentages)
    plt.xticks(tick_positions, self.sorted_js, rotation=90, fontsize=10)
    plt.ylabel('% of sequences', fontsize=14)
    utils.output_figure(output_fname, "Usage of J genes", log)
def visualize_region_lengths(labeling_df, region, region_name, output_fname,
                             log):
    """Draw the region length distribution with seaborn.

    Sequences under ``labeling_df[region]`` whose length is 0 or 1 are left
    out. The x axis is fixed to the range 0..100.
    """
    lengths = []
    for seq in list(labeling_df[region]):
        seq_len = len(seq)
        if seq_len > 1:
            lengths.append(seq_len)
    f, ax = plt.subplots(figsize=(8, 8))
    sns.distplot(lengths, kde=False, rug=False)
    x_label = region_name + ' length'
    y_label = '# ' + region_name + 's'
    plt.xlabel(x_label, fontsize=16)
    plt.ylabel(y_label, fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlim(0, 100)
    utils.output_figure(output_fname, region_name + " length distribution",
                        log)
def visualize_largest_group_aa_variability(labeling_df, region, region_name,
                                           output_fname, log):
    """Plot the most abundant amino acid at each position of a region.

    Args:
        labeling_df: container indexed by ``region`` to get the sequences.
        region: key of the region column in ``labeling_df``.
        region_name: human-readable region name used in labels and messages.
        output_fname: passed through to ``utils.output_figure``.
        log: logger; ``log.info`` is called when the plot is skipped.

    The sequences of the largest group (``get_region_largest_group``) are
    translated up to the first stop codon. For each amino-acid position the
    most frequent residue and its share (in %) among the sequences covering
    that position are plotted, one bar colour per residue. Nothing is drawn
    when the group is empty or its sequence length is not a multiple of 3.
    """
    region_seq = list(labeling_df[region])
    max_group = get_region_largest_group(region_seq)
    if len(max_group) == 0:
        return
    group_len = len(max_group[0])
    if group_len % 3 != 0:
        # A length that is not a multiple of 3 means the region IS
        # out-of-frame; the previous message claimed the opposite.
        log.info("Largest " + region_name +
                 " is out-of-frame. Plot drawing was skipped")
        return
    aa_seqs = [Seq(cdr).translate(to_stop=True) for cdr in max_group]
    # Floor division keeps the position count an integer on Python 3.
    aa_counts = [dict() for _ in range(group_len // 3)]
    for aa_seq in aa_seqs:
        for i, aa in enumerate(aa_seq):
            aa_counts[i][aa] = aa_counts[i].get(aa, 0) + 1

    aa_large_abun = []
    aa_large_acid = []
    for counts in aa_counts:
        if not counts:
            # Every sequence hit a stop codon before this position: keep
            # the slot so tick positions stay aligned, but draw no bar.
            aa_large_abun.append(0.0)
            aa_large_acid.append('')
            continue
        top_aa, top_count = max(counts.items(), key=operator.itemgetter(1))
        total = sum(counts.values())
        aa_large_abun.append(float(top_count) / float(total) * 100)
        aa_large_acid.append(top_aa)

    aa_colors = get_aa_colors()
    positions = list(range(len(aa_large_abun)))
    for aa in set(aa_large_acid):
        if not aa:
            continue
        # One bar series per residue: non-zero only where it is dominant.
        abun = [share if acid == aa else 0
                for share, acid in zip(aa_large_abun, aa_large_acid)]
        df = pd.DataFrame({'x': positions, 'y': abun})
        sns.barplot(x='x', y='y', data=df,
                    color=aa_colors[amino_acids.index(aa)])
    plt.xticks(range(0, len(aa_large_abun)), aa_large_acid, fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel('The most abundant amino acid', fontsize=16)
    plt.ylabel('% ' + region_name + 's', fontsize=16)
    utils.output_figure(output_fname, region_name + " aa variability", log)
def OutputGeneMutability(gene_mutability_dict, output_fname, gene_type, log):
    """Draw a per-gene box plot of mutability values.

    Args:
        gene_mutability_dict: maps a gene to an iterable of mutability values.
        output_fname: passed through to ``utils.output_figure``.
        gene_type: string used in the figure title and the skip message.
        log: logger; ``log.info`` is called when the plot is skipped.

    If there are no values at all the plot is skipped; previously ``max()``
    was called on an empty list in that case.
    """
    df_dict = {'Gene': [], 'Mutability': []}
    for gene in gene_mutability_dict:
        for m in gene_mutability_dict[gene]:
            df_dict['Gene'].append(gene)
            df_dict['Mutability'].append(m)
    if not df_dict['Mutability']:
        log.info("No mutability values for " + gene_type +
                 " genes. Plot drawing was skipped")
        return
    plt.figure(figsize=(10, 8))
    sns.boxplot(x='Gene', y='Mutability', data=df_dict)
    # The upper y limit is never below 0.55.
    max_mutability = max(0.55, max(df_dict['Mutability']))
    plt.ylim(-0.05, max_mutability)
    plt.xticks(rotation=90)
    plt.ylabel('Mutability')
    utils.output_figure(output_fname, "Mutability of " + gene_type + ' genes',
                        log)
def OutputGeneSHMPlot(gene_shms, gene_name, gene_length, num_aligned_seqs,
                      output_fname, log):
    """Plot, per gene position, the fraction of sequences with a substitution.

    Args:
        gene_shms: iterable of SHM objects for one gene.
        gene_name: string used in the title and messages.
        gene_length: number of positions in the gene.
        num_aligned_seqs: denominator for the plotted fractions.
        output_fname: passed through to ``utils.output_figure``.
        log: logger; ``log.info`` is called when the plot is skipped.

    Returns:
        dict mapping 'A', 'C', 'G', 'T' to a list of length ``gene_length``
        with the number of substitutions to that nucleotide per position.

    Only SHMs for which ``is_substitution()`` is true and whose read
    nucleotide passes ``nucl_is_valid`` are counted. When
    ``num_aligned_seqs`` is 0 the counts are returned without plotting,
    because the fractions cannot be computed.
    """
    nucl_dict = {
        'A': [0] * gene_length,
        'C': [0] * gene_length,
        'G': [0] * gene_length,
        'T': [0] * gene_length
    }
    for shm in gene_shms:
        if not shm.is_substitution() or not nucl_is_valid(shm.read_nucl):
            continue
        nucl_dict[shm.read_nucl][shm.gene_pos] += 1
    if num_aligned_seqs == 0:
        log.info("No sequences were aligned to " + gene_name +
                 ". Plot drawing was skipped")
        return nucl_dict
    x = range(gene_length)
    plt.figure()
    # Bars are drawn on top of each other from the tallest cumulative sum
    # (A+C+G+T) down to T alone, so each colour's visible part corresponds
    # to one nucleotide.
    stack_order = [('A', 'blue'), ('C', 'green'), ('G', 'red'),
                   ('T', 'orange')]
    for k, (nucl, color) in enumerate(stack_order):
        layers = [nucl_dict[name] for name, _ in stack_order[k:]]
        heights = [float(sum(column)) / num_aligned_seqs
                   for column in zip(*layers)]
        plt.bar(x, heights, color=color, label=nucl)
    plt.legend(loc='upper center', ncol=4)
    plt.ylim(0, 1.1)
    plt.xlabel('Position in V gene', fontsize=14)
    plt.ylabel('Fraction of sequences', fontsize=14)
    plt.title(
        str(num_aligned_seqs) + ' sequences were aligned to ' + gene_name)
    utils.output_figure(output_fname, "SHM position in " + gene_name, log)
    return nucl_dict
def output_shms_pos(all_shms_pos, colors, output_prefix, log):
    """Draw one histogram of SHM positions per isotype.

    Isotypes with fewer than 10 positions are skipped. Each remaining
    isotype gets a 100-bin histogram that is handed to
    ``utils.output_figure`` under
    ``output_prefix + "_" + isotype + "V_pos"``.
    """
    for isotype in all_shms_pos:
        positions = all_shms_pos[isotype]
        if len(positions) < 10:
            continue
        plt.hist(positions, bins=100, color=colors[isotype], alpha=.75)
        gene_label = isotype + "V"
        plt.xlabel("#SHM in " + gene_label + " gene segment", fontsize=16)
        plt.ylabel("# sequences", fontsize=16)
        plt.xticks(fontsize=14)
        plt.yticks(fontsize=14)
        plt.xlim(0, .75)
        figure_name = output_prefix + "_" + gene_label + "_pos"
        figure_title = ("Distribution of SHM relative positions in " +
                        gene_label + " segments")
        utils.output_figure(figure_name, figure_title, log)
def output_num_shms(num_all_shms, colors, output_prefix, log):
    """Draw a joint histogram of SHM counts for all non-empty isotypes.

    Args:
        num_all_shms: maps an isotype to a sequence of SHM counts.
        colors: maps an isotype to its histogram colour.
        output_prefix: the figure name is ``output_prefix + "_shms_number"``.
        log: logger; ``log.info`` is called when the plot is skipped.

    Isotypes with no counts are left out. If none has counts the plot is
    skipped, because the histogram and the legend (``ncol=0``) cannot be
    drawn without any data set.
    """
    pos = []
    labels = []
    cols = []
    for isotype in num_all_shms:
        if len(num_all_shms[isotype]) > 0:
            pos.append(num_all_shms[isotype])
            labels.append(str(isotype))
            cols.append(colors[isotype])
    if not pos:
        log.info("Output contains no SHM counts. Plot drawing was skipped")
        return
    plt.hist(pos, bins=50, color=cols, alpha=.75, label=labels)
    plt.legend(loc='upper center', ncol=len(pos), fontsize=16)
    plt.xlabel("# of SHM in V gene segment", fontsize=16)
    plt.ylabel("# SHMs", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlim(0, 150)
    output_fname = output_prefix + "_shms_number"
    utils.output_figure(output_fname, "Distribution of # SHMs in V segments",
                        log)
def visualize_vj_heatmap(labeling_df, output_pdf, log):
    """Draw a heatmap of the most abundant VJ combinations.

    V and J hits are read from the 'V_hit' and 'J_hit' columns of
    ``labeling_df``. If either column is empty a message is logged and
    nothing is drawn. Note that the global matplotlib font size is set to 20
    as a side effect.
    """
    v_hits = list(labeling_df['V_hit'])
    j_hits = list(labeling_df['J_hit'])
    if not v_hits or not j_hits:
        log.info("VJ data-frame contains 0 records. VJ usage visualization will be skipped")
        return
    vj_matrix = VJMatrix(v_hits, j_hits)
    log.info("%d VJ pairs were extracted. Pairs are presented by %d V genes & %d J genes"
             % (len(vj_matrix.vj_dict), len(vj_matrix.v_set),
                len(vj_matrix.j_set)))
    table, v, j = vj_matrix.CreateTable(100)
    mplt.rcParams.update({'font.size': 20})
    f, ax = plt.subplots(figsize=(10, 15))
    sns.heatmap(table, cmap=plt.cm.jet, xticklabels=v, yticklabels=j, ax=ax)
    ax.tick_params(labelsize=16)
    # x ticks sit on integer positions, y ticks are shifted by half a cell.
    x_ticks = [float(idx) for idx in range(len(v))]
    y_ticks = [idx + .5 for idx in range(len(j))]
    plt.xticks(x_ticks, v, rotation=60, fontsize=14)
    plt.yticks(y_ticks, j, rotation='horizontal', fontsize=14)
    utils.output_figure(output_pdf,
                        "VJ heatmap for the most abundant VJ combinations",
                        log)
def visualize_aa_substitution_matrix(shms_df, output_fname, log):
    """Draw a heatmap of amino-acid substitution frequencies.

    Args:
        shms_df: container that yields keys when iterated and returns an
            iterable of SHM objects when indexed by such a key.
        output_fname: passed through to ``utils.output_figure``.
        log: passed through to ``utils.output_figure``.

    Returns:
        Square list-of-lists matrix ordered as ``get_aa_list()``; rows are
        indexed by the read amino acid, columns by the gene amino acid, and
        each cell is the pair count divided by the number of counted SHMs.

    An SHM is considered only if its read position falls into a different
    codon (``read_pos // 3``) than the previous SHM of the same read, and it
    is counted only if both amino acids pass ``aa_is_valid``.
    """
    dict_aa = dict()
    num_shms = 0
    for it in shms_df:
        prev_pos = -1
        for shm in shms_df[it]:
            # Floor division compares codon indices; with "/" on Python 3
            # the two sides are floats and only match for equal positions.
            if prev_pos // 3 != shm.read_pos // 3:
                if aa_is_valid(shm.gene_aa) and aa_is_valid(shm.read_aa):
                    aa_pair = shm.gene_aa + shm.read_aa
                    dict_aa[aa_pair] = dict_aa.get(aa_pair, 0) + 1
                    num_shms += 1
            prev_pos = shm.read_pos
    aa_list = get_aa_list()
    aa_freq = [[0] * len(aa_list) for _ in aa_list]
    for aa_pair, count in dict_aa.items():
        # aa_pair is gene amino acid followed by read amino acid.
        row = aa_list.index(aa_pair[1])
        col = aa_list.index(aa_pair[0])
        # num_shms is positive here: dict_aa is non-empty only if an SHM
        # was counted.
        aa_freq[row][col] = float(count) / float(num_shms)
    fig, ax = plt.subplots()
    sns.heatmap(aa_freq, cmap=plt.cm.jet, xticklabels=aa_list,
                yticklabels=aa_list, square=True, ax=ax)
    ax.tick_params(labelsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12, rotation='horizontal')
    plt.xlabel("To", fontsize=14)
    plt.ylabel("From", fontsize=14, rotation='horizontal')
    utils.output_figure(output_fname, "Amino acid substitution heatmap", log)
    return aa_freq
def OutputFractionOfSynonymousSHMs(shm_df, output_fname, log):
    """Draw a histogram of per-read fractions of synonymous SHMs.

    For every key of ``shm_df`` the share of SHMs with a true ``synonymous``
    attribute is computed (0 for reads without SHMs). Keys for which
    ``is_variable()`` is true go to the 'V gene' series, all others to the
    'J gene' series.
    """
    v_fractions, j_fractions = [], []
    for read_key in shm_df:
        read_shms = shm_df[read_key]
        synonymous_count = sum(1 for shm in read_shms if shm.synonymous)
        total_count = len(read_shms)
        if total_count != 0:
            fraction = float(synonymous_count) / total_count
        else:
            fraction = 0
        target = v_fractions if read_key.is_variable() else j_fractions
        target.append(fraction)
    plt.hist([v_fractions, j_fractions], label=['V gene', 'J gene'])
    plt.xlabel('Fraction of synonymous SHMs', fontsize=14)
    plt.ylabel('# sequences', fontsize=14)
    plt.legend(loc='upper right', fontsize=14)
    utils.output_figure(output_fname,
                        "Fractions of synonymous SHMs in V and J genes", log)