def addplot(oindexlist, ofracslist, n_seqs, fname, title):
    """Histogram the occurrence fractions and write the plot to ``plotdir``.

    This is a closure: ``self``, ``plotdir``, ``fnames`` and
    ``restrict_to_region`` are read from the enclosing scope.

    Args:
        oindexlist: passed through to ``utils.fay_wu_h`` as ``occurence_indices``.
        ofracslist: iterable whose items are each passed to ``Hist.fill``.
        n_seqs: number of sequences; shown on the plot and passed to ``utils.fay_wu_h``.
        fname: name handed to ``mpl_finish`` and recorded with ``self.addfname``.
        title: plot title.
    """
    # 30 bins spanning [0, 1].
    frac_hist = Hist(30, 0., 1.)
    for fracs in ofracslist:
        frac_hist.fill(fracs)

    _fig, axis = self.plotting.mpl_init()
    frac_hist.mpl_plot(axis, remove_empty_bins=True)

    # Both annotations share the same font settings and x position; the y
    # position is a fraction of the current upper y limit (re-read each time).
    label_style = dict(fontsize=20, fontweight='bold')
    size_y = 0.8 * axis.get_ylim()[1]
    axis.text(0.65, size_y, 'size: %d' % n_seqs, **label_style)

    h_y = 0.7 * axis.get_ylim()[1]
    h_value = utils.fay_wu_h(
        line=None,
        restrict_to_region=restrict_to_region,
        occurence_indices=oindexlist,
        n_seqs=n_seqs,
    )
    axis.text(0.65, h_y, 'h: %.2f' % h_value, **label_style)

    # Prefix the axis labels with the region name when one is set.
    if restrict_to_region is not None:
        region_prefix = restrict_to_region + ' '
    else:
        region_prefix = ''

    self.plotting.mpl_finish(
        axis,
        plotdir,
        fname,
        title=title,
        xlabel=region_prefix + 'mutation frequency',
        ylabel=region_prefix + 'density of mutations',
        xticks=[0, 1],  # alternative considered by the author: [min(occurence_fractions), max(occurence_fractions)]
        log='',
    )
    self.addfname(fnames, fname)
def print_stuff(line):
    """Print a one-row summary of the cluster annotation ``line``.

    Reads ``cluster``, ``sorted_clusters`` and ``n_total`` from the enclosing
    scope, so it must be called while ``cluster`` is bound to the key of the
    cluster that ``line`` belongs to.

    Columns, in order: index of the cluster in ``sorted_clusters``; V, D and
    J genes; number of unique ids; mean and median of ``n_mutations``; mean
    of ``mut_freqs``; ``len(cluster) / n_total``; ``cdr3_length / 3``; the
    translated naive CDR3; and ``utils.fay_wu_h(line)``.
    """
    size_rank = sorted_clusters.index(cluster)

    # Original author's note: the CDR3 nt sequence runs from the first base of
    # the cysteine through the last base of the tryptophan. Only the naive
    # sequence is used here; the iseq=0 mutated sequence is discarded.
    naive_cdr3, _iseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3')

    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    # Highlight the CDR3 in red when any unique id contains '-ig'.
    for uid in line['unique_ids']:
        if '-ig' in uid:
            cdr3_aa = utils.color('red', cdr3_aa, width=30)
            break

    row = (
        size_rank,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        float(len(cluster)) / n_total,
        (line['cdr3_length'] / 3),
        cdr3_aa,
        utils.fay_wu_h(line, debug=False),
    )
    # Single-argument parenthesised print: same output as the print statement.
    print('%4s %s %s %s %5d %5d %5d %7.3f %8.4f %2d %s %4.2f' % row)
def print_stuff(line):
    """Print a detailed report for ``line`` when its naive CDR3 matches ``args.cdr3``.

    Reads ``cluster``, ``sorted_clusters``, ``n_total``, ``args`` and
    ``getkey`` from the enclosing scope. Nothing is printed when
    ``args.cdr3`` is not a substring of the (possibly coloured) translated
    naive CDR3.

    NOTE(review): the source indentation was lost; every print after the
    ``--cdr3`` check is treated as belonging to that check. Confirm against
    the original file.
    """
    cluster_index = sorted_clusters.index(cluster)

    # Original author's note: returns the CDR3 nt sequence for the naive
    # sequence and for the first mutated sequence (iseq 0); CDR3 = first base
    # of cysteine through last base of tryptophan. Only the naive one is used.
    naive_cdr3, _iseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3')
    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()

    # If a cluster contains one of our seed seqs ('-ig' in the id), colour this CDR3 red.
    has_seed = any('-ig' in uid for uid in line['unique_ids'])
    if has_seed:
        cdr3_aa = utils.color('red', cdr3_aa, width=30)

    # Only report clusters whose naive CDR3 matches the --cdr3 argument.
    if args.cdr3 not in cdr3_aa:
        return

    print('index genes size n muts SHM rep frac CDR3 FayWuH')
    print(' mean med len seq')
    row = (
        cluster_index,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        float(len(cluster)) / n_total,
        (line['cdr3_length']/3),
        cdr3_aa,
        utils.fay_wu_h(line, debug=False),
    )
    print('%4s %s %s %s %5d %5d %5d %7.3f %8.4f %2d %s %4.2f' % row)

    # The print statement separates its items with one space; the joins below
    # reproduce that exactly while staying a single-argument print.
    print(' '.join([str(len(line['naive_seq'])), 'length of naive seq']))
    print(' '.join(['unique_ids: ', str(getkey(line['unique_ids']))]))
    print('')
    # NOTE(review): if print_reco_event prints directly and returns None, this
    # also prints the text "None" — confirm whether the outer print is wanted.
    print(utils.print_reco_event(line))
def _rank_points(rank):
    """Return the points for a zero-based position in a ranked cluster list.

    Positions 0-24 earn 4 points, 25-50 earn 3, 51-75 earn 2, 76-100 earn 1
    and anything beyond earns 0.
    """
    if rank < 25:
        return 4
    for upper, points in ((50, 3), (75, 2), (100, 1)):
        if rank <= upper:
            return points
    return 0


def _fay_wu_points(h):
    """Return the points for a Fay-Wu H value: <= -20 -> 4, <= -10 -> 3, <= 0 -> 2, <= 10 -> 1, else 0."""
    for upper, points in ((-20, 4), (-10, 3), (0, 2), (10, 1)):
        if h <= upper:
            return points
    return 0


def print_stuff(line):
    """Score the cluster annotation ``line`` and print a one-row summary.

    Reads ``cluster``, ``sorted_clusters``, ``shm_clusters``, ``n_total`` and
    the four gene lists (``cd4bs_genes``, ``glycan_genes``,
    ``bridging_genes``, ``mper_genes``) from the enclosing scope, so it must
    be called while ``cluster`` is bound to the key of ``line``'s cluster.

    The score is the sum of four parts: position in ``sorted_clusters``,
    position in ``shm_clusters``, the Fay-Wu H value, and 4 extra points
    when the V gene (the part before ``'*'``) appears in any of the four
    gene lists.

    Raises:
        ValueError: if ``cluster`` is missing from ``sorted_clusters`` or
            ``shm_clusters`` (raised by ``list.index``).
    """
    cluster_index = sorted_clusters.index(cluster)
    shm_index = shm_clusters.index(cluster)

    # Original author's note: the CDR3 nt sequence runs from the first base of
    # the cysteine through the last base of the tryptophan. Only the naive
    # sequence is used here.
    naive_cdr3, _iseq0_cdr3 = utils.subset_sequences(line, iseq=0, restrict_to_region='cdr3')
    cdr3_aa = '%-30s' % Seq(naive_cdr3).translate()
    # Highlight the CDR3 in red when any unique id contains '-ig'.
    if any('-ig' in uid for uid in line['unique_ids']):
        cdr3_aa = utils.color('red', cdr3_aa, width=30)

    # Computed once and reused for both the score and the printed column.
    fay_wu_h = utils.fay_wu_h(line, debug=False)

    intscore = _rank_points(cluster_index)  # cluster size rank
    intscore += _rank_points(shm_index)  # SHM rank
    intscore += _fay_wu_points(fay_wu_h)  # SFS

    # Score by bnAb gene usage. The previous `x in (a or b or c or d)` only
    # ever searched the first non-empty list, because `or` returns its first
    # truthy operand; every list is searched now.
    # Beware: this does not include the CDR3 length of bnAb VH genes.
    v_gene_name = line['v_gene'].split('*')[0]
    bnab_gene_lists = (cd4bs_genes, glycan_genes, bridging_genes, mper_genes)
    if any(v_gene_name in genes for genes in bnab_gene_lists if genes):
        intscore += 4

    row = (
        intscore,
        cluster_index,
        utils.color_gene(line['v_gene'], width=15),
        utils.color_gene(line['d_gene'], width=15),
        utils.color_gene(line['j_gene'], width=10),
        len(line['unique_ids']),
        numpy.mean(line['n_mutations']),
        numpy.median(line['n_mutations']),
        numpy.mean(line['mut_freqs']),
        float(len(cluster)) / n_total,
        (line['cdr3_length'] / 3),
        cdr3_aa,
        fay_wu_h,
    )
    # Single-argument parenthesised print: same output as the print statement.
    print('%4s %4s %s %s %s %5d %5d %5d %7.3f %8.4f %2d %s %4.2f' % row)
# Rank every cluster key by its number of unique ids, largest first.
sorted_clusters = list(annotations)
sorted_clusters.sort(key=lambda ckey: len(annotations[ckey]['unique_ids']), reverse=True)
# Disabled filter (checks if the cluster contains ANY non-functional sequences):
#### sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])]

# Total size of the repertoire (number of sequences).
# NOTE(review): this takes len() of each cluster *key*, whereas the ranking
# above uses len(annotations[key]['unique_ids']) — confirm the two agree for
# the way `annotations` is keyed.
n_total = sum(len(ckey) for ckey in sorted_clusters)

# Further rankings are restricted to the 100 biggest clusters.
biggest_clusters = sorted_clusters[:100]
# Rank by SHM: highest mean mutation frequency first.
shm_clusters = sorted(
    biggest_clusters,
    key=lambda ckey: numpy.mean(annotations[ckey]['mut_freqs']),
    reverse=True,
)
# Rank by SFS score: lowest Fay-Wu H first.
sfs_clusters = sorted(
    biggest_clusters,
    key=lambda ckey: utils.fay_wu_h(annotations[ckey], debug=False),
)

# Fay-Wu H per cluster. The loop variable stays named `cluster` because
# print_stuff() looks that name up in the enclosing scope.
cluster_sfses = {}
for cluster in biggest_clusters:
    cluster_sfses[cluster] = utils.fay_wu_h(annotations[cluster], debug=False)

# Summary statistics of the Fay-Wu H values: mean, standard deviation, then
# the 5th, 10th, 50th, 80th and 90th percentiles, one value per line.
sfs_values = list(cluster_sfses.values())
print(numpy.mean(sfs_values))
print(numpy.std(sfs_values))
for pct in (5, 10, 50, 80, 90):
    print(numpy.percentile(sfs_values, pct))

# TODO: create a function that gives the score — it calls a subfunction for
# each metric (i.e. percentile); the superfunction can then weight the metrics.
# TODO: give 0 points to anyone not in the top 30 percentile.
# Sort by size: rank every cluster key by its number of unique ids, largest first.
sorted_clusters = list(annotations)
sorted_clusters.sort(key=lambda ckey: len(annotations[ckey]['unique_ids']), reverse=True)
# Disabled filter (checks if the cluster contains ANY non-functional sequences):
# sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])]

# NOTE(review): this takes len() of each cluster *key*, whereas the ranking
# above uses len(annotations[key]['unique_ids']) — confirm the two agree for
# the way `annotations` is keyed.
n_total = sum(len(ckey) for ckey in sorted_clusters)

# Further rankings are restricted to the 100 biggest clusters.
biggest_clusters = sorted_clusters[:100]
# Highest mean mutation frequency first.
shm_clusters = sorted(
    biggest_clusters,
    key=lambda ckey: numpy.mean(annotations[ckey]['mut_freqs']),
    reverse=True,
)
# Lowest Fay-Wu H first.
sfs_clusters = sorted(
    biggest_clusters,
    key=lambda ckey: utils.fay_wu_h(annotations[ckey], debug=False),
)

# ANSI escape sequences wrapped around each section banner.
banner_on = '\x1b[1;32;40m'
banner_off = '\x1b[0m'

# Cluster size: print the biggest clusters. The loop variable must stay
# named `cluster` because print_stuff() looks that name up in the enclosing scope.
print(banner_on + ' printing the largest clusters' + banner_off)
for cluster in sorted_clusters[:5]:
    print_stuff(annotations[cluster])

# High mean %SHM: print the most mutated clusters among the 100 biggest.
# The count comes from the `nclust` argument.
mutclust = int(args.nclust)
print(banner_on + ' printing the most mutated clusters (within 100 biggest)' + banner_off)
for cluster in shm_clusters[:mutclust]:
    print_stuff(annotations[cluster])