def status_plot(bams):
    '''
    Brief: Generate one histogram per locus in _MSI_LOCI showing the average
        length of the MS region, grouped by the annotated MSI status of each
        BAM file
    Args:
        bams: list of BAM file paths; the sample name is the file name with
            'A.bam' removed and is looked up in _ANNOTATIONS
    Return: none (each figure is saved as a PNG under the hard-coded
        status_corr_dist/subsetA directory, then the figure is cleared)
    '''
    for locus in _MSI_LOCI:
        msi_avgs = []
        mss_avgs = []
        for bam in bams:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            # Samples without an annotation cannot be placed in either group;
            # skipping them also guarantees `status` is always freshly bound.
            if bam_name not in _ANNOTATIONS:
                continue
            status = _ANNOTATIONS[bam_name]
            if status not in ('MSI', 'MSS'):
                continue
            average = avg_length(count_reads(bam, locus))
            # Skip samples for which avg_length returned its sentinel string.
            if average == 'Insufficient reads':
                continue
            if status == 'MSI':
                msi_avgs.append(float(average))
            else:
                mss_avgs.append(float(average))
        # Only draw a figure when at least one sample contributed a value.
        if msi_avgs or mss_avgs:
            plt.hist([msi_avgs, mss_avgs],
                     color=['red', 'blue'],
                     label=['MSI', 'MSS'])
            plt.title('%s Distribution (subset)' % locus)
            plt.legend(loc='best')
            # xlabel must be *called*; assigning to it would overwrite the
            # pyplot function and leave the axis unlabeled.
            plt.xlabel('Average MS length (bp)')
            plt.ylabel('Number of BAM files')
            saveloc = '/home/upload/msi_project/status_corr_dist/subsetA/%s_dist.png' % locus
            plt.savefig(saveloc)
            # Clear the figure so the next locus starts from an empty canvas.
            plt.clf()
def print_mm_depth(bams, mismatch=2, length=7):
    '''
    Brief: Write a tab-separated table used for tuning the flank length and
        mismatch parameters. For every BAM and every locus in _MSI_LOCI it
        records the two flank mismatch values returned by count_reads and
        the percentage of reads that were accepted
    Args:
        bams: list of BAM file paths
        mismatch: int, forwarded to count_reads as flank_mismatch
        length: int, forwarded to count_reads as flank_length
    Returns: none (output goes to a hard-coded file whose name embeds
        mismatch and length)
    '''
    outfile = '/home/upload/msi_project/mm_depth_analysis/subsetA-mismatch_depth-%d-%d.txt' % (
        mismatch, length)
    with open(outfile, 'w') as out:
        out.write('#mismatches: %d, flank length: %d\n' % (mismatch, length))

        # First header row: each locus name spans three columns.
        out.write('\t')
        out.write(''.join(locus + '\t\t\t' for locus in _MSI_LOCI))

        # Second header row: the three column titles, repeated per locus.
        out.write('\n\t')
        out.write('f1 mm\tf2mm\t% accepted reads\t' * len(_MSI_LOCI))
        out.write('\n')

        for bam in bams:
            sample = bam.split('/')[-1].replace('.bam', '')
            out.write(sample + '\t')
            for locus in _MSI_LOCI:
                accepted, f1_mm, f2_mm, total = count_reads(
                    bam,
                    locus,
                    flank_length=length,
                    flank_mismatch=mismatch,
                    return_mms=True)
                if total == 0:
                    # No reads at all: a percentage is undefined.
                    pct = "no coverage"
                else:
                    pct = float(len(accepted)) / float(total) * 100
                cells = [str(f1_mm), str(f2_mm), str(pct)]
                out.write('\t'.join(cells) + '\t')
            out.write('\n')
def test_count_reads_count_reads(fn):
    '''
    Brief: Smoke check that calls count_reads.count_reads on fn with the
        fixed (24, 44) argument and prints the ten most common entries of
        the result
    Args:
        fn: passed through unchanged as the first argument of count_reads
    Return: none
    '''
    # Imported lazily so the module is only needed when this check runs.
    from count_reads import count_reads

    window = (24, 44)
    tally = count_reads(fn, window)
    top_ten = tally.most_common(10)
    print(top_ten)
def report_std_dev(bams, reporting_threshold=.9, mismatch=2, length=7):
    '''
    Brief: Write a tab-separated report of the MSI call for each BAM so it
        can be compared with the known status. The call is based on the
        standard deviation of accepted read lengths at each locus in
        _MSI_LOCI, combined per BAM with avg_value
    Args:
        bams: list of BAM file paths; the sample name is the file name with
            'A.bam' removed
        reporting_threshold: a BAM whose combined value is below this is
            called 'MSS', otherwise 'MSI'
        mismatch: forwarded to count_reads as flank_mismatch
        length: forwarded to count_reads as flank_length
    Return: list holding the avg_value result for each BAM, in input order
    '''
    outfile = '/home/upload/msi_project/diag_analysis/method_3/mss_training_set_statuses_stdev.txt'
    avg_stdevs = []  # one combined value per BAM, across all loci
    with open(outfile, 'w') as report:
        report.write('#mismatch: %s, flank length: %s, reporting_threshold: %s\n'
                     % (str(mismatch), str(length), str(reporting_threshold)))
        report.write('locus\t')
        for locus in _MSI_LOCI:
            report.write(locus + '\t')
        report.write('Average\tCall\tKnown status\n')

        for bam in bams:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            report.write(bam_name + '\t')

            locus_stdevs = []
            for locus in _MSI_LOCI:
                reads = count_reads(bam,
                                    locus,
                                    flank_length=length,
                                    flank_mismatch=mismatch)
                if len(reads) == 0:
                    # Nothing to measure at this locus; it is left out of
                    # the per-BAM combination.
                    report.write('n/a\t')
                    continue
                spread = np.std([len(read) for read in reads])
                report.write(str(spread) + '\t')
                locus_stdevs.append(spread)

            bam_stdev = avg_value(locus_stdevs)
            avg_stdevs.append(bam_stdev)

            if len(locus_stdevs) == 0:
                msi_status = 'Indeterminate'
            elif bam_stdev < reporting_threshold:
                msi_status = 'MSS'
            else:
                msi_status = 'MSI'

            known_status = (_ANNOTATIONS[bam_name]
                            if bam_name in _ANNOTATIONS else 'Not reported')
            report.write(
                str(bam_stdev) + '\t' + msi_status + '\t' + known_status + '\n')
    return avg_stdevs
def report_dist_mode(bams, mismatch=2, length=7):
    '''
    Brief: Write a tab-separated report with, for each BAM and each locus in
        _MSI_LOCI, the average absolute distance between accepted read
        lengths and the locus mode length, followed by the known status
        from _ANNOTATIONS
    Args:
        bams: list of BAM file paths; the sample name is the file name with
            'A.bam' removed
        mismatch: int, forwarded to count_reads as flank_mismatch
        length: int, forwarded to count_reads as flank_length
    Returns: none (output goes to a hard-coded file)
    '''
    outfile = '/home/upload/msi_project/diag_analysis/method_2/subsetA_statuses_mode.txt'

    # all_reads[b][l] holds the accepted reads of BAM b at locus l.
    all_reads = []
    for bam in bams:
        bam_reads = [
            count_reads(bam,
                        locus,
                        flank_length=length,
                        flank_mismatch=mismatch)
            for locus in _MSI_LOCI
        ]
        all_reads.append(bam_reads)

    # One mode per locus, computed from the reads of *all* BAMs pooled
    # together. The accumulator is created once per locus; resetting it for
    # every BAM would leave only the last BAM's reads in the pool (and leave
    # it undefined when there are no BAMs at all).
    modes = []
    for locus_idx in range(len(_MSI_LOCI)):
        pooled = []
        for bam_reads in all_reads:
            pooled.extend(bam_reads[locus_idx])
        modes.append(mode_length(pooled))

    with open(outfile, 'w') as f:
        f.write('BAM\n')
        for bam, bam_reads in zip(bams, all_reads):
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            f.write(bam_name + '\t')
            for mode, reads in zip(modes, bam_reads):
                if mode == 'error':
                    # mode_length could not produce a mode for this locus.
                    avg_distance = 'low loc covg'
                elif len(reads) == 0:
                    # This particular BAM has no accepted reads here.
                    avg_distance = 'low bam covg'
                else:
                    total_distance = sum(
                        abs(float(mode) - len(read)) for read in reads)
                    avg_distance = float(total_distance) / len(reads)
                f.write(str(avg_distance) + '\t')
            if bam_name in _ANNOTATIONS:
                known_status = _ANNOTATIONS[bam_name]
            else:
                known_status = 'Not reported'
            f.write(known_status + '\n')
def get_z_score(bamfile, locus, mismatch=2, length=7):
    '''
    Brief: Compute the absolute z-score of the standard deviation of
        accepted read lengths at a locus, relative to the reference pair
        stored in _MSS_LOCUS_DATA[locus]. Entry [0] is the reference value
        and entry [1] the divisor (presumably the MSS mean and standard
        deviation - confirm against where _MSS_LOCUS_DATA is built)
    Args:
        bamfile: BAM file path handed to count_reads
        locus: key into _MSS_LOCUS_DATA, also handed to count_reads
        mismatch: forwarded to count_reads as flank_mismatch
        length: forwarded to count_reads as flank_length
    Return: the absolute z-score, or the string 'error' when the divisor is
        zero or no reads were accepted
    '''
    # A zero divisor would make the z-score undefined.
    if float(_MSS_LOCUS_DATA[locus][1]) == 0:
        return 'error'

    reads = count_reads(bamfile,
                        locus,
                        flank_length=length,
                        flank_mismatch=mismatch)
    if len(reads) == 0:
        return 'error'

    observed = np.std([len(read) for read in reads])
    reference = float(_MSS_LOCUS_DATA[locus][0])
    scale = float(_MSS_LOCUS_DATA[locus][1])
    return abs((reference - float(observed)) / scale)
def report_num_lengths(bams, mismatch=2, length=7):
    '''
    Brief: Write a tab-separated report with, for each BAM and each locus in
        _MSI_LOCI, the number of distinct accepted reads, followed by the
        derived MSI call, the known status from _ANNOTATIONS, and whether
        the two agree
    Args:
        bams: list of BAM file paths; the sample name is the file name with
            'A.bam' removed
        mismatch: int, forwarded to count_reads as flank_mismatch
        length: int, forwarded to count_reads as flank_length
    Returns: none (output goes to a hard-coded file)
    '''
    outfile = '/home/upload/msi_project/diag_analysis/method_1/subsetA_statuses_length.txt'
    with open(outfile, 'w') as f:
        f.write('#mismatch: %s, flank length: %s\n' %
                (str(mismatch), str(length)))
        # The header names only the locus columns; each data row also ends
        # with call, known status and agreement columns.
        f.write('locus\t')
        for locus in _MSI_LOCI:
            f.write(locus + '\t')
        f.write('\n')

        for bam in bams:
            status_marker = 0
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            f.write(bam_name + '\t')
            for locus in _MSI_LOCI:
                accepted_reads = count_reads(bam,
                                             locus,
                                             flank_length=length,
                                             flank_mismatch=mismatch)
                if len(accepted_reads) == 0:
                    f.write('n/a\t')
                else:
                    f.write(str(len(set(accepted_reads))) + '\t')
                    # NOTE(review): every locus with accepted reads bumps
                    # the marker, so one covered locus is enough for an
                    # 'MSI' call - confirm this is the intended rule.
                    status_marker += 1

            msi_status = 'MSI' if status_marker > 0 else 'MSS'
            if bam_name in _ANNOTATIONS:
                known_status = _ANNOTATIONS[bam_name]
            else:
                known_status = 'Not reported'
            # Assign the comparison result; a bare `agree == True` statement
            # would leave the flag permanently False.
            agree = msi_status == known_status
            f.write(msi_status + '\t' + known_status + '\t' + str(agree) +
                    '\n')
def bw_plot(bams):
    '''
    Brief: Draw a box plot with one box per locus in _MSI_LOCI, where each
        box summarises the read count (fourth value returned by count_reads
        with return_mms=True) across the given BAM files, and save it as a
        PNG at a hard-coded path
    Args:
        bams: list of BAM file paths
    Return: none
    '''
    label = []
    data = []
    for locus in _MSI_LOCI:
        label.append(locus)
        depths = []
        for bam in bams:
            # Only the read count is plotted; the other values are unused.
            _runs, _favg, _bavg, depth = count_reads(bam,
                                                     locus,
                                                     return_mms=True)
            depths.append(depth)
        data.append(depths)

    plt.boxplot(data, labels=label)
    # Rotate the tick labels by 90 degrees.
    plt.xticks(rotation=90)
    plt.title('Subset Read Depth')
    plt.savefig('/home/upload/msi_project/subsetA_depth_plot.png')