def status_plot(bams):
    '''
    Brief: Generate histograms showing average length of MS region depending on MSI status

        For every locus in _MSI_LOCI the average returned by
        avg_length(count_reads(bam, locus)) is collected per BAM file into an
        MSI group or an MSS group, and both groups are drawn in one histogram
        that is saved as '<locus>_dist.png'. A BAM file is skipped for a locus
        when it has no entry in _ANNOTATIONS, when its annotation is neither
        'MSI' nor 'MSS', or when avg_length returns 'Insufficient reads'.
        No plot is written for a locus where both groups are empty.
    Args:
        bams: list of BAM file paths; the annotation key is the file name
            (text after the last '/') with 'A.bam' removed
    Return: none
    '''
    for locus in _MSI_LOCI:
        msi_avgs = []
        mss_avgs = []
        for bam in bams:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            if bam_name not in _ANNOTATIONS:
                continue
            status = _ANNOTATIONS[bam_name]
            if status not in ('MSI', 'MSS'):
                continue
            average = avg_length(count_reads(bam, locus))
            if average == 'Insufficient reads':
                continue
            # status is guaranteed to be one of the two groups at this point
            group = msi_avgs if status == 'MSI' else mss_avgs
            group.append(float(average))

        if not msi_avgs and not mss_avgs:
            continue

        plt.hist([msi_avgs, mss_avgs],
                 color=['red', 'blue'],
                 label=['MSI', 'MSS'])
        plt.title('%s Distribution (subset)' % locus)
        plt.legend(loc='best')
        # Call xlabel; assigning to it would replace the pyplot function
        # and leave the axis without a label.
        plt.xlabel('Average MS length (bp)')
        plt.ylabel('Number of BAM files')
        saveloc = '/home/upload/msi_project/status_corr_dist/subsetA/%s_dist.png' % locus
        plt.savefig(saveloc)
        # Clear the figure so the next locus does not draw on top of this one
        plt.clf()
def print_mm_depth(bams, mismatch=2, length=7):
    '''
    Brief: Write a tab-separated table holding, for every BAM file and every
        locus in _MSI_LOCI, the two flank mismatch values returned by
        count_reads and the percentage of reads that were accepted; used for
        optimizing the flank length and mismatch parameters
    Args:
        bams: list of BAM file paths
        mismatch: int, passed to count_reads as flank_mismatch
        length: int, passed to count_reads as flank_length
    Returns: none (the table goes to the file named below, whose name
        includes both parameters)
    '''
    report_path = '/home/upload/msi_project/mm_depth_analysis/subsetA-mismatch_depth-%d-%d.txt' % (
        mismatch, length)
    with open(report_path, 'w') as out:
        out.write('#mismatches: %d, flank length: %d\n' % (mismatch, length))

        # Two header rows: locus names (each spanning three columns) and
        # the three column titles repeated once per locus.
        locus_row = ''.join(locus + '\t\t\t' for locus in _MSI_LOCI)
        column_row = 'f1 mm\tf2mm\t% accepted reads\t' * len(_MSI_LOCI)
        out.write('\t' + locus_row + '\n\t' + column_row + '\n')

        for bam in bams:
            out.write(bam.split('/')[-1].replace('.bam', '') + '\t')
            for locus in _MSI_LOCI:
                accepted, f1_mm, f2_mm, depth = count_reads(
                    bam,
                    locus,
                    flank_length=length,
                    flank_mismatch=mismatch,
                    return_mms=True)
                # A depth of zero would divide by zero, so report it as text
                pct = ("no coverage" if depth == 0 else
                       float(len(accepted)) / float(depth) * 100)
                out.write('\t'.join(map(str, (f1_mm, f2_mm, pct))) + '\t')
            out.write('\n')
# Example no. 3
# 0
def test_count_reads_count_reads(fn):
    '''
    Brief: Call count_reads (imported from the count_reads module) on fn with
        the tuple (24, 44) and print the ten most common entries of the result
    Args: fn - passed unchanged as the first argument of count_reads
    Return: none
    '''
    # Function-scope import: the module is only loaded when this runs
    from count_reads import count_reads

    region = (24, 44)
    counts = count_reads(fn, region)
    top_ten = counts.most_common(10)
    print(top_ten)
def report_std_dev(bams, reporting_threshold=.9, mismatch=2, length=7):
    '''
    Brief: Reports to file the MSI status of a patient to compare with the
        known status, determining MSI status based on standard deviation.
        One row is written per BAM file: the standard deviation of accepted
        read lengths at each locus ('n/a' when no reads were accepted), the
        avg_value of those entries, the call and the annotated status.
        The call is 'MSS' when the average is below reporting_threshold and
        'MSI' otherwise; 'Indeterminate' is only used when no loci were
        processed. The known status is 'Not reported' for BAM files that
        are missing from _ANNOTATIONS.
    Args:
        bams: list of BAM file paths
        reporting_threshold: number the per-BAM average is compared against
        mismatch: int, passed to count_reads as flank_mismatch
        length: int, passed to count_reads as flank_length
    Return: list holding the avg_value result of every BAM file, in the
        order of bams
    '''
    report_path = '/home/upload/msi_project/diag_analysis/method_3/mss_training_set_statuses_stdev.txt'
    per_bam_averages = []  # one entry per BAM file, averaged over all loci

    with open(report_path, 'w') as out:
        out.write('#mismatch: %s, flank length: %s, reporting_threshold: %s\n' %
                  (mismatch, length, reporting_threshold))
        out.write('locus\t')
        for locus in _MSI_LOCI:
            out.write(locus + '\t')
        out.write('Average\tCall\tKnown status\n')

        for bam in bams:
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            out.write(bam_name + '\t')

            spreads = []
            for locus in _MSI_LOCI:
                reads = count_reads(bam,
                                    locus,
                                    flank_length=length,
                                    flank_mismatch=mismatch)
                if len(reads) != 0:
                    spread = np.std([len(read) for read in reads])
                else:
                    spread = 'n/a'
                out.write(str(spread) + '\t')
                # NOTE(review): the 'n/a' placeholder is kept in the list
                # handed to avg_value; how avg_value treats it is not
                # visible here - confirm.
                spreads.append(spread)

            bam_average = avg_value(spreads)
            per_bam_averages.append(bam_average)

            if not spreads:
                call = 'Indeterminate'
            elif bam_average < reporting_threshold:
                call = 'MSS'
            else:
                call = 'MSI'

            known = (_ANNOTATIONS[bam_name]
                     if bam_name in _ANNOTATIONS else 'Not reported')

            out.write('\t'.join([str(bam_average), call, known]) + '\n')

    return per_bam_averages
def report_dist_mode(bams, mismatch=2, length=7):
    '''
    Brief: Reports to file the MSI status of a patient to compare with the
        known status, determining MSI status based on absolute distance
        from the mode.
        The mode length of a locus is taken from mode_length over the
        accepted reads of all BAM files pooled together. Each output row
        holds the BAM name, the average absolute distance of its read
        lengths from that mode at every locus, and the annotated status
        ('Not reported' when the BAM is missing from _ANNOTATIONS).
        A locus column reads 'low loc covg' when mode_length returned
        'error' and 'low bam covg' when the BAM has no accepted reads there.
    Args:
        bams: list of BAM file paths
        mismatch: int, passed to count_reads as flank_mismatch
        length: int, passed to count_reads as flank_length
    Returns: none
    '''
    outfile = '/home/upload/msi_project/diag_analysis/method_2/subsetA_statuses_mode.txt'

    # all_reads[b][l] holds the accepted reads of BAM b at locus l
    all_reads = []
    for bam in bams:
        all_reads.append([
            count_reads(bam,
                        locus,
                        flank_length=length,
                        flank_mismatch=mismatch) for locus in _MSI_LOCI
        ])

    # Mode length per locus. The pool is created once per locus, outside
    # the BAM loop, so that it accumulates the reads of every BAM file
    # rather than only those of the last one.
    modes = []
    for locus_index in range(len(_MSI_LOCI)):
        pooled_reads = []
        for bam_reads in all_reads:
            pooled_reads.extend(bam_reads[locus_index])
        modes.append(mode_length(pooled_reads))

    # Average distance from the mode for each BAM at each locus, followed
    # by the annotated status for comparison
    with open(outfile, 'w') as f:
        f.write('BAM\n')
        for bam, bam_reads in zip(bams, all_reads):
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            f.write(bam_name + '\t')
            for mode, reads in zip(modes, bam_reads):
                if mode == 'error':
                    avg_distance = 'low loc covg'
                elif len(reads) == 0:
                    avg_distance = 'low bam covg'
                else:
                    total_distance = sum(
                        abs(float(mode) - len(read)) for read in reads)
                    avg_distance = float(total_distance) / len(reads)
                f.write(str(avg_distance) + '\t')
            if bam_name in _ANNOTATIONS:
                known_status = _ANNOTATIONS[bam_name]
            else:
                known_status = 'Not reported'
            f.write(known_status + '\n')
def get_z_score(bamfile, locus, mismatch=2, length=7):
    '''
    Brief: Absolute z-score of the standard deviation of accepted read
        lengths at one locus of one BAM file, computed as
        |(_MSS_LOCUS_DATA[locus][0] - std_dev) / _MSS_LOCUS_DATA[locus][1]|
        (entries [0] and [1] are presumably the reference mean and standard
        deviation for the locus - confirm against where the table is built)
    Args:
        bamfile: BAM file path handed to count_reads
        locus: key into _MSS_LOCUS_DATA, also handed to count_reads
        mismatch: int, passed to count_reads as flank_mismatch
        length: int, passed to count_reads as flank_length
    Return: the z-score as a float, or the string 'error' when entry [1]
        of the reference data is zero or no reads were accepted
    '''
    # A zero divisor makes the score undefined; bail out before reading
    if float(_MSS_LOCUS_DATA[locus][1]) == 0:
        return 'error'

    reads = count_reads(bamfile,
                        locus,
                        flank_length=length,
                        flank_mismatch=mismatch)
    if len(reads) == 0:
        return 'error'

    spread = np.std([len(read) for read in reads])
    reference = _MSS_LOCUS_DATA[locus]
    difference = float(reference[0]) - float(spread)
    return abs(difference / float(reference[1]))
def report_num_lengths(bams, mismatch=2, length=7):
    '''
    Brief: Reports to file the MSI status of a patient to compare with the
        known status, determining MSI status based on number of different
        lengths.
        One row is written per BAM file: its name, for every locus the
        number of distinct accepted reads (len(set(...)); 'n/a' when no
        reads were accepted), then the call, the annotated status
        ('Not reported' when the BAM is missing from _ANNOTATIONS) and
        whether the two agree.
    Args:
        bams: list of BAM file paths
        mismatch: int, passed to count_reads as flank_mismatch
        length: int, passed to count_reads as flank_length
    Returns: none
    '''
    outfile = '/home/upload/msi_project/diag_analysis/method_1/subsetA_statuses_length.txt'

    with open(outfile, 'w') as f:
        f.write('#mismatch: %s, flank length: %s\n' %
                (str(mismatch), str(length)))
        # Only the locus columns get a header; the three trailing columns
        # (call, known status, agreement) are written without one.
        f.write('locus\t')
        for locus in _MSI_LOCI:
            f.write(locus + '\t')
        f.write('\n')
        for bam in bams:
            status_marker = 0
            bam_name = bam.split('/')[-1].replace('A.bam', '')
            f.write(bam_name + '\t')
            for locus in _MSI_LOCI:
                accepted_reads = count_reads(bam,
                                             locus,
                                             flank_length=length,
                                             flank_mismatch=mismatch)
                if len(accepted_reads) == 0:
                    f.write('n/a\t')
                else:
                    f.write(str(len(set(accepted_reads))) + '\t')
                # NOTE(review): the marker is raised for every locus no
                # matter what was counted, so the call below is 'MSI'
                # whenever _MSI_LOCI is non-empty. A per-locus threshold
                # test looks to be missing - confirm the intended rule.
                status_marker += 1
            msi_status = 'MSI' if status_marker > 0 else 'MSS'
            if bam_name in _ANNOTATIONS:
                known_status = _ANNOTATIONS[bam_name]
            else:
                known_status = 'Not reported'
            # Assign the comparison result; a bare `agree == True`
            # statement would leave the flag permanently False.
            agree = msi_status == known_status
            f.write(msi_status + '\t' + known_status + '\t' + str(agree) +
                    '\n')
def bw_plot(bams):
    '''
    Brief: Save a box plot with one box per locus in _MSI_LOCI, each box
        summarising over all BAM files the num_reads value (fourth item)
        returned by count_reads(bam, locus, return_mms=True)
    Args: bams - list of BAM file paths
    Return: none
    '''

    def read_depth(bam, locus):
        # Only the last of the four returned values is plotted
        _runs, _favg, _bavg, num_reads = count_reads(bam,
                                                     locus,
                                                     return_mms=True)
        return num_reads

    tick_names = []
    depths_by_locus = []
    for locus in _MSI_LOCI:
        tick_names.append(locus)
        depths_by_locus.append([read_depth(bam, locus) for bam in bams])

    plt.boxplot(depths_by_locus, labels=tick_names)
    # Vertical tick labels keep the locus names from overlapping
    plt.xticks(rotation=90)
    plt.title('Subset Read Depth')
    plt.savefig('/home/upload/msi_project/subsetA_depth_plot.png')