def fix_insert_size(in_bam, config):
    """
    Tophat sets PI in the RG to be the inner distance size, but the SAM spec
    states should be the insert size. This fixes the RG in the alignment
    file generated by Tophat header to match the spec.

    in_bam: path to the alignment file whose header should be fixed
    config: not used by this function; kept so existing callers still work

    Returns the path to the ".pi_fixed.bam" file (reused if it already
    exists), or in_bam unchanged when the first read group has no PI tag.
    """
    base = os.path.splitext(in_bam)[0]
    fixed_file = base + ".pi_fixed.bam"
    if file_exists(fixed_file):
        return fixed_file
    # Temporary output; it is written in BAM mode ("wb") despite the name
    # and moved to fixed_file once complete.
    header_file = base + ".header.sam"
    read_length = bam.estimate_read_length(in_bam)
    # Read the header inside a context manager so the handle is closed on
    # every path, including the early return below.
    with bam.open_samfile(in_bam) as bam_handle:
        header = bam_handle.header.copy()
    rg_dict = header['RG'][0]
    if 'PI' not in rg_dict:
        return in_bam
    # insert size = inner distance + the length of both mates
    insert_size = int(rg_dict['PI']) + 2 * read_length
    rg_dict['PI'] = insert_size
    header['RG'][0] = rg_dict
    with pysam.Samfile(header_file, "wb", header=header) as out_handle:
        with bam.open_samfile(in_bam) as in_handle:
            for record in in_handle:
                out_handle.write(record)
    shutil.move(header_file, fixed_file)
    return fixed_file
def fix_insert_size(in_bam, config):
    """
    Tophat sets PI in the RG to be the inner distance size, but the SAM spec
    states should be the insert size. This fixes the RG in the alignment
    file generated by Tophat header to match the spec.

    in_bam: path to the alignment file whose header should be fixed
    config: not used by this function; kept so existing callers still work

    Returns the path to the ".pi_fixed.bam" file (reused if it already
    exists), or in_bam unchanged when the first read group has no PI tag.
    """
    base = os.path.splitext(in_bam)[0]
    fixed_file = base + ".pi_fixed.bam"
    if file_exists(fixed_file):
        return fixed_file
    # Temporary output; it is written in BAM mode ("wb") despite the name
    # and moved to fixed_file once complete.
    header_file = base + ".header.sam"
    read_length = bam.estimate_read_length(in_bam)
    # Read the header inside a context manager so the handle is closed on
    # every path, including the early return below.
    with bam.open_samfile(in_bam) as bam_handle:
        header = bam_handle.header.copy()
    rg_dict = header['RG'][0]
    if 'PI' not in rg_dict:
        return in_bam
    # insert size = inner distance + the length of both mates
    insert_size = int(rg_dict['PI']) + 2 * read_length
    rg_dict['PI'] = insert_size
    header['RG'][0] = rg_dict
    with pysam.Samfile(header_file, "wb", header=header) as out_handle:
        with bam.open_samfile(in_bam) as in_handle:
            for record in in_handle:
                out_handle.write(record)
    shutil.move(header_file, fixed_file)
    return fixed_file
def pick_kmersize(fq):
    """
    Choose a kmer size for the reads in fq, following
    https://www.biostars.org/p/201474/

    Short version: use 31 unless the reads are very short; in that case
    use roughly half the read length. The result is always odd.
    """
    if bam.is_bam(fq):
        estimate = bam.estimate_read_length
    else:
        estimate = fastq.estimate_read_length
    read_len = estimate(fq)
    half = int(round(read_len / 2))
    # cap at 31, otherwise fall back to half the read length
    k = min(half, 31)
    # bump even sizes up to the next odd number
    return k + 1 if k % 2 == 0 else k
def pick_kmersize(fq):
    """
    Choose a kmer size for the reads in fq, following
    https://www.biostars.org/p/201474/

    Short version: use 31 unless the reads are very short; in that case
    use roughly half the read length. The result is always odd.
    """
    reader = bam if bam.is_bam(fq) else fastq
    read_len = reader.estimate_read_length(fq)
    half = int(round(read_len / 2))
    if half >= 31:
        return 31
    # short reads: half the read length, bumped to the next odd number
    if half % 2 == 0:
        return half + 1
    return half