def test_merge_bam():
    """Merging a BAM with itself must yield twice its alignment count."""
    with get_input_files('1.bam', '1.bam') as input_files, \
            get_tmp_path() as outpath:
        Bam.merge(input_files, outpath)
        # The same input is listed twice, so the merged total is expected
        # to be double the count of a single input.
        merged_total = int(view("-c", outpath).strip())
        expected_total = 2 * int(view("-c", input_files[0]).strip())
        assert expected_total == merged_total
def setup_module():
    """Build and index every BAM in ``bam_files`` from its sibling SAM file.

    This function is run once for this module.
    """
    for bam_path in bam_files:
        assert bam_path.endswith(".bam")
        # "<stem>.bam" is generated from "<stem>.sam".
        stem = bam_path[:-4]
        pysam.view(stem + ".sam", "-b", "-o", bam_path, catch_stdout=False)
        pysam.index(bam_path, catch_stdout=False)
def checkSamtoolsViewEqual(filename1, filename2, without_header=False):
    '''Return True if the two files are equal in their content through
    samtools view.

    The MD and NM tags are stripped before comparing. Lines that differ only
    in the order of their tab-separated fields count as equal, because tags
    get rearranged between BAM and CRAM.

    :param filename1: first alignment file.
    :param filename2: second alignment file.
    :param without_header: if True, ``-h`` is not passed to samtools view.
    :return: True if the views are equivalent, False otherwise.
    '''
    # strip MD and NM tags, as not preserved in CRAM files
    args = ["-x", "MD", "-x", "NM"]
    if not without_header:
        args.append("-h")

    lines1 = pysam.view(*(args + [filename1]))
    lines2 = pysam.view(*(args + [filename2]))

    # The output may arrive as one string instead of a list of lines;
    # normalise so that the comparison below is always done per line.
    if isinstance(lines1, str):
        lines1 = lines1.splitlines()
    if isinstance(lines2, str):
        lines2 = lines2.splitlines()

    if len(lines1) != len(lines2):
        return False

    if lines1 != lines2:
        # line by line comparison
        # sort each line, as tags get rearranged between
        # BAM/CRAM
        for n, (l1, l2) in enumerate(zip(lines1, lines2)):
            l1 = sorted(l1.rstrip("\n").split("\t"))
            l2 = sorted(l2.rstrip("\n").split("\t"))
            if l1 != l2:
                print("mismatch in line %i" % n)
                print(l1)
                print(l2)
                return False

    # Every line matched, at least up to field order.
    return True
def checkSamtoolsViewEqual(filename1, filename2, without_header=False):
    '''Return True if the two files are equal in their content through
    samtools view.

    The MD and NM tags are stripped before comparing. Lines that differ only
    in the order of their tab-separated fields count as equal, because tags
    get rearranged between BAM and CRAM.

    :param filename1: first alignment file.
    :param filename2: second alignment file.
    :param without_header: if True, ``-h`` is not passed to samtools view.
    :return: True if the views are equivalent, False otherwise.
    '''
    # strip MD and NM tags, as not preserved in CRAM files
    args = ["-x", "MD", "-x", "NM"]
    if not without_header:
        args.append("-h")

    lines1 = pysam.view(*(args + [filename1]))
    lines2 = pysam.view(*(args + [filename2]))

    # The output may arrive as one string instead of a list of lines;
    # normalise so that the comparison below is always done per line.
    if isinstance(lines1, str):
        lines1 = lines1.splitlines()
    if isinstance(lines2, str):
        lines2 = lines2.splitlines()

    if len(lines1) != len(lines2):
        return False

    if lines1 != lines2:
        # line by line comparison
        # sort each line, as tags get rearranged between
        # BAM/CRAM
        for n, (l1, l2) in enumerate(zip(lines1, lines2)):
            l1 = sorted(l1.rstrip("\n").split("\t"))
            l2 = sorted(l2.rstrip("\n").split("\t"))
            if l1 != l2:
                # Single-argument print(...) behaves the same under
                # Python 2 and Python 3.
                print("mismatch in line %i" % n)
                print(l1)
                print(l2)
                return False

    # Every line matched, at least up to field order.
    return True
def execute(self, inBam, exclude, readList, outBam, picardOptions=None, JVMmemory=None):    # pylint: disable=W0221
    """Filter ``inBam`` by the read names in ``readList`` and write ``outBam``.

    ``exclude`` selects between dropping (True) and keeping (False) the
    listed reads. Empty inputs are handled here without calling the
    filtering subtool.
    """
    extra_options = picardOptions or []

    if tools.samtools.SamtoolsTool().isEmpty(inBam):
        # Picard FilterSamReads cannot deal with an empty input BAM file
        shutil.copyfile(inBam, outBam)
        return

    if os.path.getsize(readList) > 0:
        filter_mode = 'excludeReadList' if exclude else 'includeReadList'
        opts = [
            'INPUT=' + inBam,
            'OUTPUT=' + outBam,
            'READ_LIST_FILE=' + readList,
            'FILTER=' + filter_mode,
            'WRITE_READS_FILES=false',
        ]
        PicardTools.execute(self, self.subtoolName, opts + extra_options, JVMmemory)
        return

    # Picard FilterSamReads cannot deal with an empty READ_LIST_FILE
    if exclude:
        # Excluding nothing: the output is a plain copy of the input.
        shutil.copyfile(inBam, outBam)
        return

    # Including nothing: emit a header-only file.
    header_sam = util.file.mkstempfname('.sam')
    # output format (sam/bam) is inferred by samtools based on file extension
    view_args = ['-o', header_sam, '-H']
    if inBam.endswith('.sam'):
        view_args.append('-S')
    view_args.append(inBam)
    pysam.view(*view_args, catch_stdout=False)

    # pysam.AlignmentFile cannot write an empty file
    # samtools cannot convert SAM -> BAM on an empty file
    # but Picard SamFormatConverter can deal with empty files
    opts = ['INPUT=' + header_sam, 'OUTPUT=' + outBam, 'VERBOSITY=ERROR']
    PicardTools.execute(self, 'SamFormatConverter', opts, JVMmemory='50m')
def test_sieve():
    """
    Test filtering a BAM file by MAPQ, flag, and blacklist
    """
    outfile = '/tmp/test_sieve.bam'
    outfiltered = '/tmp/test_sieveFiltered.bam'
    outlog = '/tmp/test_sieve.log'

    template = '-b {} --smartLabels --minMappingQuality 10 --samFlagExclude 512 -bl {} -o {} --filterMetrics {} --filteredOutReads {}'
    args = template.format(
        BAMFILE_FILTER, BEDFILE_FILTER, outfile, outlog, outfiltered).split()
    sieve.main(args)

    # Check the metrics log first, then remove it.
    with open(outlog, 'r') as log_handle:
        resp = log_handle.readlines()
    expected = [
        '#bamFilterReads --filterMetrics\n',
        '#File\tReads Remaining\tTotal Initial Reads\n',
        'test_filtering\t5\t193\n',
    ]
    assert_equal(resp, expected)
    unlink(outlog)

    # Both BAM outputs are verified through the MD5 of their text view.
    checks = (
        (outfile, "acbc4443fb0387bfd6c412af9d4fc414"),
        (outfiltered, "b90befdd5f073f14acb9a38661f301ad"),
    )
    for bam_path, expected_md5 in checks:
        digest = hashlib.md5(pysam.view(bam_path).encode('utf-8')).hexdigest()
        assert (digest == expected_md5)
        unlink(bam_path)
def execute(self, inBam, exclude, readList, outBam, picardOptions=None, JVMmemory=None):
    """Filter ``inBam`` by the read names in ``readList`` and write ``outBam``.

    ``exclude`` selects between dropping (True) and keeping (False) the
    listed reads. An empty read list is handled here without calling the
    filtering subtool.
    """
    extra_options = picardOptions or []

    if os.path.getsize(readList) > 0:
        filter_mode = 'excludeReadList' if exclude else 'includeReadList'
        opts = ['INPUT=' + inBam,
                'OUTPUT=' + outBam,
                'READ_LIST_FILE=' + readList,
                'FILTER=' + filter_mode,
                'WRITE_READS_FILES=false']
        PicardTools.execute(self, self.subtoolName, opts + extra_options, JVMmemory)
        return

    # Picard FilterSamReads cannot deal with an empty READ_LIST_FILE
    if exclude:
        # Excluding nothing: the output is a plain copy of the input.
        shutil.copyfile(inBam, outBam)
        return

    # Including nothing: write only the header to a temporary SAM file.
    header_sam = util.file.mkstempfname('.sam')
    with open(header_sam, 'wt') as outf:
        view_args = ['-H', '-S', inBam] if inBam.endswith('.sam') else ['-H', inBam]
        outf.writelines(pysam.view(*view_args))

    # pysam.AlignmentFile cannot write an empty file
    # samtools cannot convert SAM -> BAM on an empty file
    # but Picard SamFormatConverter can deal with empty files
    opts = ['INPUT=' + header_sam, 'OUTPUT=' + outBam, 'VERBOSITY=ERROR']
    PicardTools.execute(self, 'SamFormatConverter', opts, JVMmemory='50m')
def Downsample(k, path_bam, output_path, reads_min_count):
    """Downsample a BAM to roughly ``reads_min_count`` reads, sorted and indexed.

    The result is written to ``<output_path>/<name>.sorted_DS.bam``. If the
    input already has at most ``reads_min_count`` reads it is copied as is.

    Args:
        k: integer placed before the decimal point of the ``-s`` argument
            given to samtools view (the subsampling seed per the samtools
            manual).
        path_bam: path of the input BAM file.
        output_path: directory receiving the output files.
        reads_min_count: number of reads to downsample to.
    """
    from decimal import Decimal

    base_name = os.path.basename(path_bam)
    stem = output_path + "/" + base_name.split(".bam")[0]
    path_new = stem + ".DS.bam"
    path_temp = stem + ".temp.sorted"
    path_new_sort = stem + ".sorted_DS.bam"

    reads_count = count_reads(path_bam, output_path)

    if reads_count <= reads_min_count:
        # Nothing to subsample (this also covers an empty input, which would
        # otherwise lead to a division by zero below).
        shutil.copy2(path_bam, path_new_sort)
        pysam.index(path_new_sort)
        return

    fraction = float(reads_min_count) / reads_count
    # str(float) switches to scientific notation for small values
    # (e.g. '1e-05'), which has no '.' to split on. Going through Decimal
    # yields the same digits in plain positional notation.
    fraction_digits = format(Decimal(repr(fraction)), "f").split(".")[1]

    pysam.view("-b", "-s", str(k) + '.' + fraction_digits, "-O", "BAM", "-o",
               path_new, path_bam, catch_stdout=False)
    pysam.sort("-O", "BAM", "-T", path_temp, "-o", path_new_sort, path_new)
    pysam.index(path_new_sort)
    # The unsorted subsample is only an intermediate file.
    os.remove(path_new)
def create_bam(filename, threads=0):
    """
    Function that create a BAM file from a SAM file.
    The SAM file is removed once the BAM file has been written.
    Args :
        filename [STR] = SAM filename
        threads [INT] = total number of threads; ``threads - 1`` is passed
                        to samtools as ``-@`` (never less than 0)
    Returns:
        bamfile [STR] = BAM filename
    """
    # name of the bam file to create: the last three characters of both the
    # directory name and the file name are replaced by "bam"
    bamfile = os.path.dirname(filename)[:-3] + "bam/" + os.path.basename(
        filename)[:-3] + "bam"
    # With the default threads=0 the plain subtraction would hand "-1" to
    # samtools; clamp so the option value is never negative.
    additional_threads = max(threads - 1, 0)
    # convert sam to bam using pysam
    pysam.view('-@', str(additional_threads), '-S', '-b', '-o', bamfile,
               filename, catch_stdout=False)
    os.remove(filename)
    return bamfile
def filter_bam(in_fpath, out_fpath, min_mapq=0, required_flag_tags=None, filtering_flag_tags=None, regions=None):
    """Write to ``out_fpath`` the reads of ``in_fpath`` that pass the filters.

    Optional filters: minimum mapping quality, required flag tags (``-f``),
    excluded flag tags (``-F``) and a set of regions (``regions.segments``).
    """
    # The output option is glued to its value ('-o<path>') on purpose.
    # It should be ['-o', out_fpath], but it is a workaround, take a look at:
    # https://groups.google.com/forum/#!msg/pysam-user-group/ooHgIiNVe4c/CcY06d45rzQJ
    cmd = ['-bh', '-o' + out_fpath]

    if min_mapq:
        cmd += ['-q', str(min_mapq)]

    flag_filters = (('-f', required_flag_tags), ('-F', filtering_flag_tags))
    for option, tags in flag_filters:
        if tags:
            cmd += [option, str(create_flag(tags))]

    cmd.append(in_fpath)

    if regions:
        cmd.extend('{0}:{1}-{2}'.format(*s) for s in regions.segments)

    pysam.view(*cmd)
def setup_module():
    """Convert each SAM fixture to BAM and index it.

    This function is run once for this module.
    """
    for target in bam_files:
        assert target.endswith('.bam')
        # The source SAM shares the BAM's name up to the extension.
        source = target[:-4] + '.sam'
        pysam.view(source, '-b', '-o', target, catch_stdout=False)
        pysam.index(target, catch_stdout=False)
def bam_to_sam(bamfile, odir, bname):
    """Convert ``bamfile`` to SAM (header included) and return the SAM path.

    The output path is ``odir + bname + '.sam'``; no path separator is
    inserted between ``odir`` and ``bname``.
    """
    samfile = ''.join((odir, bname, '.sam'))
    view_args = ('-h', bamfile, '-o', samfile)
    pysam.view(*view_args, catch_stdout=False)
    return samfile
def only_mapped(filename, threads=1):
    """
    Function that keep only mapped reads in the BAM file
    Args:
        filename [STR] = BAM file, containing all alignments
        threads [INT] = ``threads - 1`` is passed to samtools as ``-@``
    Returns:
        mappedfile [STR] = BAM file with only the mapped reads
    """
    directory, base_name = os.path.split(filename)
    # new bamfile name: the first seven characters of the file name are
    # replaced by "filtered_"
    mappedfile = '{0}/filtered_{1}'.format(directory, base_name[7:])
    # get only mapped reads (-F 4 drops reads flagged with 4)
    view_args = ['-@', str(threads - 1), '-b', '-F', '4', filename, '-o', mappedfile]
    pysam.view(*view_args, catch_stdout=False)
    return mappedfile
def run_mapping(num_threads, reference_file, fastq_files, output_path,
                sample_name, remove_large_files):
    '''Run mapping with bwa to create a SAM file, then convert it to BAM,
    sort and index the file.

    Args:
        num_threads: number of threads for bwa and samtools (int or str).
        reference_file: reference passed to ``bwa mem``.
        fastq_files: sequence of one (single-end) or two (paired-end) FASTQ
            paths.
        output_path: directory for the output files.
        sample_name: base name of the output files.
        remove_large_files: if true, delete the FASTQ inputs afterwards.

    Returns:
        Path of the sorted, indexed BAM file.

    Raises:
        ValueError: if ``fastq_files`` does not hold one or two paths.
        subprocess.CalledProcessError: if bwa exits with a non-zero status.
    '''
    if len(fastq_files) not in (1, 2):
        raise ValueError(
            "Expected 1 or 2 FASTQ files, got {}".format(len(fastq_files)))
    # Command-line arguments must be strings, whatever the caller passed.
    threads = str(num_threads)

    logging.info("Starting mapping with BWA")
    output_file = output_path + '/' + sample_name
    sam_path = output_file + '.sam'
    bam_path = output_file + '.bam'
    sorted_bam_path = output_file + '.sorted.bam'
    logging.info("Creating output file: {}.sorted.bam".format(output_file))

    bwacommand = ['bwa', 'mem', '-t', threads, reference_file] + list(fastq_files)
    with open(sam_path, 'w') as g:
        returncode = subprocess.call(bwacommand, stdout=g)
    if returncode != 0:
        # Do not feed a truncated SAM file to samtools.
        raise subprocess.CalledProcessError(returncode, bwacommand)

    pysam.view('-Sb', '-@', threads, sam_path, '-o', bam_path,
               catch_stdout=False)
    pysam.sort('-@', threads, bam_path, '-o', sorted_bam_path,
               catch_stdout=False)
    pysam.index(sorted_bam_path, catch_stdout=False)

    # Only the sorted BAM is kept.
    os.remove(sam_path)
    os.remove(bam_path)
    if remove_large_files:
        for fastq in fastq_files:
            os.remove(fastq)

    logging.info("Finished mapping")
    return sorted_bam_path
def _convert_bam_to_sam(self):
    """Write ``self.aligning_result_path_bam`` as SAM, header included, to
    ``self.aligning_result_path_sam``."""
    view_args = (
        "-h",
        "-o",
        self.aligning_result_path_sam,
        self.aligning_result_path_bam,
    )
    pysam.view(*view_args, catch_stdout=False)
def split_bam(bam):
    """Split ``bam`` on flag 16 into two indexed BAM files.

    Reads without flag 16 go to ``*.fwd.bam``, reads with it to
    ``*.rev.bam``. Returns ``(fwdbam, revbam)``.
    """
    fwdbam = re.sub(r'bam$', 'fwd.bam', bam)
    revbam = re.sub(r'bam$', 'rev.bam', bam)

    # -F excludes reads carrying the flag, -f requires it.
    strand_outputs = (("-F", fwdbam), ("-f", revbam))
    for flag_option, out_bam in strand_outputs:
        pysam.view(flag_option, "16", "-h", "-b", "-o", out_bam, bam,
                   catch_stdout=False)
    for _, out_bam in strand_outputs:
        pysam.index(out_bam)

    return fwdbam, revbam
def generate_bam_file(sam_content, file_prefix):
    """Write ``sam_content`` to ``<file_prefix>.sam``, convert it to
    ``<file_prefix>.bam`` and index the BAM file."""
    sam_path = "{}.sam".format(file_prefix)
    bam_path = "{}.bam".format(file_prefix)
    with open(sam_path, "w") as sam_handle:
        sam_handle.write(sam_content)
    output_option = "-o{}".format(bam_path)
    pysam.view("-Sb", output_option, sam_path, catch_stdout=False)
    pysam.index(bam_path)
def convert_to_sam(self):
    ''' Convert the sorted BAM to SAM format.

    The input is ``self.bam_output`` with its last four characters replaced
    by ".sorted.bam"; the output carries the same name with a ".sam"
    extension. The output path is reported on stderr.
    '''
    sorted_output = self.bam_output[:-4] + ".sorted.bam"
    sam_output = sorted_output[:-4] + ".sam"
    pysam.view("-h", "-o", sam_output, sorted_output, catch_stdout=False)
    # sam_output already ends in ".sam"; report the real file name.
    sys.stderr.write('New sorted sam file: ' + str(sam_output) + '\n')
def _generate_bam_file(self, sam_content, file_prefix):
    """Write ``sam_content`` to ``<file_prefix>.sam``, convert it to
    ``<file_prefix>.bam`` and index the BAM file."""
    sam_path = "%s.sam" % file_prefix
    bam_path = "%s.bam" % file_prefix
    with open(sam_path, "w") as sam_handle:
        sam_handle.write(sam_content)
    output_option = "-o%s" % bam_path
    pysam.view("-Sb", output_option, sam_path)
    pysam.index(bam_path)
def bam_count_reads(bam_file, aligned=False):
    """
    Wrapper to count the number of (aligned) reads in a bam file

    Returns the stripped output of ``samtools view -c``. With
    ``aligned=True`` the reads matching flag mask 260 are left out.
    """
    count_args = ["-c"]
    if aligned:
        count_args += ["-F", "260"]
    count_args.append(bam_file)
    return pysam.view(*count_args).strip()  # pylint: disable=no-member
def split_reads_by_chrom(sam_file, tmp_dir="/dev/shm/tmp_label_reads", n_threads=1):
    """ Reads a SAM/BAM file and splits the reads into one file per
        chromosome. Returns a list of the resulting filenames.

        Args:
            sam_file: path ending in ".sam" or ".bam".
            tmp_dir: working directory; "raw" and "raw/chroms" are created
                inside it.
            n_threads: thread count handed to samtools view/sort.

        Raises:
            ValueError: if sam_file ends in neither ".sam" nor ".bam".
    """
    def log_step(message):
        # Progress line prefixed with a UTC timestamp.
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
        print("[ %s ] %s" % (ts, message))

    def ensure_dir(path):
        # Create the directory (and parents) without going through a shell,
        # so paths with spaces or shell metacharacters are handled safely.
        if not os.path.isdir(path):
            os.makedirs(path)

    log_step("Splitting SAM by chromosome...")

    tmp_dir = tmp_dir + "/raw"
    ensure_dir(tmp_dir)

    if sam_file.endswith(".sam"):
        # Convert to bam
        log_step("-----Converting to bam....")
        bam_file = tmp_dir + "/all_reads.bam"
        pysam.view("-b", "-S", "-@", str(n_threads), "-o", bam_file, sam_file,
                   catch_stdout=False)
    elif sam_file.endswith(".bam"):
        bam_file = sam_file
    else:
        raise ValueError("Please provide a .sam or .bam file")

    # Index the file if no index exists
    if not os.path.isfile(bam_file + ".bai"):
        log_step("-----Sorting and indexing...")
        sorted_bam = tmp_dir + "/all_reads.sorted.bam"
        pysam.sort("-@", str(n_threads), "-o", sorted_bam, bam_file)
        bam_file = sorted_bam
        pysam.index(bam_file)

    # Open bam file
    tmp_dir += "/chroms"
    ensure_dir(tmp_dir)
    read_files = []
    log_step("-----Writing chrom files...")

    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # Iterate over chromosomes and write a reads file for each;
        # contigs without mapped reads are skipped.
        chromosomes = [x.contig for x in bam.get_index_statistics()
                       if x.mapped > 0]
        for chrom in chromosomes:
            fname = tmp_dir + "/" + chrom + ".sam"
            with pysam.AlignmentFile(fname, "w", template=bam) as o:
                for record in bam.fetch(chrom):
                    o.write(record)
            read_files.append(fname)

    return read_files
def make_bam_view(sam_path):
    """Convert ``sam_path`` to a BAM file next to it and return the BAM path.

    The BAM file has the same base name as the SAM file, with a ".bam"
    extension, and is written into the SAM file's directory.
    """
    sam_file = os.path.basename(sam_path)
    sam_base, sam_ext = os.path.splitext(sam_file)
    sam_dir = os.path.dirname(sam_path)
    bam_file = sam_base + '.bam'
    bam_path = os.path.join(sam_dir, bam_file)
    # Redirect stdout to the same path as -o. Using the bare file name here
    # would create or truncate a file of that name in the current working
    # directory instead of the SAM file's directory.
    pysam.view('-o', bam_path, '-bh', sam_path, save_stdout=bam_path)
    return bam_path
def writeSplitBam(two):
    """Write the reads of one chromosome to their own indexed BAM file.

    ``two`` is indexed as ``(chrom, new_bam)``; the reads come from the
    module-level ``bamname``. Returns ``chrom``.
    """
    chrom, new_bam = two[0], two[1]
    # The position itself is unused; the lookup is kept because it fails
    # for a chromosome that is not in ``chrs`` (presumably a list — verify).
    chrs.index(chrom)
    # Split into per-chromosome bam files
    view_args = (bamname, chrom, '-b', '-o', new_bam)
    pysam.view(*view_args, catch_stdout=False)
    pysam.index(new_bam)
    return (chrom)
def convert_bam_to_sam(in_file):
    """Convert a BAM file to SAM (header included) and return the SAM path.

    The output is ``in_file`` with its suffix replaced by ".sam". If that
    file already exists it is returned without converting again.

    Raises:
        ValueError: if ``in_file`` is not a BAM file.
    """
    if not is_bam(in_file):
        # The message names this function, not its SAM->BAM counterpart.
        raise ValueError("Non BAM file passed to convert_bam_to_sam: "
                         "%s" % (in_file))
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        pysam.view("-h", "-o" + tmp_out_file, in_file)
    return out_file
def compare_contents(file1, file2, ftype="bed"):
    """Return True if both files have the same content.

    With ``ftype == "bed"`` the files are compared line by line as text;
    for any other ``ftype`` their ``pysam.view`` outputs are compared.
    """
    if ftype != "bed":
        return pysam.view(file1) == pysam.view(file2)

    def read_lines(path):
        with open(path) as handle:
            return handle.readlines()

    return read_lines(file1) == read_lines(file2)
def bam2sam(in_file):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"

    An already existing output file is returned without converting again.
    """
    out_file = replace_suffix(in_file, ".sam")
    if not file_exists(out_file):
        with file_transaction(out_file) as tmp_out_file:
            output_option = "-o" + tmp_out_file
            pysam.view("-h", output_option, in_file)
    return out_file
def get_region(infile, ref_name, start, end, outfile):
    '''Writes BAM file of the given region

    ``start`` and ``end`` are each shifted by one to build the region
    string; reads matching flag 0x4 are left out.
    '''
    first, last = str(start + 1), str(end + 1)
    region = ref_name + ':' + first + '-' + last
    view_args = ('-b', '-F', '0x4', '-o', outfile, infile, region)
    pysam.view(*view_args, catch_stdout=False)
def get_region(infile, ref_name, start, end, outfile):
    """Writes BAM file of the given region

    The region string is built from ``start + 1`` and ``end + 1``; reads
    matching flag 0x4 are left out.
    """
    bounds = "-".join((str(start + 1), str(end + 1)))
    region = ref_name + ":" + bounds
    pysam.view(
        "-b", "-F", "0x4",
        "-o", outfile,
        infile, region,
        catch_stdout=False,
    )
def slice(self, region=None):
    """Return the ``samtools view -b -h`` output for ``self.filename``.

    Args:
        region: optional mapping with keys 'chr', 'start' and 'end'; when
            given, the view is limited to ``chr:start-end``.

    Returns:
        Whatever ``pysam.view`` returns for the assembled arguments.
    """
    args = ["-b", "-h", self.filename]
    if region:
        range_string = region['chr'] + ":" + str(
            region['start']) + "-" + str(region['end'])
        args.append(range_string)
    # samtools is run exactly once; the region string is only built when a
    # region was actually supplied.
    return pysam.view(*args)
def ngmlral():
    """Align the reads with ngmlr into ``<work_dir>/ngmlr_alignments``.

    If the sorted BAM for ``prefix`` does not exist yet, ngmlr is run:
    with statistics enabled its output is piped into ``sam_parser``,
    otherwise it writes a SAM file that is converted to BAM, sorted and
    indexed here. Finally the alignment directory is moved to
    ``<work_dir>/<out_dir>``.
    """
    ngmlr_ext_dir = os.path.join(work_dir, 'ngmlr_alignments')
    if not os.path.exists(ngmlr_ext_dir):
        os.makedirs(ngmlr_ext_dir)

    bam_ngmlr_file = os.path.join(ngmlr_ext_dir, (prefix + '.bam'))

    if not os.path.exists(bam_ngmlr_file):
        if stats_trigger in ['y', 'yes']:
            log.debug('[Alignment][ngmlral] - ngmlr -t %s -r %s -q %s -x ont'
                      % (th, ref, fast_Q_file))
            ngmlrline = subprocess.Popen([
                'ngmlr', '-t', str(th), '-r', str(ref), '-q',
                str(fast_Q_file), '-x ont'
            ], stdout=subprocess.PIPE)
            PairDict = sam_parser(ngmlrline, ngmlr_ext_dir)
        else:
            # One consistently spelled name for the intermediate SAM file.
            sam_ngmlr_file = os.path.join(ngmlr_ext_dir, (prefix + '.sam'))
            log.debug(
                '[Alignment][ngmlral] - ngmlr -t %s -r %s -q %s -o %s -x ont'
                % (th, ref, fast_Q_file, sam_ngmlr_file))
            ngmlrline = subprocess.Popen([
                'ngmlr', '-t', str(th), '-r', str(ref), '-q',
                str(fast_Q_file), '-o', str(sam_ngmlr_file), '-x ont'
            ], stdout=subprocess.PIPE).wait()

            outputfilebam = os.path.join(ngmlr_ext_dir, (prefix + '.tmp.bam'))
            log.debug(
                '[Alignment][ngmlral] - samtools view -Sb -@ %s %s -o %s'
                % (th, sam_ngmlr_file, outputfilebam))
            pysam.view("-Sb", "-@%s" % str(th), sam_ngmlr_file,
                       "-o%s" % outputfilebam, catch_stdout=False)
            os.remove(sam_ngmlr_file)

            pysam.sort(outputfilebam, "-o%s" % bam_ngmlr_file,
                       catch_stdout=False)
            log.debug('[Alignment][ngmlral] - samtools index %s -@%s'
                      % (bam_ngmlr_file, str(th)))
            pysam.index(bam_ngmlr_file, "-@%s" % str(th), catch_stdout=False)
            # The unsorted BAM is only an intermediate file.
            os.remove(outputfilebam)
    else:
        log.warning('[Alignment][ngmlral] - file %s already exists!'
                    % bam_ngmlr_file)

    try:
        shutil.move(ngmlr_ext_dir, os.path.join(work_dir, out_dir))
    except shutil.Error:
        log.error("Unable to move %s" % ngmlr_ext_dir)
def filterBam(bam_file, size_min, size_max, bam_filter_file, sample):
    """Keep only the reads whose read group is ``size<N>`` for N in
    [size_min, size_max], write them to ``bam_filter_file`` and index it.

    Args:
        bam_file: input BAM; indexed first if ``<bam_file>.bai`` is missing.
        size_min: smallest size to keep (converted with ``int``).
        size_max: largest size to keep, inclusive (converted with ``int``).
        bam_filter_file: path of the filtered BAM to write.
        sample: not used by this function.
    """
    if not Path(bam_file + ".bai").exists():
        pysam.index(bam_file)  # index the BAM if needed

    # Create the file listing the read groups to keep
    with NamedTemporaryFile(mode='w', delete=False) as fp:
        for i in range(int(size_min), int(size_max) + 1, 1):
            print(f"size{i}")
            fp.write(f"size{i}\n")

    # The list is closed (and therefore flushed) before samtools reads it.
    try:
        # Use pysam view to generate a BAM holding only the read groups
        # listed above
        pysam.view("-b", "-h", "-R", fp.name, "-o", bam_filter_file,
                   bam_file, catch_stdout=False)
    finally:
        # delete=False keeps the file past the with-block, so it has to be
        # removed explicitly.
        Path(fp.name).unlink()

    pysam.index(bam_filter_file)
def generate_sample_bams(n, filename_prefix, cycles, barcodes, barcode_len=8, gc_pos=0.7, gc_neg=0.3, length=250):
    """Generate the sample SAM files, then convert each one to BAM.

    One ``<filename_prefix><cycle>_<barcode>.bam`` is written for every
    combination of ``cycles`` and ``barcodes``; all other arguments are
    forwarded unchanged to ``generate_sample_sams``.
    """
    generate_sample_sams(n, filename_prefix, cycles, barcodes, barcode_len,
                         gc_pos, gc_neg, length)
    for cycle in cycles:
        for barcode in barcodes:
            stem = filename_prefix + "{}_{}".format(cycle, barcode)
            bam_name = stem + ".bam"
            sam_name = stem + ".sam"
            # create the file upfront, so pysam can open it
            with open(bam_name, 'w'):
                pass
            pysam.view("-bS", "-o", bam_name, sam_name, save_stdout=bam_name)
def good_header(self):
    """Check that the input file can be read and dump it as text.

    The header and the records of ``self.inputFilePath`` are written to
    ``<self.outputFileRoot>.header``.

    Returns:
        True on success; False (after reporting the error) if anything
        raised while reading or writing.
    """
    try:  # Test file integrity
        header = pysam.view("-H", self.inputFilePath)
        content = pysam.view(self.inputFilePath)
        # The with-block closes the file even if a write fails.
        with open(self.outputFileRoot + ".header", "w+") as outFile:
            outFile.write(''.join(header))
            outFile.write(''.join(content))
        return True
    except Exception as e:
        # Spelled so that it behaves the same under Python 2 and Python 3.
        print(str(e))
        sys.stderr.write("Cannot read binary header, please check BAM file.)\n")
        return False
def _bam_to_sam(local_name, temp_name):
    """Convert both BAM files to temporary SAM files (headers included).

    ``temp_name`` is deleted after conversion. Returns a tuple of the open
    ``NamedTemporaryFile`` holding the converted ``local_name`` and the
    path of the converted ``temp_name``.
    """
    temp_local = tempfile.NamedTemporaryFile(
        suffix='.sam', prefix='local_bam_converted_to_sam_')
    fd, temp_temp = tempfile.mkstemp(
        suffix='.sam', prefix='history_bam_converted_to_sam_')
    # Only the path is needed; samtools opens the file itself.
    os.close(fd)

    conversions = (
        (local_name, temp_local.name,
         "Converting local (test-data) BAM to SAM failed: %s"),
        (temp_name, temp_temp,
         "Converting history BAM to SAM failed: %s"),
    )
    for bam_path, sam_path, failure_template in conversions:
        try:
            pysam.view('-h', '-o%s' % sam_path, bam_path)
        except Exception as e:
            raise Exception(failure_template % e)

    os.remove(temp_name)
    return temp_local, temp_temp
def combine_samfiles(multi=False, clipped=False):
    """Combine the unclipped (and optionally clipped) SAM files into one
    name-sorted SAM file.

    Works on fixed file names in the current directory:
    ``unclipped_{multimap,unique}.sam`` and ``clipped_{multimap,unique}.sam``
    are combined into ``multi_mapped.sam`` or ``unique_mapped.sam``.
    The inputs and all intermediate files are removed afterwards.

    Args:
        multi: use the "multimap" files instead of the "unique" ones.
        clipped: also merge in the clipped SAM file, if it is not empty and
            can be converted.
    """
    kind = "multimap" if multi else "unique"
    sam1 = "unclipped_%s.sam" % kind
    sam2 = "clipped_%s.sam" % kind
    bam1 = "unclipped_%s.bam" % kind
    bam2 = "clipped_%s.bam" % kind
    out_name = "multi_mapped.sam" if multi else "unique_mapped.sam"

    def write_view(view_output, path, mode):
        # pysam.view may hand back one blob or a list of chunks.
        with open(path, mode) as handle:
            if isinstance(view_output, (bytes, str)):
                handle.write(view_output)
            else:
                handle.writelines(view_output)

    # Convert unclipped sam to bam (binary mode: the payload is BAM data)
    write_view(pysam.view("-bS", sam1), bam1, "wb")

    # Converts clipped sam to bam
    clipped_attempted = bool(clipped) and os.stat(sam2).st_size > 0
    have_clipped = False
    if clipped_attempted:
        try:
            write_view(pysam.view("-bS", sam2), bam2, "wb")
            have_clipped = True
        except Exception:
            print("Samtools raised error, will assume Sam file is empty!")

    if have_clipped:
        # Merge clipped and unclipped, then sort by read name
        pysam.merge("tmp1.bam", "-f", bam1, bam2)
        pysam.sort("-n", "tmp1.bam", "tmp2")
    else:
        # If no usable clipped bam, just sort. This branch guarantees that
        # tmp2.bam exists for the final conversion in every case.
        pysam.sort("-n", bam1, "tmp2")

    # Converts file to sam
    write_view(pysam.view("-h", "tmp2.bam"), out_name, "w")

    # Clean up the inputs and intermediates that actually exist.
    leftovers = ["tmp2.bam", "tmp1.bam", sam1, bam1]
    if clipped_attempted:
        leftovers += [sam2, bam2]
    for path in leftovers:
        if os.path.exists(path):
            os.remove(path)
def bed_from_sam(samIN, name):
    """Convert a SAM file into a sorted, indexed BAM and a 3-column text file.

    ``<name>.bam`` (plus its index) is created from ``samIN``, which is then
    deleted. For every read with ``mapq`` above the module-level ``mapQ``,
    a line "<first field of the read>\\t<reference_start>\\t<reference_end>"
    is written to ``name``.

    Args:
        samIN: input SAM path (removed after conversion).
        name: output path for the text file and prefix for the BAM file.
    """
    # The unsorted BAM goes to its own path. Writing it to ``name`` would
    # collide with the text output that is produced under that same name.
    unsorted_bam = name + ".unsorted.bam"
    pysam.view("-bS", "-o" + unsorted_bam, samIN)
    pysam.sort(unsorted_bam, name)
    os.remove(unsorted_bam)
    Bamname = name + ".bam"
    pysam.index(Bamname)
    # delete Sam file
    os.remove(samIN)

    bamfile = pysam.AlignmentFile(Bamname, "rb")
    try:
        with open(name, 'w') as bed_out:
            for read in bamfile.fetch():
                # this may eliminate reads that aligned more than once,
                # to count how many use the XS flag
                if read.mapq > mapQ:
                    line = "%s\t%i\t%i\n" % (str(read).split("\t")[0],
                                             read.reference_start,
                                             read.reference_end)
                    bed_out.write(line)
    finally:
        bamfile.close()
def check_inputs(file):
    """check that input files exist and identify appropriate naming
    convention for filtering reads by chromosome

    Returns:
        (samfile, head): the opened file and the chromosome name prefix,
        "chr" or "".

    Raises:
        IOError: if the file cannot be opened (after printing a notice).
    """
    try:
        samfile = pysam.Samfile(file, "rb")
    except IOError:
        print('file not found')
        # Without an open file there is nothing to return; re-raise instead
        # of running into an unbound ``samfile`` further down.
        raise

    try:
        pysam.view("-X", file, "chr11:1000-1100")
        head = "chr"
    except NameError:
        # NOTE(review): only NameError is taken to mean "no 'chr' prefix" —
        # confirm which exception the installed pysam raises for an unknown
        # contig.
        head = ""

    return samfile, head
def _get_sort_order(in_bam, config):
    """Return the ``SO`` value of the ``@HD`` header line of ``in_bam``.

    Args:
        in_bam: path of the BAM file whose header is read.
        config: not used by this function.

    Returns:
        The sort order string, or None if no ``@HD`` line carries an ``SO``
        field.
    """
    # splitlines() copes with "\n" as well as "\r\n" line endings, so the
    # @HD line is examined on its own, not together with the whole header.
    for line in pysam.view("-H", in_bam).splitlines():
        if not line.startswith("@HD"):
            continue
        for keyval in line.split()[1:]:
            # partition never raises, whatever the number of colons.
            key, _, val = keyval.partition(":")
            if key == "SO":
                return val
    return None
def test_bam_extract_01(self):
    """Run ``dr-disco bam-extract`` from the command line and compare the
    extracted reads, converted to SAM, with the reference SAM file."""
    TEST_DIR, T_TEST_DIR = self.__get_temp_dirs()

    input_file = TEST_DIR + "test_terg_02.bam"
    test_file = TEST_DIR + "test_terg_02.filtered.sam"
    output_file = T_TEST_DIR + "test_terg_02.filtered.bam"
    output_file_s = T_TEST_DIR + "test_terg_02.filtered.sam"

    command = ["bin/dr-disco",
               "bam-extract",
               "chr21:39000000-40000000",
               "chr5:1-2",
               output_file,
               input_file]
    self.assertEqual(subprocess.call(command), 0)

    # Bam2Sam
    with open(output_file_s, "w") as fhq:
        fhq.write(pysam.view(output_file))

    matches = filecmp.cmp(output_file_s, test_file)
    if not matches:
        # Show a ready-to-run diff command for the mismatch.
        print('diff \'' + output_file_s + '\' \'' + test_file + '\'')
    self.assertTrue(matches)
def test_01(self):
    """FlaiMapper must report exactly two fragments on SNORD78 at the
    expected start and stop positions (no padding)."""
    basename = 'multilength_fragments_per_position_001'
    bam_path = 'tmp/' + basename + '.bam'

    # Build the BAM fixture from the SAM test data on first use.
    if not os.path.exists(bam_path):
        with open(bam_path, "wb") as fhq:
            fhq.write(pysam.view('-bS', 'tests/data/' + basename + ".sam"))

    args = CLI([bam_path, '--verbose'])
    args.parameters.left_padding = 0
    args.parameters.right_padding = 0

    flaimapper = FlaiMapper(args)

    # Expected (start, stop) per result, in the order they are reported.
    expected = {0: (11, 11 + 61), 1: (44, 44 + 28)}
    i = 0
    for region in flaimapper.regions():
        self.assertEqual(region.region[0], 'SNORD78')
        offset = region.region[1]
        for result in region:
            if i in expected:
                exp_start, exp_stop = expected[i]
                self.assertEqual(offset + result.start, exp_start)
                self.assertEqual(offset + result.stop, exp_stop)
            i += 1

    self.assertEqual(i, 2)
def create_bam(filename):
    """
    Function that create a BAM file from a SAM file.
    Args :
        filename [STR] = SAM filename
    Returns:
        bamfile [STR] = BAM filename
    """
    sam_dir, sam_name = os.path.dirname(filename), os.path.basename(filename)
    # name of the bam file to create: the last three characters of both the
    # directory name and the file name are replaced by "bam"
    bamfile = sam_dir[:-3] + "bam/" + sam_name[:-3] + "bam"
    # convert sam to bam using pysam
    view_args = ('-Sb', filename, '-o', bamfile)
    pysam.view(*view_args, catch_stdout=False)
    return bamfile
def convert_sam_to_bam():
    """
    This method should take a newly create .sam file from alignment and
    - convert it to .bam
    - sort .bam
    - index .bam

    It runs over every id from ``generate_ids()``, working on
    ``<SAMPLE_DIR>/<id>/<id>-bwape.sam``.
    """
    for sample_id in generate_ids():
        start_time = time()
        print('converting: %s' % sample_id)

        base_path = os.path.join(SAMPLE_DIR, sample_id)
        sam_path = os.path.join(base_path, sample_id + '-bwape.sam')
        bam_path = os.path.join(base_path, sample_id + '-bwape.bam')

        bam_content = pysam.view('-bS', sam_path)
        # BAM is binary data: write it in binary mode.
        with open(bam_path, 'wb') as bam_file:
            if isinstance(bam_content, bytes):
                bam_file.write(bam_content)
            else:
                bam_file.writelines(bam_content)

        pysam.sort(bam_path, bam_path + '_sorted')
        pysam.index(bam_path + '_sorted.bam')

        # indexing creates file.bam.bam. Move it to file.bam
        # The moves use argument lists rather than shell strings, so paths
        # containing spaces or shell metacharacters are handled correctly.
        # NOTE(review): the index ends up as "<bam_path>.bam.bai" — confirm
        # that this is the name the downstream tools look for.
        subprocess.call(["mv", bam_path + '_sorted.bam', bam_path])
        subprocess.call(["mv", bam_path + '_sorted.bam.bai',
                         bam_path + '.bam.bai'])

        end_time = time()
        print('completed: %.3fs' % (end_time - start_time))
def convert_bam_bed(bam, name, paired, outdir):
    """Filter ``bam`` at mapping quality 50, convert the result to BED and
    return the number of reads kept.

    Writes ``<outdir>/<name>.unique.bam`` and ``<outdir>/<name>.BED``.

    Args:
        bam: input BAM path.
        name: base name of the output files.
        paired: if true, the returned count is divided by two.
        outdir: output directory.

    Returns:
        Number of reads passing the filter (halved when ``paired``).
    """
    print("==> Converting bam to bed...\n")
    unique_bam = "{}/{}.unique.bam".format(outdir, name)

    # Filters for uniquely aligned reads!
    filtered_bam = pysam.view("-bq 50", bam)
    # The with-block closes (and flushes) the BAM before pybedtools reads it.
    with open(unique_bam, "wb") as outbam:
        if isinstance(filtered_bam, bytes):
            outbam.write(filtered_bam)
        else:
            outbam.writelines(filtered_bam)

    # Let samtools count the reads; iterating over the binary output yields
    # pieces of compressed data, not reads.
    counted = pysam.view("-c", "-q", "50", bam)
    if isinstance(counted, bytes) and not isinstance(counted, str):
        counted = counted.decode()
    elif not isinstance(counted, str):
        counted = "".join(counted)
    count = int(counted.strip())

    inbam = pybedtools.BedTool(unique_bam)
    bed = inbam.bam_to_bed(split=True)
    bed.saveas("{}/{}.BED".format(outdir, name))

    if paired:
        count /= 2
    return count
def main(BAM):
    """Stream one BAM object from Swift through ``fline`` into HDFS.

    Two named pipes are created next to ``BAM``: one is filled by a
    ``swift download`` subprocess and read through ``pysam.view``, the other
    receives what ``fline`` writes and is drained by a ``hadoop fs -put``
    subprocess. Afterwards the pipes are removed and an empty file named
    ``BAM`` is left behind as a completion marker.

    NOTE(review): both commands are built by string concatenation and run
    with ``shell=True``, so ``BAM`` must be a trusted name.
    """
    # retrieve the region from the filename (fifth dot-separated field)
    geo=BAM.split('.')[4]
    # create two pipe files.
    bampipe = BAM.rsplit('.',1)[0] + '.pipe'
    bampipetohadoop = BAM.rsplit('.',1)[0] + '.hadooppipe'
    os.mkfifo(bampipe)
    os.mkfifo(bampipetohadoop)
    # Start 2 subprocesses, one to download the file from swift and one to pipe the filtered data to hdfs
    command = 'swift download GenomeData ' + BAM + ' -o - > ' + bampipe
    p = subprocess.Popen(command, shell=True)
    command2 = '/usr/local/hadoop/bin/hadoop fs -put -f - < ' + bampipetohadoop + ' /genome/' + BAM
    p2 = subprocess.Popen(command2, shell=True)
    # open the hadoop pipe
    f=open(bampipetohadoop,'w')
    # read the swift pipe
    rows = pysam.view('-B', bampipe)
    # call the filter function on every row
    for r in rows:
        fline(r,f,geo)
    f.close()
    # remove pipe files.
    os.remove(bampipe)
    os.remove(bampipetohadoop)
    # create an empty file so that the job can be restarted without processing all files again.
    open( BAM, 'w').close()
def sam_to_bam(self, samfile, bamfile):
    ''' samtools view -bS

    Convert ``samfile`` to BAM and write the result to ``bamfile``.
    '''
    bamout = pysam.view('-bS', samfile)
    # BAM is binary: keep it as bytes and write it in binary mode. The
    # output may be a single blob or a list of chunks.
    payload = bamout if isinstance(bamout, bytes) else b"".join(bamout)
    with open(bamfile, 'wb') as handle:
        handle.write(payload)
    return
def only_mapped(filename):
    """
    Function that keep only mapped reads in the BAM file
    Args:
        filename [STR] = BAM file, containing all alignments
    Returns:
        mappedfile [STR] = BAM file with only the mapped reads
    """
    directory, base_name = os.path.split(filename)
    # new bamfile name: the first seven characters of the file name are
    # replaced by "filtered_"
    mappedfile = '{0}/filtered_{1}'.format(directory, base_name[7:])
    # get only mapped reads (-F 4 drops reads flagged with 4)
    view_args = ('-b', '-F', '4', filename, '-o', mappedfile)
    pysam.view(*view_args, catch_stdout=False)
    return mappedfile
def convert_bam_bed(name, paired):
    """Return the number of reads in ``<name>.bam`` with mapping quality
    of at least 50.

    Args:
        name: BAM path without the ".bam" extension.
        paired: if true, the count is divided by two.
    """
    # Filters for uniquely aligned reads! samtools does the counting (-c);
    # iterating over binary BAM output yields pieces of compressed data,
    # not reads.
    counted = pysam.view("-c", "-q", "50", name + ".bam")
    if isinstance(counted, bytes) and not isinstance(counted, str):
        counted = counted.decode()
    elif not isinstance(counted, str):
        counted = "".join(counted)
    count = int(counted.strip())
    if paired:
        count /= 2
    return count
def testIterate(self):
    '''compare results from iterator with those from samtools.'''
    ps = list(self.samfile.fetch())
    sa = list(pysam.view("ex1.bam", raw=True))
    self.assertEqual(
        len(ps), len(sa),
        "unequal number of results: %i != %i" % (len(ps), len(sa)))
    # check if the same reads are returned
    for line, (read, sam_line) in enumerate(zip(ps, sa)):
        data = sam_line.split("\t")
        # The failure message reports the same attribute that is compared.
        self.assertEqual(
            read.qname, data[0],
            "read id mismatch in line %i: %s != %s"
            % (line, read.qname, data[0]))
def testReturnValueData(self):
    """``pysam.view`` with ``-O BAM`` must return bytes; on Python 3 the
    value must not be a str, on Python 2 it must be a basestring."""
    bam_path = os.path.join(DATADIR, "ex1.bam")
    args = "-O BAM {}".format(bam_path).split(" ")
    retval = pysam.view(*args)

    self.assertTrue(isinstance(retval, bytes))
    if IS_PYTHON3:
        self.assertFalse(isinstance(retval, str))
    else:
        self.assertTrue(isinstance(retval, basestring))
def testInput(self):
    '''Test the input file format, modify self.filepath and return bool
    Status: True:file is ready to use; False: wrong file, program stop

    A SAM input is converted to BAM, an unsorted input is sorted; in both
    cases ``self.filepath`` is pointed at the new file. The file left in
    ``self.filepath`` is indexed.

    Returns ``True`` on success and the tuple ``(False, "None")`` if no
    header can be read.
    '''
    # test if file has header
    try:
        self.header = pysam.view("-H", self.filepath)
    except Exception:
        try:
            self.header = pysam.view("-SH", self.filepath)
        except Exception:
            logging.error("Input file does not have header, please check your file. Program quit")
            return (False, "None")

    # Header test passed, test if it is BAM
    is_bam = True
    try:
        probe = gzip.open(self.filepath)
        try:
            probe.readline(10)
        finally:
            probe.close()
    except Exception:
        # cannot read line, should be sam
        is_bam = False

    if not is_bam:
        logging.info("Input is SAM, converting to BAM...")
        bamout = ".".join(self.filepath.split(".")[0:-1]) + "." + "bam"
        infile = pysam.Samfile(self.filepath, "r", header=self.header)
        outfile = pysam.Samfile(bamout, "wb", template=infile)
        try:
            for i in infile.fetch():
                outfile.write(i)
        finally:
            # Close before sorting or indexing, so that the BAM file is
            # completely written to disk.
            outfile.close()
            infile.close()
        self.filepath = bamout

    # Now the infile is BAM, check if it is sorted
    if Utils.is_sorted(self.header):
        pysam.index(self.filepath)
        return True

    # sort the BAM
    logging.info("Input is not sorted, sorting file...")
    bamsort = ".".join(self.filepath.split(".")[0:-1]) + "." + "sort"
    pysam.sort(self.filepath, bamsort)
    pysam.index(bamsort + ".bam")
    self.filepath = bamsort + ".bam"  # change input file path
    self.header = pysam.view("-H", bamsort + ".bam")
    logging.info("Input file sorted")
    return True
def sam_to_bam(self, sam_path, bam_path_prefix):
    """Turn ``sam_path`` into the sorted, indexed ``<bam_path_prefix>.bam``
    and delete the SAM file afterwards."""
    if self._sam_file_is_empty(sam_path) is True:
        # pysam will generate an error if an empty SAM file will
        # be converted. Due to this an empty bam file with the
        # same header information will be generated from scratch
        self._generate_empty_bam_file(sam_path, bam_path_prefix)
    else:
        unsorted_bam = self._temp_unsorted_bam_path(bam_path_prefix)
        sorted_bam = bam_path_prefix + ".bam"
        # Generate unsorted BAM file
        pysam.view("-Sb", "-o%s" % unsorted_bam, sam_path)
        # Generate sorted BAM file
        pysam.sort(unsorted_bam, "-o", sorted_bam)
        # Generate index for BAM file
        pysam.index("%s.bam" % bam_path_prefix)
        # Remove unsorted BAM file
        os.remove(unsorted_bam)
    # Remove SAM file (in both cases)
    os.remove(sam_path)
def SAM_to_BAM(samfile_name, bamfile_name):
    '''Converts a SAM file into an ordered and indexed BAM file.

    The intermediate unsorted BAM is written next to the SAM file as
    ``<samfile_name minus its last four characters>_unsorted.bam``.
    '''
    unsorted_path = samfile_name[:-4] + "_unsorted.bam"
    with open(unsorted_path, "wb") as unsorted_handle:
        unsorted_handle.write(pysam.view("-b", "-S", samfile_name))

    # pysam.sort is given the output name without the ".bam" extension.
    if bamfile_name.endswith(".bam"):
        sorted_prefix = bamfile_name[:-4]
    else:
        sorted_prefix = bamfile_name
    pysam.sort(unsorted_path, sorted_prefix)
    pysam.index(sorted_prefix + ".bam")
def test_02_e(self):
    """Extracting these two regions must give a BAM whose SAM view is
    empty."""
    input_file = TEST_DIR + "test_terg_02.bam"
    output_file = T_TEST_DIR + "test_terg_02.filtered.bam"
    output_file_s = T_TEST_DIR + "test_terg_02.filtered.sam"

    extractor = BAMExtract(input_file, False)
    extractor.extract("chr12:151000000-153000000", "chr5:1-2", output_file)

    # Bam2Sam
    with open(output_file_s, "w") as fhq:
        fhq.write(pysam.view(output_file))

    with open(output_file_s, "r") as fh:
        sam_text = fh.read()
    self.assertEqual(sam_text, "")  # empty file check
def test_02_b(self):
    """Extract with the two regions given in swapped order and compare the
    result, converted to SAM, with the reference SAM file."""
    input_file = TEST_DIR + "test_terg_02.bam"
    test_file = TEST_DIR + "test_terg_02.filtered.sam"
    output_file = T_TEST_DIR + "test_terg_02.filtered.bam"
    output_file_s = T_TEST_DIR + "test_terg_02.filtered.sam"

    extractor = BAMExtract(input_file, False)
    extractor.extract("chr5:1-2", "chr21:39000000-40000000", output_file)

    # Bam2Sam
    with open(output_file_s, "w") as fhq:
        fhq.write(pysam.view(output_file))

    matches = filecmp.cmp(output_file_s, test_file)
    if not matches:
        # Show a ready-to-run diff command for the mismatch.
        print('diff \'' + output_file_s + '\' \'' + test_file + '\'')
    self.assertTrue(matches)
def test_03(self):
    """Convert the chimeric alignment fixture and compare the result,
    viewed as SAM, with the expected fixed SAM file."""
    if not os.path.exists("tmp"):
        os.mkdir("tmp")

    input_file = "tests/fix-chimeric/test_terg_03.filtered.bam"
    test_file = "tests/fix-chimeric/test_terg_03.filtered.fixed.sam"
    output_file = "tmp/test_terg_03.filtered.fixed.bam"
    output_file_s = "tmp/test_terg_03.filtered.fixed.sam"

    alignment_handle = ChimericAlignment(input_file)
    alignment_handle.convert(output_file, "tmp")

    # Bam2Sam
    with open(output_file_s, "w") as fhq:
        fhq.write(pysam.view(output_file))

    identical = filecmp.cmp(test_file, output_file_s)
    # The diff output becomes part of the failure message.
    diff_process = subprocess.Popen(['diff', test_file, output_file_s],
                                    stdout=subprocess.PIPE)
    failure_message = ("diff '" + test_file + "' '" + output_file_s + "':\n"
                       + diff_process.stdout.read())
    self.assertTrue(identical, msg=failure_message)