def test_merge(self):
    test_file = os.path.abspath('test_data/test_1500_merged_reads.bam')
    output_filtered_forward, output_filtered_reverse = filter_reads(self.arguments)
    output_tempfile = tempfile.NamedTemporaryFile(prefix='test_filtered_merged_',
                                                  suffix='.bam',
                                                  delete=False,
                                                  dir=os.getcwd())
    output_tempfile.close()
    self.arguments.output = os.path.abspath(output_tempfile.name)
    merged_output = merge_bams(self.arguments, output_filtered_forward,
                               output_filtered_reverse)
    self.assertEqual(merged_output, 0)
    save = pysam.set_verbosity(0)
    test_out_fh = pysam.AlignmentFile(self.arguments.output, 'r')
    test_cmp_fh = pysam.AlignmentFile(test_file, 'r')
    pysam.set_verbosity(save)
    # Compare each read individually, since bellerophon.merge_bams()
    # adds a row to the @PG section of the SAM header, making the
    # checksums of the two files differ.
    for output_read, test_read in zip(test_out_fh, test_cmp_fh):
        self.assertEqual(output_read, test_read)
    test_out_fh.close()
    test_cmp_fh.close()
    os.unlink(self.arguments.output)
    self.assertFalse(os.path.exists(self.arguments.output))
def collapse_barcode(bam, out):
    logger.info(f'Deduplicating {bam} {size(bam)} by collapsing barcodes ...')
    verbosity = pysam.set_verbosity(0)
    with pysam.AlignmentFile(bam, 'rb') as b1, pysam.AlignmentFile(bam, 'rb') as b2:
        results = {}
        # Walk the name-sorted BAM as interleaved mate pairs: b1 yields the
        # even-indexed records, b2 the odd-indexed ones.
        for read1, read2 in zip(itertools.islice(b1, 0, None, 2),
                                itertools.islice(b2, 1, None, 2)):
            if read1.query_name != read2.query_name:
                raise ValueError(
                    f'Read names do not match: {read1.query_name} != {read2.query_name}.'
                )
            if read1.is_unmapped or read2.is_unmapped or read1.reference_name != read2.reference_name:
                continue
            if not read1.is_read1:
                read1, read2 = read2, read1
            randomer = read1.query_name.split(':')[0]
            start = read1.positions[-1] if read1.is_reverse else read1.pos
            stop = read2.positions[-1] if read2.is_reverse else read2.pos
            strand = '-' if read1.is_reverse else '+'
            location = (read1.reference_name, start, stop, strand, randomer)
            if location in results:
                continue
            results[location] = (read1, read2)
        with pysam.AlignmentFile(out, 'wb', template=b1) as o:
            for (read1, read2) in results.values():
                o.write(read1)
                o.write(read2)
    logger.info(f'Deduplicating {bam} {size(bam)} by collapsing barcodes complete.')
    pysam.set_verbosity(verbosity)
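# Both snippets above rely on the same idiom, repeated throughout this
# collection: pysam.set_verbosity() returns the previous htslib verbosity
# level, so the "missing index" warning can be silenced around an
# AlignmentFile open and then restored. A minimal self-contained sketch of
# the pattern (the helper name and path are hypothetical):
import pysam

def open_bam_quietly(path):
    save = pysam.set_verbosity(0)  # silence htslib warnings, remember old level
    try:
        return pysam.AlignmentFile(path, 'rb')
    finally:
        pysam.set_verbosity(save)  # restore the previous verbosity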
def run(args):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    if args.show_zmws:
        if [args.whitelist, args.blacklist, args.percentage].count(None) != 3:
            log.warning("Ignoring unused filtering arguments")
        show_zmws(args.input_bam)
        return 0
    try:
        return filter_reads(input_bam=args.input_bam,
                            output_bam=args.output_bam,
                            whitelist=args.whitelist,
                            blacklist=args.blacklist,
                            percentage=args.percentage,
                            count=args.count,
                            seed=args.seed,
                            ignore_metadata=args.ignore_metadata,
                            relative=args.relative,
                            anonymize=args.anonymize,
                            use_barcodes=args.barcodes,
                            sample_scraps=args.sample_scraps,
                            keep_original_uuid=args.keep_uuid,
                            use_subreads=args.subreads,
                            min_adapters=args.min_adapters)
    except UserError as e:
        log.error(str(e))
        return 1
def barcode_collapse(bam, output, debug=False):
    """
    Deduplicate a paired-end BAM by collapsing barcodes.

    :param bam: str, path to the input BAM file.
    :param output: str, path to the output BAM file.
    :param debug: bool, set to True to invoke debug mode.
    """
    it.info(f'Deduplicating {bam} by collapsing barcodes ...')
    pysam.set_verbosity(1 if debug else 0)
    with pysam.AlignmentFile(bam, 'rb') as b1, pysam.AlignmentFile(bam, 'rb') as b2:
        results = {}
        for read1, read2 in zip(itertools.islice(b1, 0, None, 2),
                                itertools.islice(b2, 1, None, 2)):
            if read1.query_name != read2.query_name:
                it.error_and_exit(
                    f'Read names do not match: {read1.query_name} != {read2.query_name}.')
            if read1.is_unmapped or read2.is_unmapped or read1.reference_name != read2.reference_name:
                continue
            if not read1.is_read1:
                read1, read2 = read2, read1
            randomer = read1.query_name.split(':')[0]
            start = read1.positions[-1] if read1.is_reverse else read1.pos
            stop = read2.positions[-1] if read2.is_reverse else read2.pos
            strand = '-' if read1.is_reverse else '+'
            location = (read1.reference_name, start, stop, strand, randomer)
            if location in results:
                continue
            results[location] = (read1, read2)
        with pysam.AlignmentFile(output, 'wb', template=b1) as o:
            for (read1, read2) in results.values():
                o.write(read1)
                o.write(read2)
    it.info(f'Deduplicating {bam} by collapsing barcodes complete.')
def run_consolidate(dataset_file, output_file, datastore_file,
                    consolidate, n_files,
                    consolidate_f=lambda ds: ds.consolidate):
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    datastore_files = []
    with openDataSet(dataset_file) as ds_in:
        if consolidate:
            if len(ds_in.toExternalFiles()) <= 0:
                raise ValueError(
                    "DataSet {} must contain one or more files!".format(dataset_file))
            new_resource_file = bam_of_dataset(output_file)
            consolidate_f(ds_in)(new_resource_file, numFiles=n_files, useTmp=False)
            # Always display the BAM/BAI if consolidation is enabled.
            # XXX There is no uniqueness constraint on the sourceId, but this
            # seems sloppy nonetheless - unfortunately I don't know how else
            # to make view rule whitelisting work.
            reads_name = get_reads_name(ds_in)
            for ext_res in ds_in.externalResources:
                if ext_res.resourceId.endswith(".bam"):
                    ds_file = DataStoreFile(ext_res.uniqueId,
                                            Constants.TOOL_ID + "-out-2",
                                            ext_res.metaType,
                                            ext_res.bam,
                                            name=reads_name,
                                            description=reads_name)
                    datastore_files.append(ds_file)
                    # Prevent duplicated index files from being added to the
                    # datastore, since a consolidated dataset may contain
                    # multiple indices pointing to the same physical file.
                    added_resources = set()
                    for index in ext_res.indices:
                        if (index.metaType in Constants.BAI_FILE_TYPES
                                and index.resourceId not in added_resources):
                            added_resources.add(index.resourceId)
                            ds_file = DataStoreFile(
                                index.uniqueId,
                                Constants.TOOL_ID + "-out-3",
                                index.metaType,
                                index.resourceId,
                                name="Index of {}".format(reads_name.lower()),
                                description="Index of {}".format(reads_name.lower()))
                            datastore_files.append(ds_file)
        ds_in.newUuid()
        ds_in.write(output_file)
    datastore = DataStore(datastore_files)
    datastore.write_json(datastore_file)
    return 0
def check_if_equal(bam_path, gbam_path, no_check_fields=None):
    # Avoid the mutable-default-argument pitfall.
    if no_check_fields is None:
        no_check_fields = []
    # Suppress warnings to work with BAM files without an index file.
    # https://github.com/pysam-developers/pysam/issues/939#issuecomment-669016051
    save = pysam.set_verbosity(0)
    bam_file = pysam.AlignmentFile(bam_path, "rb")
    pysam.set_verbosity(save)

    fields_to_check = [
        field for field in list(map(int, Fields)) if field not in no_check_fields
    ]
    gbam_file = get_reader(gbam_path, get_parsing_tmpl(fields_to_check))
    from gbam_tools import GbamRecord

    i = 0
    while True:
        cur_gbam = gbam_file.next_record()
        cur_bam = next(bam_file, None)
        if i > 0 and i % 100000 == 0:
            print('%d records are processed' % i)
        if cur_gbam is None or cur_bam is None:
            # Assert that neither file has records left over.
            assert cur_gbam == cur_bam
            break
        for field in fields_to_check:
            if field == Fields.REFID:
                assert cur_bam.reference_id == cur_gbam.refid
            if field == Fields.POS:
                assert cur_bam.reference_start == cur_gbam.pos
            if field == Fields.MAPQ:
                assert cur_bam.mapping_quality == cur_gbam.mapq
            if field == Fields.BIN:
                assert cur_bam.bin == cur_gbam.bin
            if field == Fields.FLAGS:
                assert cur_bam.flag == cur_gbam.flag
            if field == Fields.NEXTREFID:
                assert cur_bam.next_reference_id == cur_gbam.next_ref_id
            if field == Fields.NEXTPOS:
                assert cur_bam.next_reference_start == cur_gbam.next_pos
            if field == Fields.TLEN:
                assert cur_bam.template_length == cur_gbam.tlen
            if field == Fields.READNAME:
                assert list(bytearray(cur_bam.query_name, 'utf8')) == cur_gbam.read_name[:-1]
            if field == Fields.RAWCIGAR:
                assert cur_bam.cigarstring == cur_gbam.cigar
            if field == Fields.RAWSEQUENCE:
                assert cur_bam.query_sequence == cur_gbam.seq
            if field == Fields.RAWQUAL:
                assert cur_bam.query_qualities == array('B', cur_gbam.qual)
        i += 1
def get_align_file(path: str, mode='r', template=None, expectIndex=True, threads=1):
    hts_ext = os.path.splitext(path)[-1]
    xam_type = HTS_EXT_TO_AF_MODE[hts_ext]
    if expectIndex is False:
        save = pysam.set_verbosity(0)
    af = pysam.AlignmentFile(path, f'{mode}{xam_type}', template=template,
                             threads=threads)
    if expectIndex is False:
        pysam.set_verbosity(save)
    return af
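# HTS_EXT_TO_AF_MODE is defined elsewhere in the module that owns
# get_align_file(); a plausible sketch of it (an assumption, not the actual
# table) maps each extension to the pysam.AlignmentFile mode suffix:
#
#     HTS_EXT_TO_AF_MODE = {'.sam': '', '.bam': 'b', '.cram': 'c'}
#
# Example usage (hypothetical path): open an index-less BAM without
# triggering the "missing index" warning.
#
#     af = get_align_file('reads.unsorted.bam', expectIndex=False)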
def __init__(self, regex=r"[^\|]+", is_bam=True):
    self.regex = regex
    # Minimum percent identity to consider a read valid.
    self.min_identity = 0.95
    # BAM files without an index generate a warning on opening. Since we
    # don't need an index, setting the verbosity silences this message.
    pysam.set_verbosity(0)
    if is_bam:
        self.read_mode = "rb"
        self.write_mode = "wb"
    else:
        self.read_mode = "r"
        self.write_mode = "w"
def process_bamfile(alignment, min_qual, filtered_out):
    """Filter an alignment BAM file.

    Reads all the reads in the input BAM alignment file and keeps a read in
    the output if it is aligned with a mapping quality greater than the given
    threshold, saving only the columns ReadID, Contig, Position_start,
    Position_end and strand to save memory.

    Parameters:
    -----------
    alignment : str
        Path to the input temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    filtered_out : str
        Path to the output temporary tsv alignment.

    Returns:
    --------
    int:
        Number of reads aligned.
    """
    # Check the quality and status of each aligned fragment.
    aligned_reads = 0
    save = pysam.set_verbosity(0)
    temp_bam = pysam.AlignmentFile(alignment, "rb", check_sq=False)
    pysam.set_verbosity(save)
    with open(filtered_out, "a") as f:
        for r in temp_bam:
            # Check the mapping quality.
            if r.mapping_quality >= min_qual:
                # Check the mapping (only the 0 and 16 flags are kept).
                if r.flag == 0:
                    aligned_reads += 1
                    read = str(r.query_name + "\t" + r.reference_name + "\t" +
                               str(r.reference_start) + "\t" + "+" + "\n")
                    f.write(read)
                elif r.flag == 16:
                    aligned_reads += 1
                    read = str(r.query_name + "\t" + r.reference_name + "\t" +
                               str(r.reference_start) + "\t" + "-" + "\n")
                    f.write(read)
    temp_bam.close()
    return aligned_reads
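# Example invocation of process_bamfile() (paths are hypothetical): keep
# alignments with MAPQ >= 30 and append readID/contig/position/strand rows
# to a temporary tsv.
#
#     n_aligned = process_bamfile('tmp_alignment.bam', 30, 'tmp_filtered.tsv')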
def reheader_bam(bam_file_in, bam_file_out, biosample_name=None, library_name=None):
    """
    Write a new BAM file identical to the input except for substitution or
    addition of the SM and/or LB tags in the @RG header. If the tags are
    already present and current, no file will be written.

    :return: True if the header was changed, False if it is already current
    """
    # XXX https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)  # pylint: disable=no-member
    was_changed = False
    with pysam.AlignmentFile(bam_file_in, "rb", check_sq=False) as bam_in:  # pylint: disable=no-member
        header = dict(bam_in.header)
        for rg in header["RG"]:
            if biosample_name:
                if rg.get("SM", None) != biosample_name:
                    was_changed = True
                    rg["SM"] = biosample_name
            if library_name:
                if rg.get("LB", None) != library_name:
                    was_changed = True
                    rg["LB"] = library_name
        if not was_changed:
            return False
        log.debug("Writing modified header and records to %s", bam_file_out)
        with pysam.AlignmentFile(bam_file_out,  # pylint: disable=no-member
                                 "wb",
                                 header=header) as bam_out:
            for rec in bam_in:
                bam_out.write(rec)
    log.debug("Running samtools index and pbindex")
    subprocess.check_call(["samtools", "index", bam_file_out])
    subprocess.check_call(["pbindex", bam_file_out])
    return True
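# Example usage of reheader_bam() (paths and names are hypothetical). The
# function is a no-op, returning False, when the @RG tags are already current:
#
#     changed = reheader_bam('in.bam', 'out.bam',
#                            biosample_name='sample_A', library_name='lib_1')
#     if not changed:
#         pass  # header already current; no output file was written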
def main():
    parser = argparse.ArgumentParser(description='Shard .bam file using the .pbi index',
                                     prog='shard_bam')
    parser.add_argument('-p', '--prefix', type=str, default="shard",
                        help="Shard filename prefix")
    parser.add_argument('-n', '--num_shards', type=int, default=4,
                        help="Number of shards")
    parser.add_argument('-t', '--num_threads', type=int, default=2,
                        help="Number of threads to use during sharding")
    parser.add_argument('-x', '--exclude', type=str,
                        help='Comma-separated list of tags to exclude '
                             '(note: removing ip and pw tags will break ccs)')
    parser.add_argument('-i', '--index', type=str, required=False,
                        help="PBI index filename")
    parser.add_argument('bam', type=str, help="BAM")
    args = parser.parse_args()

    pbi = args.bam + ".pbi" if args.index is None else args.index

    # Silence message about the .bai file not being found.
    pysam.set_verbosity(0)

    # Decode the PacBio .pbi file and determine the shard offsets.
    print(f"Reading index ({pbi}). This may take a few minutes...", flush=True)
    offsets, zmw_counts, read_count = compute_shard_offsets(pbi, args.num_shards)

    # Prepare a function with arguments partially filled in.
    tags_to_exclude = [] if args.exclude is None else args.exclude.split(",")
    func = partial(write_shard, args.bam, offsets, zmw_counts, tags_to_exclude,
                   args.prefix)
    idx = list(range(0, len(offsets) - 1))

    # Write the shards using the specified number of threads.
    print(f"Writing {len(idx)} shards using {args.num_threads} threads...", flush=True)
    res = ThreadPool(args.num_threads).imap_unordered(func, idx)

    # Emit final stats on the sharding.
    all_num_reads_written = list(res)
    count = 0
    for i in range(len(all_num_reads_written)):
        count += all_num_reads_written[i]
        print(f' - wrote {all_num_reads_written[i]} reads to {args.prefix}{i}.bam', flush=True)
    print(f'Sharded {count}/{read_count} reads across {len(idx)} shards.', flush=True)
def RTag(sli, c):
    '''
    Add CL/HP tags to BAMs upon request (slows things down a bit).
    '''
    for s in sli:
        save = pysam.set_verbosity(0)
        bamfilein = pysam.AlignmentFile(s, mode='rb', require_index=False)
        pysam.set_verbosity(save)
        with pysam.AlignmentFile(s + '.tmp', mode='wb', template=bamfilein) as bamfileout:
            for r in bamfilein.fetch(until_eof=True):
                r.set_tag('CL', c.clonenumber, 'i')
                r.set_tag('HP', c.hapnumber, 'i')
                bamfileout.write(r)
        bamfilein.close()
        os.remove(s)
        os.rename(s + '.tmp', s)
def main():
    parser = argparse.ArgumentParser(
        description='Reset base qualities of reads in the CLR bam to the requested Phred base quality',
        prog='reset_clr_bam_bq')
    parser.add_argument('-q', '--basequal', type=int, default=10,
                        help="Desired Phred base quality")
    parser.add_argument('-p', '--prefix', type=str, default="barbequed",
                        help="Shard filename prefix")
    parser.add_argument('bam', type=str, help="BAM")
    args = parser.parse_args()

    # Silence message about the .bai file not being found.
    pysam.set_verbosity(0)

    if args.basequal < 0 or (args.basequal > 60 and args.basequal != 255):
        raise ValueError(f"Requested BQ value {args.basequal} isn't valid.")

    # https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.query_qualities
    # query_qualities takes raw Phred scores, so no +33 ASCII offset is applied.
    print(f"Setting base qualities to ASCII {str(chr(args.basequal + 33))}.")

    bf = pysam.Samfile(args.bam, 'rb', check_sq=False)
    with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out:
        for read in bf:
            sausage = copy.deepcopy(read)
            n = len(sausage.query_sequence)
            sausage.query_qualities = [args.basequal] * n
            out.write(sausage)
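# A quick check of the no-offset convention used above: pysam stores
# query_qualities as integer Phred scores, and the +33 ASCII offset only
# appears in text (FASTQ/SAM) output. A sketch, assuming `read` is any
# AlignedSegment with a sequence:
#
#     read.query_qualities = [10] * len(read.query_sequence)  # Phred 10
#     pysam.qualities_to_qualitystring(read.query_qualities)  # '+' * n, i.e. chr(10 + 33)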
from cdispyutils.hmac4 import get_auth
import subprocess
import glob
import os
import sys
import requests
import json
import pysam
import numpy as np
import matplotlib.pyplot as plt
from operator import add

pysam.set_verbosity(0)

auth = ''

main_header_order = [
    'Sample', 'VCF File', 'Expectations', 'True-Positive', 'False-Positive',
    'Sensitivity', 'Specificity'
]

data_types = {
    'VCF': 'submitted_somatic_mutations',
    'FASTQ': 'submitted_unaligned_reads_files',
    'BAM': 'submitted_aligned_reads_files',
    'CNV': 'submitted_copy_number_files'
}

metadata_types = {'METADATA': 'experiment_metadata_files'}
def process_bwa_bamfile(alignment, min_qual, contig_data, out_file):
    """Filter an alignment BAM file and write the pairs.

    Reads all the reads in the input BAM alignment file and keeps a pair in
    the output if both reads are aligned with a mapping quality greater than
    the given threshold, saving only the columns ReadID, Contig,
    Position_start, Position_end and strand to save memory.

    Parameters:
    -----------
    alignment : str
        Path to the input temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    contig_data : dict
        Dictionary of all the contigs from the assembly. The contig names are
        the keys to the contig data, available under the following keys:
        "id", "length", "GC", "hit", "coverage". Coverage is still 0 and
        needs to be updated later.
    out_file : str
        Path to the output pairs file.

    Returns:
    --------
    int:
        Number of pairs aligned.
    """
    # Read the bam file.
    n_pairs = 0
    save = pysam.set_verbosity(0)
    temp_bam = pysam.AlignmentFile(alignment, "rb", check_sq=False)
    pysam.set_verbosity(save)

    with open(out_file, "w") as merged:
        # Write the header of the pairs file.
        merged.write("## pairs format v1.0\n")
        merged.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n")
        merged.write("#sorted: readID\n")
        merged.write("#shape: upper triangle\n")
        for contig in contig_data:
            merged.write("#chromsize: {0} {1}\n".format(
                contig, contig_data[contig]["length"]))

        # Loop until the end of the file. Read the reads two at a time, as
        # the forward and reverse reads should be interleaved.
        while n_pairs >= 0:
            try:
                for_read = next(temp_bam)
                while for_read.is_supplementary:
                    for_read = next(temp_bam)
                rev_read = next(temp_bam)
                while rev_read.is_supplementary:
                    rev_read = next(temp_bam)

                # Check the mapping quality.
                if (for_read.mapping_quality >= min_qual
                        and rev_read.mapping_quality >= min_qual):

                    # Check the flags.
                    if not (for_read.is_unmapped or rev_read.is_unmapped):
                        n_pairs += 1

                        # Safety check (forward and reverse are the same read).
                        if for_read.query_name != rev_read.query_name:
                            logger.error(
                                "Reads should be paired - %s\t%s",
                                for_read.query_name,
                                rev_read.query_name,
                            )
                            raise ValueError

                        # Define the pair values.
                        name = for_read.query_name
                        contig1 = for_read.reference_name
                        contig2 = rev_read.reference_name
                        pos1 = for_read.pos + 1
                        pos2 = rev_read.pos + 1
                        strand1 = "+"
                        strand2 = "+"
                        if for_read.is_reverse:
                            strand1 = "-"
                        if rev_read.is_reverse:
                            strand2 = "-"

                        # Modify the order to have an upper triangle and
                        # write the pair.
                        if (contig1 == contig2 and pos1 <= pos2) or \
                                contig_data[contig1]["id"] < contig_data[contig2]["id"]:
                            merged.write("\t".join([
                                name,
                                contig1,
                                str(pos1),
                                contig2,
                                str(pos2),
                                strand1,
                                strand2,
                            ]) + "\n")
                        else:
                            merged.write("\t".join([
                                name,
                                contig2,
                                str(pos2),
                                contig1,
                                str(pos1),
                                strand2,
                                strand1,
                            ]) + "\n")

            # Exit the loop when there are no more reads.
            except StopIteration:
                break

    # Close the bam file and return the number of pairs.
    temp_bam.close()
    return n_pairs
def StrandSim(w, c):
    '''
    Perform the first part of strand-seq simulations and re-align to the
    original haplotype.
    '''
    hfa = pyfaidx.Fasta(c.ffile)
    if w.chrom not in hfa.keys():
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Warning] Chromosome ' + w.chrom + ' not found in ' + c.ffile + '. Skipped simulation')
    else:
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Preparing simulation from ' + c.ffile + '. Haplotype ' + str(c.hapnumber))
        chr_ = hfa[w.chrom]
        seq_ = chr_[w.start - 1:w.end].seq
        tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
        region = w.chrom + '_' + str(w.start) + '_' + str(w.end)
        with open(tmpfa, 'w') as tmpfout:  # write a temporary fa for sampling reads
            tmpfout.write('>' + region + '\n' + '\n'.join(re.findall('.{1,60}', seq_)) + '\n')
        Ns = seq_.count('N')  # normalize coverage on Ns
        Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) / 2)  # for paired-end sequencing
        mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
        mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')
        hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulated coverage for this region will be ' + str(hapcov))
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Simulating')
        wgsim.core(r1=mate1h, r2=mate2h, ref=tmpfa, err_rate=c.error,
                   mut_rate=c.mutation, indel_frac=c.indels,
                   indel_ext=c.extindels, N=Nreads, dist=c.distance,
                   stdev=c.stdev, size_l=c.length, size_r=c.length,
                   max_n=0.05, is_hap=0, is_fixed=0, seed=0)
        os.remove(tmpfa)
        mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
        mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
        with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:
            for (name1, seq1, qual1), (name2, seq2, qual2) in zip(mp.fastx_read(mate1h), mp.fastx_read(mate2h)):
                # change name1/name2
                newname1 = '@c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_' + name1
                newname2 = '@c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_' + name2
                read1 = [newname1, seq1, '+', qual1]
                read2 = [newname2, seq2, '+', qual2]
                out1.write('\n'.join(read1) + '\n')
                out2.write('\n'.join(read2) + '\n')
        os.remove(mate1h)
        os.remove(mate2h)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Mapping simulated reads to the corresponding haplotype')
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), c.ffile, mate1hnew, mate2hnew]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(mate1hnew)
        os.remove(mate2hnew)
        # Re-parse the BAM file to keep only Watson/Crick reads.
        # Watson reads: read1 forward, read2 reverse.
        # Crick reads: read2 forward, read1 reverse.
        ivf = None
        if len(c.sce_bedregion) != 0:
            sce_string = ''
            for s in c.sce_bedregion:
                if s[3] == c.cellid and s[4] == c.hapid:
                    sce_string += s.chrom + '\t' + str(s.start) + '\t' + str(s.end) + '\n'
            if sce_string != '':
                sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(), from_string=True)
                ivf = sce_fromscratch.as_intervalfile()  # intervals where to perform SCE events
                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print('[' + now + '][Message] Detected one or more SCE events for the current cell/haplotype')
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')
        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(BAM, 'rb', require_index=False)  # until-eof consumes the bamfile
        pysam.set_verbosity(save)
        Wreads = list(WR(bamstrand, ivf))
        bamstrand.close()
        save = pysam.set_verbosity(0)
        bamstrand = pysam.AlignmentFile(BAM, 'rb', require_index=False)  # re-open for the second round
        pysam.set_verbosity(save)
        Creads = list(CR(bamstrand, ivf))
        bamstrand.close()
        os.remove(BAM)
        if c.noise > 0:
            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Message] Adding noise to strands')
            CtoW = random.sample(Creads, round(len(Wreads) / 100 * c.noise))
            Wreads += CtoW
            WtoC = random.sample(Wreads, round(len(Creads) / 100 * c.noise))
            Creads += WtoC
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Writing Watson and Crick FASTQ')
        w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
        w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')
        c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
        c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')
        with open(w1, 'w') as wout1, open(w2, 'w') as wout2:
            for r1, r2 in Wreads:
                if r1.get_tag('OS') == 'W':  # this is a true Watson read
                    read1 = ['@' + r1.query_name, r1.query_sequence, '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, mp.revcomp(r2.query_sequence), '+', '2' * c.length]
                else:  # written to Watson, but is Crick
                    read1 = ['@' + r1.query_name, mp.revcomp(r1.query_sequence), '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, r2.query_sequence, '+', '2' * c.length]
                wout1.write('\n'.join(read1) + '\n')
                wout2.write('\n'.join(read2) + '\n')
        with open(c1, 'w') as cout1, open(c2, 'w') as cout2:
            for r1, r2 in Creads:
                if r1.get_tag('OS') == 'C':  # this is a true Crick read
                    read1 = ['@' + r1.query_name, mp.revcomp(r1.query_sequence), '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, r2.query_sequence, '+', '2' * c.length]
                else:  # written to Crick, but is Watson
                    read1 = ['@' + r1.query_name, r1.query_sequence, '+', '2' * c.length]
                    read2 = ['@' + r2.query_name, mp.revcomp(r2.query_sequence), '+', '2' * c.length]
                cout1.write('\n'.join(read1) + '\n')
                cout2.write('\n'.join(read2) + '\n')
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Mapping Watson and Crick reads to the original reference')
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.W.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand',
                   c.REF, w1, w2]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(w1)
        os.remove(w2)
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.C.srt.bam')
        sam_cmd = ['minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
                   '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand',
                   c.REF, c1, c2]
        bam_cmd = ['samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM]
        p1 = subprocess.Popen(sam_cmd, stderr=open(os.devnull, 'wb'), stdout=subprocess.PIPE)
        bout = open(BAM, 'wb')
        p2 = subprocess.run(bam_cmd, stdin=p1.stdout, stderr=open(os.devnull, 'wb'), stdout=bout)
        bout.close()
        os.remove(c1)
        os.remove(c2)
def categorize_outcomes(self, max_reads=None):
    # Record how long each categorization takes.
    times_taken = []

    if self.fns['outcomes_dir'].is_dir():
        shutil.rmtree(str(self.fns['outcomes_dir']))
    self.fns['outcomes_dir'].mkdir()

    outcome_to_qnames = defaultdict(list)
    bam_read_type = 'nonredundant'

    # iter wrap since tqdm objects are not iterators
    alignment_groups = iter(self.alignment_groups())
    if max_reads is not None:
        alignment_groups = itertools.islice(alignment_groups, max_reads)

    special_als = defaultdict(list)

    with self.fns['outcome_list'].open('w') as outcome_fh:
        for name, als in self.progress(alignment_groups, desc='Categorizing reads'):
            seq = als[0].get_forward_sequence()
            # Special handling of empty sequence.
            if seq is None:
                seq = ''
            if seq in self.seq_to_outcome:
                layout = self.seq_to_outcome[seq]
                layout.query_name = name
            else:
                layout = self.categorizer(als, self.target_info,
                                          error_corrected=self.has_UMIs,
                                          mode=self.layout_mode)
                try:
                    layout.categorize()
                except:
                    print()
                    print(self.sample_name, name)
                    raise
            if layout.special_alignment is not None:
                special_als[layout.category, layout.subcategory].append(layout.special_alignment)
            outcome_to_qnames[layout.category, layout.subcategory].append(name)
            outcome = self.final_Outcome.from_layout(layout)
            outcome_fh.write(f'{outcome}\n')
            times_taken.append(time.monotonic())

    # To make plotting easier, for each outcome, make a file listing all of
    # the qnames for the outcome and a bam file (sorted by name) with all of
    # the alignments for these qnames.
    qname_to_outcome = {}
    bam_fn = self.fns_by_read_type['bam_by_name'][bam_read_type]
    header = sam.get_header(bam_fn)
    alignment_sorters = sam.multiple_AlignmentSorters(header, by_name=True)

    for outcome, qnames in outcome_to_qnames.items():
        outcome_fns = self.outcome_fns(outcome)
        outcome_fns['dir'].mkdir()
        alignment_sorters[outcome] = outcome_fns['bam_by_name'][bam_read_type]
        with outcome_fns['query_names'].open('w') as fh:
            for qname in qnames:
                qname_to_outcome[qname] = outcome
                fh.write(qname + '\n')

    with alignment_sorters:
        saved_verbosity = pysam.set_verbosity(0)
        with pysam.AlignmentFile(bam_fn) as full_bam_fh:
            for al in self.progress(full_bam_fh, desc='Making outcome-specific bams'):
                if al.query_name in qname_to_outcome:
                    outcome = qname_to_outcome[al.query_name]
                    alignment_sorters[outcome].write(al)
        pysam.set_verbosity(saved_verbosity)

    # Make special alignments bams.
    for outcome, als in self.progress(special_als.items(),
                                      desc='Making special alignments bams'):
        outcome_fns = self.outcome_fns(outcome)
        bam_fn = outcome_fns['special_alignments']
        sorter = sam.AlignmentSorter(bam_fn, header)
        with sorter:
            for al in als:
                sorter.write(al)

    return np.array(times_taken)
def open_bam(*args, **kwargs):
    # https://github.com/pysam-developers/pysam/issues/939
    pysam.set_verbosity(0)
    return AlignmentFile(*args, **kwargs)
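# Note that open_bam() does not capture the previous verbosity, so it leaves
# htslib verbosity at 0 for the remainder of the process; see the
# save/restore sketch near the top for the alternative. Example usage
# (hypothetical path); open_bam accepts the same arguments as
# pysam.AlignmentFile:
#
#     bam = open_bam('aligned.bam', 'rb')
#     for read in bam.fetch(until_eof=True):
#         ...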
def code_block(input_bam: Bam, tags: Optional[String], out_bam: String):
    from sys import exit, stderr
    import pysam

    def commaSepList(inputStr):
        tmpList = inputStr.split(",")
        if len(tmpList) == 0:
            print("No input tags provided")
            exit(1)
        else:
            tmpList = [s.strip() for s in tmpList]
        return tmpList

    if not tags:
        tags = "ZA,ZB,RX,QX"
    tags = commaSepList(tags)

    # Work around htslib errors that don't impact the calculation.
    save = pysam.set_verbosity(0)
    bamfh = pysam.AlignmentFile(input_bam, "rb")
    pysam.set_verbosity(save)

    umi_dict = dict()
    outfh = pysam.AlignmentFile(out_bam, "wb", template=bamfh)
    readcount = 0
    for read in bamfh:
        readcount += 1
        if read.query_name not in umi_dict:
            if sum([read.has_tag(t) for t in tags]) == len(tags):
                if read.has_tag("XA"):
                    allTags = read.get_tags()
                    relTags = [(t, v) for t, v in allTags if t in tags]
                    umi_dict.update({read.query_name: relTags})
                outfh.write(read)
            else:
                missingTag = [t for t in tags if not read.has_tag(t)]
                missingTagStr = ", ".join(missingTag)
                errStr = "".join([missingTagStr, " is missing for read ",
                                  read.query_name, "\n"])
                stderr.write(errStr)
        elif read.query_name in umi_dict:
            msg = "Accessing umi_dict\n"
            stderr.write(msg)
            if not read.has_tag("RX"):
                curTags = read.get_tags()
                curTags.extend(umi_dict[read.query_name])
                read.set_tags(curTags)
                outfh.write(read)
            else:
                outfh.write(read)
        if readcount % 100000 == 0:
            msg = str(readcount) + " reads processed\n"
            stderr.write(msg)
    outfh.close()
    bamfh.close()
    return {"out": out_bam}
def main():
    parser = argparse.ArgumentParser(
        description='Remove redundant alignment records from ONT BAM file',
        prog='remove_redundant_reads')
    parser.add_argument('-p', '--prefix', type=str, default="shard",
                        help="Output prefix")
    parser.add_argument('-a', '--annotations', type=str,
                        help="Annotations on (potential) duplicate reads")
    parser.add_argument('bam', type=str, help="BAM")
    args = parser.parse_args()

    # Create a dict of sets, keyed by chromosome, to avoid hash collisions
    # between reads on different chromosomes.
    guilty_dict_per_chr = dict()
    with open(args.annotations) as f:
        for line in f:
            arr = line.strip().split('\t')
            name = arr[0]
            chrom = arr[2]
            guilty_dict_per_chr.setdefault(chrom, set())
            guilty_dict_per_chr[chrom].add(name)

    # Silence message about the .bai file not being found.
    pysam.set_verbosity(0)

    num_alignments, num_dropped_alignments = 0, 0
    bf = pysam.Samfile(args.bam, 'rb', check_sq=False)
    with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out:
        # We rely on the observation that in a coordinate-sorted BAM,
        # duplicate records appear in blocks, hence once we step off a
        # position with duplicates, we start afresh.
        current_position = -1
        current_signatures = set()
        for read in bf:
            num_alignments += 1
            chrom = read.reference_name
            n = read.query_name
            # Guard against chromosomes with no annotated duplicates.
            if n in guilty_dict_per_chr.get(chrom, set()):
                mq = read.mapping_quality
                sam_flag = read.flag
                pos = read.reference_start
                signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-"
                if current_position != pos:
                    # New position: write the read and reset the signatures.
                    out.write(read)
                    current_position = pos
                    current_signatures = set()
                    current_signatures.add(signature)
                elif signature in current_signatures:
                    # A duplicate of a record already written: drop it.
                    num_dropped_alignments += 1
                else:
                    # A new group of duplicates that map to this location.
                    out.write(read)
                    current_signatures.add(signature)
            else:
                out.write(read)

    print(f'num_alignments: {num_alignments}')
    print(f'num_dropped_alignments: {num_dropped_alignments}')
    print(f'num_kept_alignments: {num_alignments - num_dropped_alignments}')
node = graph.add_vertex()
v_id[node] = "{idx}_{sample}".format(idx=0, sample=sample)
v_name[node] = ""
v_seq[node] = ""
v_q_qual[node] = ""

# add reads as vertices of the graph
if reads.endswith(".gz"):
    with gzip.open(reads, "rt") as _reads:
        graph = graph_operations.set_nodes(graph, _reads, format, sample)
else:
    with open(reads, "r") as _reads:
        graph = graph_operations.set_nodes(graph, _reads, format, sample)

# add edges from all-vs-all alignment of reads (please see rule minimap2)
verbose = pysam.set_verbosity(0)  # https://github.com/pysam-developers/pysam/issues/939
bam = pysam.AlignmentFile(bam, "rb")
pysam.set_verbosity(verbose)
for read in bam.fetch(until_eof=True):
    graph = graph_operations.set_edges(graph, read, threshold)
bam.close()
graph.remove_vertex(0)

# write log files
sys.stderr.write("graph construction summary for sample {}:"
                 "\n nodes:\t{}\n edges:\t{}\n".format(sample,
                                                       graph.num_vertices(),
                                                       graph.num_edges()))

graph_operations.save_and_draw_graph(graph, xml_out=graph_xml,
def merge_bams(args, filtered_forward, filtered_reverse):
    previous = None
    save = pysam.set_verbosity(0)
    forward = pysam.AlignmentFile(filtered_forward, 'r', threads=args.threads)
    reverse = pysam.AlignmentFile(filtered_reverse, 'r', threads=args.threads)
    pysam.set_verbosity(save)
    new_header = OrderedDict(forward.header)
    if 'PG' in new_header:
        last_pg = new_header['PG'][-1]
        previous = last_pg['ID']
    command = 'bellerophon --forward %s --reverse %s --output %s --quality %s' % \
        (os.path.split(args.forward)[-1], os.path.split(args.reverse)[-1],
         os.path.split(args.output)[-1], args.quality)
    new_pg = dict(ID=__name__, PN=__name__, PP=None, VN=__version__,
                  CL=command, DS=__description__)
    if previous is not None:
        new_pg['PP'] = previous
        new_pg = new_header['PG'] + [OrderedDict(new_pg)]
    else:
        new_pg = new_header['PG'] + [
            OrderedDict(ID=__name__, PN=__name__, VN=__version__,
                        CL=command, DS=__description__)
        ]
    new_header['PG'] = new_pg
    output_fh = pysam.AlignmentFile(
        args.output, 'wb', header=pysam.AlignmentHeader.from_dict(new_header))
    processed_reads = 0
    mismatched_reads = 0
    unmapped_reads = 0
    low_quality_reads = 0
    starttime = time.time()
    for forward_read, reverse_read in zip(forward, reverse):
        proper_pairs = 0
        # Skip pairs whose names don't match, that are unmapped, or whose
        # mapping quality is below --quality.
        if forward_read.query_name != reverse_read.query_name:
            mismatched_reads += 1
            continue
        if forward_read.is_unmapped or reverse_read.is_unmapped:
            unmapped_reads += 1
            continue
        if forward_read.mapping_quality < args.quality or reverse_read.mapping_quality < args.quality:
            low_quality_reads += 1
            continue
        if not (forward_read.is_unmapped or reverse_read.is_unmapped):
            proper_pairs = 1
            # Recompute the distances and lengths, since they may be off now.
            if forward_read.reference_id == reverse_read.reference_id:
                distance = abs(forward_read.reference_start - reverse_read.reference_start)
                if forward_read.reference_start >= reverse_read.reference_start:
                    forward_length = -1 * distance
                    reverse_length = distance
                else:
                    forward_length = distance
                    reverse_length = -1 * distance
            else:
                forward_length = 0
                reverse_length = 0
        else:
            proper_pairs = 0
            forward_length = 0
            reverse_length = 0
        # Zero the right flags for the forward and reverse reads.
        forward_read.is_secondary = 0
        reverse_read.is_secondary = 0
        forward_read.is_unmapped = 0
        reverse_read.is_unmapped = 0
        forward_read.is_supplementary = 0
        reverse_read.is_supplementary = 0
        # Make sure each one has the right flag for read number.
        forward_read.is_read1 = 1
        reverse_read.is_read2 = 1
        reverse_read.is_read1 = 0
        forward_read.is_read2 = 0
        # Swap the mapped and reverse attributes between reads.
        reverse_is_unmapped = reverse_read.is_unmapped
        forward_is_unmapped = forward_read.is_unmapped
        reverse_is_reverse = reverse_read.is_reverse
        forward_is_reverse = forward_read.is_reverse
        reverse_read.is_unmapped = forward_is_unmapped
        forward_read.is_unmapped = reverse_is_unmapped
        forward_read.mate_is_unmapped = forward_is_unmapped
        reverse_read.mate_is_unmapped = reverse_is_unmapped
        forward_read.mate_is_reverse = reverse_is_reverse
        reverse_read.mate_is_reverse = forward_is_reverse
        # Set them to paired and properly paired.
        forward_read.is_proper_pair = proper_pairs
        reverse_read.is_proper_pair = proper_pairs
        forward_read.is_paired = 1
        reverse_read.is_paired = 1
        # Set the next reference for the reads to each other.
        reverse_read.next_reference_id = forward_read.reference_id
        forward_read.next_reference_id = reverse_read.reference_id
        reverse_read.next_reference_start = forward_read.reference_start
        forward_read.next_reference_start = reverse_read.reference_start
        # And update the lengths that we calculated above.
        forward_read.template_length = forward_length
        reverse_read.template_length = reverse_length
        output_fh.write(forward_read)
        output_fh.write(reverse_read)
        processed_reads += 1
    log.info('Successfully merged %d read pairs in %f seconds.' %
             (processed_reads, time.time() - starttime))
    log.debug('Skipped %d pairs with mismatched read names, %d unmapped reads, '
              'and %d with a mapping quality below %d.' %
              (mismatched_reads, unmapped_reads, low_quality_reads, args.quality))
    for filename in [filtered_forward, filtered_reverse]:
        os.unlink(filename)
    return 0
def filter_reads(args):
    log.setLevel(args.log_level)
    retval = []
    save = pysam.set_verbosity(0)
    ffh = pysam.AlignmentFile(args.forward, 'r', threads=args.threads)
    rfh = pysam.AlignmentFile(args.reverse, 'r', threads=args.threads)
    pysam.set_verbosity(save)
    if ffh.header.references != rfh.header.references or ffh.header.lengths != rfh.header.lengths:
        log.error('The input files do not have the same sequence names or lengths.')
        return 1
    for handle in [ffh, rfh]:
        filename = os.path.split(os.path.abspath(handle.filename.decode('utf-8')))[-1]
        log.info('Loading reads from %s...' % filename)
        processed_reads = 0
        written_reads = 0
        previous_read = None
        all_reads = []
        unmapped_reads = []
        five_reads = []
        three_reads = []
        mid_reads = []
        counter = 0
        come_in_here = re.compile(r'^[0-9]*M')
        dear_boy = re.compile(r'.*M$')
        have_a_cigar = re.compile(r'^[0-9]*[HS].*M.*[HS]$')  # You're gonna go far, you're gonna fly
        output_tempfile = tempfile.NamedTemporaryFile(prefix='filtered_',
                                                      suffix='.bam',
                                                      delete=False,
                                                      dir=os.getcwd())
        retval.append(output_tempfile.name)
        output_tempfile.close()
        output_fh = pysam.AlignmentFile(output_tempfile.name, 'wb', header=handle.header)
        starttime = time.time()
        for read in handle:
            processed_reads += 1
            # If this is 1. not the first read, and 2. not the previous read again:
            if previous_read is not None and read.query_name != previous_read:
                # If we have more than one read in the current batch and one
                # read is on the 5´ side of a ligation junction...
                if counter in [1, 2] and len(five_reads) == 1:
                    # ...serve it forth.
                    output_fh.write(five_reads[0])
                    written_reads += 1
                else:
                    # Otherwise take the most recent read, set the unmapped
                    # flag, and send it to the output file.
                    new_read = all_reads[0]
                    new_read.is_unmapped = 1
                    output_fh.write(new_read)
                    written_reads += 1
                # Reset these variables to their original values.
                counter = 0
                all_reads = []
                unmapped_reads = []
                five_reads = []
                three_reads = []
                mid_reads = []
            counter += 1
            all_reads.append(read)
            previous_read = read.query_name
            # Determine whether the read is unmapped, or has mapped segments
            # spanning a junction.
            if read.is_unmapped:
                unmapped_reads.append(read)
            # If the read is aligned - and has mapped bases at the end, or it
            # is aligned + and has mapped bases at the beginning, it goes in
            # the 5´ bin and is retained.
            elif (read.is_reverse and dear_boy.match(read.cigarstring) is not None) or \
                    (not read.is_reverse and come_in_here.match(read.cigarstring) is not None):
                five_reads.append(read)
            # If the read is aligned + and has mapped bases at the end, or it
            # is aligned - and has mapped bases at the beginning, it goes in
            # the 3´ bin and is discarded.
            elif (read.is_reverse and come_in_here.match(read.cigarstring) is not None) or \
                    (not read.is_reverse and dear_boy.match(read.cigarstring) is not None):
                three_reads.append(read)
            # If it has mapped bases in the middle, put it in that list.
            elif have_a_cigar.match(read.cigarstring):
                mid_reads.append(read)
        # Flush the final batch. If we have a single read...
        if counter == 1:
            # ...and it is on the 5´ side of a ligation junction,
            if len(five_reads) == 1:
                # we send it to the output.
                output_fh.write(five_reads[0])
            else:
                # Otherwise we flag it unmapped and push it out.
                new_read = all_reads[0]
                new_read.is_unmapped = 1
                output_fh.write(new_read)
            written_reads += 1
        # Or if we have two reads and one of them is on the 5´ side of a junction...
        elif counter == 2 and len(five_reads) == 1:
            # ...we do the same.
            output_fh.write(five_reads[0])
            written_reads += 1
        else:
            # The same kind of thing.
            new_read = all_reads[0]
            new_read.is_unmapped = 1
            output_fh.write(new_read)
            written_reads += 1
        log.debug('Processed %d reads in %f seconds and output %d.' %
                  (processed_reads, time.time() - starttime, written_reads))
    # Send the filenames of the filtered alignments back to the caller.
    return retval
def getJunctionsFromBam(self, sample):
    """Extract, filter, and write splice junctions from one BAM file."""
    min_length = self.args.min_length
    max_length = self.args.max_length
    min_reads = self.args.min_reads
    fasta = self.args.genome
    samplename, filename, metadata, condition, bedfilename = sample

    old_verbosity = pysam.set_verbosity(0)
    try:
        genome = pysam.AlignmentFile(filename)
    except ValueError:
        print("Using: pysam.AlignmentFile(filename, check_sq=False) with", filename)
        genome = pysam.AlignmentFile(filename, check_sq=False)
    pysam.set_verbosity(old_verbosity)

    counts = {}
    leftDiversity = {}
    rightDiversity = {}
    overhangs = {}
    for read in genome.fetch(until_eof=True):
        if True:  # read.is_read2:
            if read.is_reverse:
                strand = "-"
            else:
                strand = "+"
        else:
            if read.is_reverse:
                strand = "+"
            else:
                strand = "-"
        blocks = read.get_blocks()
        try:
            read_start = blocks[0][0]
        except IndexError:
            continue
        read_end = blocks[-1][1]
        for i in range(len(blocks) - 1):
            junction = (read.reference_name, blocks[i][1], blocks[i + 1][0], strand)
            length = junction[2] - junction[1]
            if length >= min_length and length <= max_length:
                leftOH = blocks[i][1] - blocks[i][0]
                rightOH = blocks[i + 1][1] - blocks[i + 1][0]
                overhang = min(leftOH, rightOH)
                try:
                    counts[junction] += 1
                    overhangs[junction] = max(overhang, overhangs[junction])
                    try:
                        leftDiversity[junction][read_start] += 1
                        rightDiversity[junction][read_end] += 1
                    except KeyError:
                        leftDiversity[junction][read_start] = 1
                        rightDiversity[junction][read_end] = 1
                except KeyError:
                    counts[junction] = 1
                    overhangs[junction] = overhang
                    leftDiversity[junction] = {read_start: 1}
                    rightDiversity[junction] = {read_end: 1}

    filteredJunctions = []
    leftEntropy = {}
    rightEntropy = {}
    if genome:
        leftMotif = {}
        rightMotif = {}
        genome = pysam.FastaFile(fasta)
    for junction in sorted(counts):
        chromosome, left, right, strand = junction
        if genome:
            if (chromosome, left) not in leftMotif:
                try:
                    leftMotif[(chromosome, left)] = genome.fetch(chromosome, left, left + 2)
                except KeyError:
                    leftMotif[(chromosome, left)] = "NN"
            if (chromosome, right) not in rightMotif:
                try:
                    rightMotif[(chromosome, right)] = genome.fetch(chromosome, right - 2, right)
                except KeyError:
                    rightMotif[(chromosome, right)] = "NN"
        # Shannon entropy of the read start/end positions supporting the junction.
        leftEntropy[junction] = 0
        total = sum(leftDiversity[junction].values())
        for species, count in leftDiversity[junction].items():
            prop = count / total
            leftEntropy[junction] -= prop * np.log(prop)
        rightEntropy[junction] = 0
        total = sum(rightDiversity[junction].values())
        for species, count in rightDiversity[junction].items():
            prop = count / total
            rightEntropy[junction] -= prop * np.log(prop)
        filteredJunctions.append(junction)

    # if self.args.strands in ("inferCombine", "inferOnly"):
    opposite = {"+": "-", "-": "+"}
    plus_motifs = {"GT_AG", "GC_AG", "AT_AC"}
    minus_motifs = {"CT_AC", "CT_GC", "GT_AT"}
    firstFiltered = filteredJunctions
    filteredJunctions = []
    for junction in firstFiltered:
        chromosome, left, right, strand = junction
        motif = f"{leftMotif[(chromosome,left)]}_{rightMotif[(chromosome,right)]}"
        complement = (chromosome, left, right, opposite[strand])
        if complement not in counts:
            filteredJunctions.append(junction)
        elif (junction in self.annotated
              or (strand == "+" and motif in plus_motifs)
              or (strand == "-" and motif in minus_motifs)):
            filteredJunctions.append(junction)
            if self.args.strands == "inferCombine":
                counts[junction] += counts[complement]
        elif (complement in self.annotated
              or (strand == "-" and motif in plus_motifs)
              or (strand == "+" and motif in minus_motifs)):
            pass
        else:
            filteredJunctions.append(junction)

    with open(bedfilename, "w") as bedOut:
        for junction in filteredJunctions:
            chromosome, left, right, strand = junction
            name = (f"e:{leftEntropy[junction]:0.02f}:{rightEntropy[junction]:0.02f};"
                    f"o:{overhangs[junction]};"
                    f"m:{leftMotif[(chromosome,left)]}_{rightMotif[(chromosome,right)]};"
                    f"a:{self.annotated.get(junction,'?')}")
            bedOut.write(f"{chromosome}\t{left}\t{right}\t{name}\t{counts[junction]}\t{strand}\n")
    return filename, bedfilename, len(filteredJunctions)
import pysam
from collections import defaultdict

# suppress incorrect error warning - https://github.com/pysam-developers/pysam/issues/939
save = pysam.set_verbosity(0)
# load and iterate through the PathSeq BAM file
pathseq_bam = pysam.AlignmentFile(snakemake.input[0], mode="rb")
# set verbosity back to the original setting
pysam.set_verbosity(save)

output = []
UMI_dict = defaultdict(list)
# seg is an AlignedSegment object
for seg in pathseq_bam.fetch(until_eof=True):
    # not all records will have the CB and UB tags
    if seg.has_tag("CB") and seg.has_tag("UB"):
        if seg.get_tag(tag="CB") == snakemake.wildcards["cell"]:
            UMI_dict[seg.get_tag(tag="UB")].append(seg)

barcode_bam = pysam.AlignmentFile(snakemake.output[0], mode="wb", template=pathseq_bam)
for UMI in UMI_dict:
    # keep one read per UMI - the read with the highest mapping quality
    UMI_reads = UMI_dict[UMI]
    UMI_read = UMI_reads[0]
    for read in UMI_reads:
        if read.mapping_quality > UMI_read.mapping_quality:
            UMI_read = read
    barcode_bam.write(UMI_read)
def check_raw_alignments(df, args, pon):
    # Get soft-clip positions and directions.
    clips = []
    for chrA, posA, contA, chrB, posB, contB, idx, svlen, spanning in zip(
            df.chrA, df.posA, df.contigA, df.chrB, df.posB, df.contigB,
            df.index, df.svlen, df.spanning):
        if spanning:
            clips.append((chrA, posA, 3, idx, chrA == chrB, svlen))
            clips.append((chrB, posB, 3, idx, chrA == chrB, svlen))
        else:
            if contA:
                start_lower = contA[0].islower()
                end_lower = contA[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:  # start_lower and end_lower
                    clip_side = 3  # any side
                clips.append((chrA, posA, clip_side, idx, chrA == chrB, svlen))
            if contB:
                start_lower = contB[0].islower()
                end_lower = contB[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:
                    clip_side = 3
                clips.append((chrB, posB, clip_side, idx, chrA == chrB, svlen))

    clips = sorted(clips, key=lambda x: (x[0], x[1]))

    opts = {"bam": "rb", "cram": "rc", "sam": "r", "-": "rb", "stdin": "rb"}
    pad = 20
    found = set()
    for pth, _ in pon:
        # Open the alignment file with the mode matching its extension.
        kind = pth.split(".")[-1]
        bam_mode = opts[kind]
        save = pysam.set_verbosity(0)
        infile = pysam.AlignmentFile(
            pth, bam_mode, threads=1,
            reference_filename=None if kind != "cram" else args["ref"])
        pysam.set_verbosity(save)
        for chrom, pos, cs, index, intra, svlen in clips:
            if index in found:
                continue
            for a in infile.fetch(chrom, pos - pad if pos - pad > 0 else 0, pos + pad):
                if not a.cigartuples:
                    continue
                # A soft-clip (CIGAR op 4) at the read start or end near the
                # candidate position means the variant is present in the panel.
                if a.cigartuples[0][0] == 4 and cs != 1:
                    current_pos = a.pos
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break
                if a.cigartuples[-1][0] == 4 and cs != 0:
                    current_pos = a.reference_end
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break
    df = df.drop(found)
    return df