def __init__(self, index, *args, clean=False, bwa_opts='-x ont2d', **kwargs):
    """Set up a bwa mem alignment server backed by the python binding.

    :param index: bwa index base path, or list thereof.
    :param clean: clean-up shared memory on exit (accepted, but not read
        anywhere in this constructor).
    :param bwa_opts: command line options for bwa mem.
    :raises ImportError: if ``BwaAligner`` is ``None``.
    """
    super().__init__(*args, **kwargs)
    self.logger = logging.getLogger('BwaServe')
    self.index, self.bwa_opts = index, bwa_opts
    # Define the attribute first so it exists even if we bail out below.
    self.aligner = None
    if BwaAligner is None:
        template = '{} requires BwaAligner which could not be imported.'
        raise ImportError(template.format(self.__class__.__name__))
    self.aligner = BwaAligner(self.index, options=self.bwa_opts)
    self.logger.info('bwa service started.')
def __init__(self, bam=None, index=None):
    """Open the BAM file and load the bwa index used for remapping.

    :param bam: path of the BAM file to read. When omitted, the original
        hard-coded location is used.
    :param index: path of the bwa index. When omitted, the original
        hard-coded location is used.
    """
    if bam is None:
        bam = '/stor/work/Lambowitz/cdw2854/cfNA/tgirt_map/merged_bam/dedup/unfragmented.chrM_filter.bam'
    if index is None:
        index = '/stor/work/Lambowitz/ref/Ecoli/BL21_DE3.fa'
    self.bam = pysam.Samfile(bam)
    self.aligner = BwaAligner(index, options='-k 12')
    # CIGAR patterns capturing the lengths of M and S operations.
    self.matched = re.compile('([0-9]+)M')
    self.clipped = re.compile('([0-9]+)S')
    self.alignments = None
def align(self, sequence):
    """Run bwa mem on a single base sequence.

    The aligner is created on demand when it is currently ``None``.

    :param sequence: sequence to align.
    :returns: the output of bwa mem call.
    """
    if self.aligner is None:
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
    n_bases = len(sequence)
    self.logger.debug("Aligning sequence of length {}.".format(n_bases))
    hits = self.aligner.align_seq(sequence)
    return hits
def __init__(self, reference_path, config, log_filename=None):
    """Build the aligner and cache the natural log of each config value.

    :param reference_path: passed to ``BwaAligner``.
    :param config: mapping that must contain every key listed below.
    :param log_filename: when truthy, this file is opened for writing and
        kept on ``self.log_file``.
    """
    self.aligner = BwaAligner(reference_path)
    # Every entry is stored on the instance as ``log_<config key>``.
    config_keys = (
        'SNP_insert',
        'non_SNP_insert',
        'SNP_delete',
        'non_SNP_delete',
        'SNP_false_good',
        'non_SNP_good',
        'SNP_true_substitute',
        'SNP_false_substitute',
        'non_SNP_substitute',
    )
    for key in config_keys:
        setattr(self, 'log_' + key, np.log(config[key]))
    self.log_file = None
    if log_filename:
        self.log_file = open(log_filename, 'w')
class ecoli_mapper():
    """Re-align reads from a BAM region and report the accepted fraction.

    Each read fetched from the BAM file is aligned with ``BwaAligner``; a
    read counts as aligned when at least one of its alignments passes
    :meth:`filter_bad_cigar`.
    """

    DEFAULT_BAM = '/stor/work/Lambowitz/cdw2854/cfNA/tgirt_map/merged_bam/dedup/unfragmented.chrM_filter.bam'
    DEFAULT_INDEX = '/stor/work/Lambowitz/ref/Ecoli/BL21_DE3.fa'

    def __init__(self, bam=None, index=None):
        """Open the BAM file and load the bwa index.

        :param bam: BAM path; defaults to ``DEFAULT_BAM``.
        :param index: bwa index path; defaults to ``DEFAULT_INDEX``.
        """
        if bam is None:
            bam = self.DEFAULT_BAM
        if index is None:
            index = self.DEFAULT_INDEX
        self.bam = pysam.Samfile(bam)
        self.aligner = BwaAligner(index, options='-k 12')
        # CIGAR patterns capturing the lengths of M and S operations.
        self.matched = re.compile('([0-9]+)M')
        self.clipped = re.compile('([0-9]+)S')
        self.alignments = None

    def ecoli_map(self, chrom, start, end):
        """Return the fraction of reads in the region with an accepted hit.

        The raw alignments of every read are kept on ``self.alignments``.

        :param chrom: contig name passed to ``bam.fetch``.
        :param start: region start passed to ``bam.fetch``.
        :param end: region end passed to ``bam.fetch``.
        :returns: accepted reads divided by fetched reads, or ``0.0`` when
            the region contains no reads.
        """
        aligned = 0.0
        total = 0
        self.alignments = []
        for aln in self.bam.fetch(chrom, start, end):
            total += 1
            alns = self.aligner.align_seq(aln.query_sequence)
            self.alignments.append(alns)
            if any(self.filter_bad_cigar(hit) for hit in alns):
                aligned += 1
        if total == 0:
            # Nothing fetched: avoid referencing a loop counter that was
            # never bound.
            return 0.0
        return aligned / total

    def filter_bad_cigar(self, aln):
        """Return True when *aln* is acceptable (the predicate keeps hits).

        An alignment is accepted when clipped/matched bases is below 0.2
        and ``aln.NM`` is below 3. Alignments without any ``M`` operation
        are rejected.
        """
        clipped_base = sum(map(int, self.clipped.findall(aln.cigar)))
        mapped_base = sum(map(int, self.matched.findall(aln.cigar)))
        if mapped_base == 0:
            # No matched bases: reject instead of dividing by zero.
            return False
        return (float(clipped_base) / mapped_base) < 0.2 and aln.NM < 3
def __init__(self,
             in_vcf,
             reference,
             out_vcf="/dev/stdout",
             min_length=50,
             threshold=0.8,
             min_distance=10,
             anno_hits=0):
    """
    Record the settings and build the aligner
    """
    # Input / output locations
    self.in_vcf = in_vcf
    self.reference = reference
    self.out_vcf = out_vcf
    # Filtering and annotation settings
    self.min_length = min_length
    self.threshold = threshold
    self.min_distance = min_distance
    self.anno_hits = anno_hits
    # No header has been prepared yet
    self.n_header = None
    self.aligner = BwaAligner(self.reference, options="-a")
class BwapyServe(rpc.AttrHandler):
    """RPC handler exposing bwa mem alignment through the python binding."""

    def __init__(self, index, *args, clean=False, bwa_opts='-x ont2d', **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param clean: clean-up shared memory on exit (not read by any
            method of this class).
        :param bwa_opts: command line options for bwa mem.
        :raises ImportError: if ``BwaAligner`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index
        self.bwa_opts = bwa_opts
        self.aligner = None
        if BwaAligner is None:
            raise ImportError(
                '{} requires BwaAligner which could not be imported.'.format(
                    self.__class__.__name__))
        # Report the options through the logger instead of printing to
        # stdout, so the output follows the logging configuration.
        self.logger.debug('bwa options: %s', self.bwa_opts)
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')

    def _clean_index(self):
        """Drop the reference to the aligner object."""
        self.logger.info('Cleaning alignment proxy.')
        self.aligner = None

    def __del__(self):
        # __del__ also runs for partially constructed objects (e.g. when
        # the base-class __init__ raised before the logger was assigned).
        if hasattr(self, 'logger'):
            self._clean_index()

    @rpc.method
    def clean_index(self):
        """Destroy the aligner object, which will cleanup the index in memory."""
        self._clean_index()

    # NOTE(review): asyncio.coroutine was removed in Python 3.11; confirm
    # that rpc.method accepts ``async def`` handlers before migrating.
    @rpc.method
    @asyncio.coroutine
    def align(self, sequence):
        """Align a base sequence.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.debug("Aligning sequence of length {}.".format(
            len(sequence)))
        results = self.aligner.align_seq(sequence)
        self.logger.info("Aligned sequence of {} bases with {} hits.".format(
            len(sequence), len(results)))
        return results
def __init__(self, bwa_executable, reference, reference_filename, references_dict=None):
    """Store the references, index the file if needed, and try bwapy.

    :param bwa_executable: bwa program used for ``bwa index``.
    :param reference: stored on ``self.reference``; may be ``None`` only
        when *references_dict* is given.
    :param reference_filename: file handed to ``bwa index`` and bwapy.
    :param references_dict: stored on ``self.references_dict``.
    """
    self.bwa_executable = bwa_executable
    assert not (reference is None and references_dict is None), "one of reference or reference_dict must be not None"
    self.reference = reference
    self.references_dict = references_dict
    self.reference_filename = reference_filename

    # A heuristic to check if bwa index has been run already
    bwt_path = self.reference_filename + '.bwt'
    if not os.path.isfile(bwt_path):
        index_command = [self.bwa_executable, 'index', self.reference_filename]
        subprocess.run(index_command, stderr=subprocess.PIPE, check=True)

    # Stays None when bwapy is unavailable.
    self.bwapy_aligner = None
    try:
        from bwapy import BwaAligner
        options = '-x ont2d'
        self.bwapy_aligner = BwaAligner(reference_filename, options=options)
    except ImportError:
        sys.stderr.write("Could't import bwapy, will use bwa executable to align reads\n")
def __init__(self, index, *args, map_opts=None, **kwargs):
    """bwa mem alignment server implementation using python binding.

    :param index: bwa index base path, or list thereof.
    :param map_opts: command line options for bwa mem as dictionary;
        defaults to ``{'x': 'ont2d'}`` when omitted.
    """
    super().__init__(*args, **kwargs)
    self.logger = logging.getLogger('BwaServe')
    self.index = index
    # A fresh dict per call avoids sharing one mutable default object
    # between all instances.
    if map_opts is None:
        map_opts = {'x': 'ont2d'}
    # expand map_opts to a string:
    self.bwa_opts = ' '.join(
        '-{} {}'.format(k, v) for k, v in map_opts.items())
    # Assigned before construction so the attribute exists even if
    # BwaAligner raises.
    self.aligner = None
    self.aligner = BwaAligner(self.index, options=self.bwa_opts)
    self.logger.info('bwa service started.')
class BwapyServe(rpc.AttrHandler):
    """RPC handler exposing bwa mem alignment through the python binding."""

    def __init__(self, index, *args, clean=False, bwa_opts='-x ont2d', **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param clean: clean-up shared memory on exit (not read by any
            method of this class).
        :param bwa_opts: command line options for bwa mem.
        :raises ImportError: if ``BwaAligner`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index
        self.bwa_opts = bwa_opts
        self.aligner = None
        if BwaAligner is None:
            raise ImportError(
                '{} requires BwaAligner which could not be imported.'.format(
                    self.__class__.__name__
                ))
        # Options go to the logger rather than stdout so they respect the
        # logging configuration of the service.
        self.logger.debug('bwa options: %s', self.bwa_opts)
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')

    def _clean_index(self):
        """Drop the reference to the aligner object."""
        self.logger.info('Cleaning alignment proxy.')
        self.aligner = None

    def __del__(self):
        # Guard against partially constructed instances: the logger is
        # missing when the base-class __init__ raised.
        if hasattr(self, 'logger'):
            self._clean_index()

    @rpc.method
    def clean_index(self):
        """Destroy the aligner object, which will cleanup the index in memory."""
        self._clean_index()

    # NOTE(review): asyncio.coroutine was removed in Python 3.11; confirm
    # that rpc.method accepts ``async def`` handlers before migrating.
    @rpc.method
    @asyncio.coroutine
    def align(self, sequence):
        """Align a base sequence.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.debug("Aligning sequence of length {}.".format(len(sequence)))
        results = self.aligner.align_seq(sequence)
        self.logger.info("Aligned sequence of {} bases with {} hits.".format(
            len(sequence), len(results)
        ))
        return results
def __init__(self, bwa_executable, reference, reference_filename):
    """Store the reference, index the file if needed, and try bwapy.

    :param bwa_executable: bwa program used for ``bwa index``.
    :param reference: stored on ``self.reference``.
    :param reference_filename: file handed to ``bwa index`` and bwapy.
    """
    self.bwa_executable = bwa_executable
    self.reference = reference
    self.reference_filename = reference_filename

    # A heuristic to check if bwa index has been run already
    bwt_path = self.reference_filename + '.bwt'
    if not os.path.isfile(bwt_path):
        index_command = [self.bwa_executable, 'index', self.reference_filename]
        subprocess.run(index_command, stderr=subprocess.PIPE, check=True)

    # Stays None when bwapy is unavailable.
    self.bwapy_aligner = None
    try:
        from bwapy import BwaAligner
        self.bwapy_aligner = BwaAligner(reference_filename)
    except ImportError:
        warning = "Could't import bwapy, will use bwa executable to align reads\n"
        sys.stderr.write(warning)
class BwapyServe(rpc.AttrHandler):
    """RPC handler exposing bwa mem alignment through the python binding."""

    def __init__(self, index, *args, map_opts=None, **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param map_opts: command line options for bwa mem as dictionary;
            defaults to ``{'x': 'ont2d'}`` when omitted.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index
        # A fresh dict per call avoids sharing one mutable default object
        # between all instances.
        if map_opts is None:
            map_opts = {'x': 'ont2d'}
        # expand map_opts to a string:
        self.bwa_opts = ' '.join(
            '-{} {}'.format(k, v) for k, v in map_opts.items())
        # Assigned before construction so the attribute exists even if
        # BwaAligner raises.
        self.aligner = None
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')

    def _clean_index(self):
        """Drop the reference to the aligner object."""
        self.logger.info('Cleaning alignment proxy.')
        self.aligner = None

    def __del__(self):
        # Guard against partially constructed instances: the logger is
        # missing when the base-class __init__ raised.
        if hasattr(self, 'logger'):
            self._clean_index()

    @rpc.method
    def clean_index(self):
        """Destroy the aligner object, which will cleanup the index in memory."""
        self._clean_index()

    # NOTE(review): asyncio.coroutine was removed in Python 3.11; confirm
    # that rpc.method accepts ``async def`` handlers before migrating.
    @rpc.method
    @asyncio.coroutine
    def align(self, sequence):
        """Align a base sequence.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.debug("Aligning sequence of length {}.".format(
            len(sequence)))
        results = self.aligner.align_seq(sequence)
        self.logger.info("Aligned sequence of {} bases with {} hits.".format(
            len(sequence), len(results)))
        return results
def align(self, sequence):
    """Run bwa mem on a single base sequence and log the hit count.

    The aligner is created on demand when it is currently ``None``.

    :param sequence: sequence to align.
    :returns: the output of bwa mem call.
    """
    if self.aligner is None:
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
    debug_message = "Aligning sequence of length {}.".format(len(sequence))
    self.logger.debug(debug_message)
    results = self.aligner.align_seq(sequence)
    info_message = "Aligned sequence of {} bases with {} hits.".format(
        len(sequence), len(results))
    self.logger.info(info_message)
    return results
def grm_main(cmdargs):
    """
    Entry point of the grm command.

    Builds a graph-genome from the vcf and reference,
    creates the sets of kmers at edges in the graph,
    maps those kmers globally to the reference,
    reports mapping metrics.

    The settings are published on ``grm_shared``, ``process_entries`` is
    mapped over the reference ranges in a worker pool, and the combined
    frame is written with ``joblib.dump``.

    Todo:
        - document the bwa package and how to install
        - better column names along with documentation
    """
    if not HASBWALIB:
        logging.error("bwapy isn't available on this machine")
        sys.exit(1)

    args = parse_args(cmdargs)
    # Restrict to the given regions when provided, else the whole reference.
    if args.regions:
        m_ranges = truvari.bed_ranges(args.regions)
    else:
        m_ranges = truvari.ref_ranges(args.reference)

    grm_shared.aligner = BwaAligner(args.reference, '-a')

    prefixes = ["rup_", "rdn_", "aup_", "adn_"]
    metrics = [
        "nhits", "avg_q", "avg_ed", "avg_mat", "avg_mis", "dir_hits",
        "com_hits", "max_q", "max_ed", "max_mat", "max_mis", "max_strand",
        "min_q", "min_ed", "min_mat", "min_mis", "min_strand"
    ]
    grm_shared.header = ["key"] + [
        prefix + metric for prefix in prefixes for metric in metrics
    ]
    grm_shared.ref_filename = args.reference
    grm_shared.kmersize = args.kmersize
    grm_shared.input = args.input
    grm_shared.min_size = args.min_size

    with multiprocessing.Pool(args.threads, maxtasksperchild=1) as pool:
        logging.info("Processing")
        chunks = pool.imap(process_entries, m_ranges)
        pool.close()
        data = pd.concat(chunks, ignore_index=True)
        logging.info("Saving; df shape %s", data.shape)
        joblib.dump(data, args.output)
        logging.info("Finished grm")
        pool.join()
def recover_ccs_reads(short_reads, ref_fasta, ss_index, gtf_index, intron_index, is_canonical, out_dir, prefix, threads):
    """Second scanning of short reads.

    The reads are processed by ``recover_ccs_chunk`` in a worker pool, in
    chunks of 250. Every returned record is appended to
    ``<out_dir>/<prefix>.cand_circ.fa`` and the per-chunk counters are
    summed.

    :returns: ``defaultdict(int)`` with the summed counters.
    """
    from bwapy import BwaAligner

    genome = Fasta(ref_fasta)
    bwa_aligner = Aligner(BwaAligner(ref_fasta, options='-x ont2d -T 19'))

    chunk_size = 250
    init_args = (bwa_aligner, genome.contig_len, genome, gtf_index, intron_index, ss_index)
    pool = Pool(threads, env.initializer, init_args)
    jobs = []
    for reads in grouper(short_reads, chunk_size):
        # Drop the None entries found in the groups.
        chunk = [read for read in reads if read is not None]
        jobs.append(pool.apply_async(recover_ccs_chunk, (chunk, is_canonical)))
    pool.close()

    prog = ProgressBar()
    prog.update(0)
    reads_count = defaultdict(int)
    record_format = '>{}\t{}\t{}\t{}\t{}\t{}\t{}\n{}\n'
    out_path = '{}/{}.cand_circ.fa'.format(out_dir, prefix)
    with open(out_path, 'a') as out:
        for finished_cnt, job in enumerate(jobs, 1):
            tmp_cnt, ret = job.get()
            for key, value in tmp_cnt.items():
                reads_count[key] += value
            for read_id, circ_id, strand, cir_exon_tag, ss_id, clip_info, segments, circ_seq in ret:
                out.write(record_format.format(
                    read_id, circ_id, strand, cir_exon_tag, ss_id,
                    clip_info, segments, circ_seq))
            prog.update(100 * finished_cnt / len(jobs))
    pool.join()
    prog.update(100)
    return reads_count
class exogenous_mapper():
    """Align sequences with bwa and keep only well-matched alignments."""

    def __init__(self, index, nm=0.1):
        """Load the bwa index and set the acceptance threshold.

        :param index: bwa index path passed to ``BwaAligner``.
        :param nm: maximum allowed (clipped bases + NM) / matched bases.
        """
        # CIGAR patterns capturing the lengths of M and S operations.
        self.matched = re.compile('([0-9]+)M')
        self.clipped = re.compile('([0-9]+)S')
        self.aligner = BwaAligner(index, options='-k 12 -B 3')
        self.NM = nm
        self.aligned = None

    def map(self, seq):
        """Align *seq*; return 1 if any alignment is accepted, else 0.

        The accepted alignments are kept on ``self.aligned``.
        """
        # Reset first so a failing align_seq does not leave stale hits.
        self.aligned = None
        self.aligned = self.aligner.align_seq(seq)
        self.aligned = list(filter(self.filter_bad_cigar, self.aligned))
        return 1 if self.aligned else 0

    def filter_bad_cigar(self, aln):
        """Return True when *aln* is acceptable (the predicate keeps hits).

        An alignment is accepted when (clipped bases + ``aln.NM``) divided
        by matched bases does not exceed ``self.NM``. Alignments without
        any ``M`` operation are rejected.
        """
        cigar = aln.cigar
        clipped_base = sum(map(int, self.clipped.findall(cigar)))
        mapped_base = sum(map(int, self.matched.findall(cigar)))
        if mapped_base == 0:
            # No matched bases: reject instead of dividing by zero.
            return False
        return ((float(clipped_base) + aln.NM) / mapped_base) <= self.NM
class chrM_mapper():
    """Count strand-matched read1 alignments in a region and how many of
    their sequences the bwa aligner can place."""

    def __init__(self, bam, index):
        """Open *bam* and load the bwa *index* (options ``-k 12``)."""
        self.bam = pysam.Samfile(bam)
        self.aligner = BwaAligner(index, options='-k 12')

    def __align__(self, seq):
        """Return 1 when *seq* yields at least one alignment, else 0."""
        return int(bool(self.aligner.align_seq(seq)))

    def run_peak(self, chrom, start, end, peak_strand):
        """Scan the region and tally reads on *peak_strand*.

        Only read1 records whose strand equals *peak_strand* are counted.

        :returns: ``(counted reads, counted reads that aligned)``.
        """
        total = 0
        aligned_total = 0
        for read in self.bam.fetch(chrom, start, end):
            if not read.is_read1:
                continue
            strand = '-' if read.is_reverse else '+'
            if strand != peak_strand:
                continue
            total += 1
            aligned_total += self.__align__(read.get_forward_sequence())
        return total, aligned_total
def __init__(self, index, *args, clean=False, bwa_opts='-x ont2d', **kwargs):
    """bwa mem alignment server implementation using python binding.

    :param index: bwa index base path, or list thereof.
    :param clean: clean-up shared memory on exit (accepted, but not read
        anywhere in this constructor).
    :param bwa_opts: command line options for bwa mem.
    :raises ImportError: if ``BwaAligner`` is ``None``.
    """
    super().__init__(*args, **kwargs)
    self.logger = logging.getLogger('BwaServe')
    self.index = index
    self.bwa_opts = bwa_opts
    # Define the attribute first so it exists even if we bail out below.
    self.aligner = None
    if BwaAligner is None:
        raise ImportError(
            '{} requires BwaAligner which could not be imported.'.format(
                self.__class__.__name__
            ))
    # Report the options through the logger instead of printing to stdout,
    # so the output follows the logging configuration.
    self.logger.debug('bwa options: %s', self.bwa_opts)
    self.aligner = BwaAligner(self.index, options=self.bwa_opts)
    self.logger.info('bwa service started.')
# server.py
# Disabled Flask server, kept for reference:
# from flask import Flask, render_template
# app = Flask(__name__, static_folder="../static/dist", template_folder="../static")
#
# @app.route("/")
# def index():
#     return render_template("index.html")
#
# @app.route("/hello")
# def hello():
#     return "Hello World!"
#
# if __name__ == "__main__":
#     app.run()

import pysam
from bwapy import BwaAligner

# Minimal bwapy example: load an index, align one sequence, print the hits.
index = 'path/to/index'  # the path given to bwa index
seq = 'ACGATCGCGATCGA'
aligner = BwaAligner(index)
alignments = aligner.align_seq(seq)
# The closing parenthesis of print() was missing, which is a syntax error.
print('Found {} alignments.'.format(len(alignments)))
for aln in alignments:
    print(' ', aln)
class Remap():
    """
    Class for remapping annotation
    """

    def __init__(self, in_vcf, reference, out_vcf="/dev/stdout", min_length=50,
                 threshold=0.8, min_distance=10, anno_hits=0):
        """
        The setup

        :param in_vcf: input VCF opened with ``pysam.VariantFile``
        :param reference: reference handed to ``BwaAligner``
        :param out_vcf: output VCF path
        :param min_length: minimum ``truvari.entry_size`` to annotate
        :param threshold: minimum fraction of the query that must not be
            soft-clipped for a hit to count
        :param min_distance: deletion hits on the same chromosome closer
            than this to the entry are ignored
        :param anno_hits: number of hits stored in ``REMAPHits``
            (0 disables the field)
        """
        self.in_vcf = in_vcf
        self.reference = reference
        self.out_vcf = out_vcf
        self.min_length = min_length
        self.threshold = threshold
        self.min_distance = min_distance
        self.anno_hits = anno_hits
        self.n_header = None
        self.aligner = BwaAligner(self.reference, options="-a")

    def edit_header(self, header=None):
        """
        Edits and holds on to the header

        When *header* is None, a copy of the input VCF's header is used.
        """
        if header is None:
            with pysam.VariantFile(self.in_vcf, 'r') as fh:
                header = fh.header.copy()
        header.add_line(('##INFO=<ID=REMAP,Number=1,Type=String,'
                         'Description="Annotation of alt-seq remapping">'))
        if self.anno_hits:
            header.add_line(('##INFO=<ID=REMAPHits,Number=.,Type=String,'
                             'Description="List of chr:start-end of hits">'))
        self.n_header = header

    def get_end(self, pos, cigar):  # pylint: disable=no-self-use
        """
        Expand a cigar to get the end position and how many bases are clipped

        M and D operations advance the position; S operations are summed
        as soft-clipped bases. Returns ``(end position, soft bases)``.
        """
        soft_bases = 0
        # presumably cigmatch yields tokens such as "10M" - defined elsewhere
        for i in cigmatch.findall(cigar):
            if i[-1] == "S":
                soft_bases += int(i[:-1])
            elif i[-1] in ["M", "D"]:
                pos += int(i[:-1])
        return pos, soft_bases

    def remap_entry(self, entry, threshold=.8):
        """
        Map a sequence and return the information from it

        Returns ``(label, hits)`` where label is one of "novel", "tandem",
        "partial" or "interspersed" and hits is a list of
        ``(pct_query, "chrom:start-end.pct")`` tuples sorted ascending.
        """
        is_del = truvari.entry_variant_type(entry) == "DEL"
        if is_del:
            seq = str(entry.ref)
        else:
            seq = entry.alts[0]
        all_hits = []
        num_hits = 0
        partial_hits = 0
        close_dist = None
        for aln in self.aligner.align_seq(seq):
            # Take out the 'same spot' alignment for deletions
            dist = abs(aln.pos - entry.pos)
            if is_del and aln.rname == entry.chrom and dist < self.min_distance:
                continue
            # Filter hits below threshold
            end, soft = self.get_end(aln.pos, aln.cigar)
            seq_len = len(seq)
            pct_query = (seq_len - soft) / seq_len
            if pct_query < threshold:
                partial_hits += 1
                continue
            hit = f"{aln.rname}:{aln.pos}-{end}.{int(pct_query*100)}"
            bisect.insort(all_hits, (pct_query, hit))
            num_hits += 1
            if aln.rname != entry.chrom:
                continue
            if close_dist is None or dist < close_dist:
                close_dist = dist
        if num_hits == 0 and partial_hits == 0:
            return "novel", all_hits
        # NOTE(review): a closest distance of exactly 0 is falsy and is
        # therefore not reported as tandem - confirm this is intended.
        if close_dist and close_dist <= len(seq):
            return "tandem", all_hits
        if num_hits == 0 and partial_hits != 0:
            return "partial", all_hits
        return "interspersed", all_hits

    def annotate_entry(self, entry):
        """
        Annotates entries in the vcf and writes to new vcf

        Entries smaller than ``min_length`` are returned unchanged.
        """
        if truvari.entry_size(entry) >= self.min_length:
            entry.translate(self.n_header)
            # Pass the configured threshold; without it the constructor's
            # setting was ignored in favour of remap_entry's default.
            remap, hits = self.remap_entry(entry, self.threshold)
            entry.info["REMAP"] = remap
            if self.anno_hits and hits:
                entry.info["REMAPHits"] = [
                    _[1] for _ in hits[-self.anno_hits:]
                ]
        return entry

    def annotate_vcf(self):
        """
        Annotates the vcf and writes to new vcf
        """
        with pysam.VariantFile(self.in_vcf) as fh:
            self.edit_header(fh.header.copy())
            # Context manager so the output file is always closed.
            with pysam.VariantFile(self.out_vcf, 'w',
                                   header=self.n_header) as out:
                for entry in fh:
                    entry = self.annotate_entry(entry)
                    out.write(entry)
class basecall_model(model):
    """Model that updates per-base log probabilities from the alignment of
    a read's basecall to the reference."""

    def __init__(self, reference_path, config, log_filename=None):
        """Build the aligner and cache the natural log of the config values.

        :param reference_path: passed to ``BwaAligner``.
        :param config: mapping providing every key read below.
        :param log_filename: when truthy, opened for writing and kept on
            ``self.log_file``.
        """
        self.aligner = BwaAligner(reference_path)
        self.log_SNP_insert = np.log(config['SNP_insert'])
        self.log_non_SNP_insert = np.log(config['non_SNP_insert'])
        self.log_SNP_delete = np.log(config['SNP_delete'])
        self.log_non_SNP_delete = np.log(config['non_SNP_delete'])
        self.log_SNP_false_good = np.log(config['SNP_false_good'])
        self.log_non_SNP_good = np.log(config['non_SNP_good'])
        self.log_SNP_true_substitute = np.log(config['SNP_true_substitute'])
        self.log_SNP_false_substitute = np.log(config['SNP_false_substitute'])
        self.log_non_SNP_substitute = np.log(config['non_SNP_substitute'])
        self.log_file = None
        if log_filename:
            self.log_file = open(log_filename, 'w')

    def update_probabilities_internal(self, reference, read, interesting_bases):
        """Add this read's evidence to ``log_probability`` of each base.

        The read's basecall is aligned and only the first alignment is
        used. Walking its CIGAR assigns one status to every reference
        position; each entry of *interesting_bases* then has one term per
        ``alphabet`` symbol added to its ``log_probability``.

        Returns None in every case; nothing is updated when there is no
        alignment or when a '-' strand read was not aligned with '-'
        orientation.

        :raises ValueError: on a CIGAR operation other than S, M, D or I.
        """
        alignments = self.aligner.align_seq("".join(read.basecall))
        if len(alignments) == 0:
            return
        alignment = alignments[0]
        # op[0] is used as a length and op[1] as the operation letter below.
        cigar = parse_cigar(alignment.cigar)
        # presumably the number of reference bases covered - TODO confirm
        aligned_reference_length = aligned_length(cigar)
        reference_aligned = []
        corresponding_read_index = [None for i in reference]
        read_aligned = []
        # For '-' reads the start is measured from the other end of the
        # reference.
        reference_index = alignment.pos if read.strand == '+' else len(
            reference) - alignment.pos - aligned_reference_length
        if read.strand == '-':
            if alignment.orient != '-':
                print("read and aligner don't agree on its orientation")
                return
            cigar = list(reversed(cigar))
        read_index = 0
        UNALIGNED, GOOD, SUBSTITUTION, DELETION, NEAR_INSERTION = 'U', 'G', 'S', 'D', 'I'
        # Positions before the alignment start are unaligned.
        reference_status = [UNALIGNED for i in reference[:reference_index]]
        insertions = []
        for op in cigar:
            if op[1] == 'S':
                # Soft clip: consumes read bases only.
                read_index += op[0]
            elif op[1] == 'M':
                reference_aligned += reference[
                    reference_index:reference_index + op[0]]
                for i in range(op[0]):
                    corresponding_read_index[reference_index + i] = read_index + i
                    if reference[reference_index + i] == read.basecall[read_index + i]:
                        reference_status.append(GOOD)
                    else:
                        reference_status.append(SUBSTITUTION)
                reference_index += op[0]
                read_aligned += read.basecall[read_index:read_index + op[0]]
                read_index += op[0]
            elif op[1] == 'D':
                # Deletion: consumes reference bases only.
                reference_aligned += reference[
                    reference_index:reference_index + op[0]]
                reference_status += [DELETION for i in range(op[0])]
                reference_index += op[0]
                read_aligned += ['-' for i in range(op[0])]
            elif op[1] == 'I':
                # Insertion: consumes read bases; remember where it happened.
                read_aligned += read.basecall[read_index:read_index + op[0]]
                read_index += op[0]
                reference_aligned += ['-' for i in range(op[0])]
                insertions.append(reference_index)
            else:
                raise ValueError("Unknown operation: {}".format(op[1]))
        # Positions after the alignment end are unaligned.
        reference_status += [UNALIGNED for i in reference[reference_index:]]
        # GOOD positions on either side of an insertion are re-labelled.
        for idx in insertions:
            if idx > 0 and reference_status[idx - 1] == GOOD:
                reference_status[idx - 1] = NEAR_INSERTION
            if idx < len(reference) and reference_status[idx] == GOOD:
                reference_status[idx] = NEAR_INSERTION
        for b in interesting_bases:
            if reference_status[b.id] == UNALIGNED:
                pass
            elif reference_status[b.id] == SUBSTITUTION:
                # Three cases: reference symbol, the symbol the read shows,
                # and every other symbol.
                for i, c in enumerate(alphabet):
                    if c == reference[b.id]:
                        b.log_probability[i] += self.log_non_SNP_substitute
                    elif c == read.basecall[corresponding_read_index[b.id]]:
                        b.log_probability[i] += self.log_SNP_true_substitute
                    else:
                        b.log_probability[i] += self.log_SNP_false_substitute
            else:
                if_SNP, if_non_SNP = None, None
                if reference_status[b.id] == GOOD:
                    if_SNP = self.log_SNP_false_good
                    if_non_SNP = self.log_non_SNP_good
                elif reference_status[b.id] == NEAR_INSERTION:
                    if_SNP = self.log_SNP_insert
                    if_non_SNP = self.log_non_SNP_insert
                elif reference_status[b.id] == DELETION:
                    if_SNP = self.log_SNP_delete
                    if_non_SNP = self.log_non_SNP_delete
                for i, c in enumerate(alphabet):
                    if c == reference[b.id]:
                        b.log_probability[i] += if_non_SNP
                    else:
                        b.log_probability[i] += if_SNP
            # Optional log line; written for '+' strand reads only.
            if self.log_file and reference_status[
                    b.id] != UNALIGNED and read.strand == '+':
                self.log_file.write('{} {}'.format(b.id, reference_status[b.id]))
                if reference_status[b.id] == GOOD or reference_status[
                        b.id] == SUBSTITUTION:
                    self.log_file.write(' {}\n'.format(
                        read.basecall[corresponding_read_index[b.id]]))
                else:
                    self.log_file.write('\n')
class ApproximateAligner:
    """Align reads to a reference with bwa and map the result to signal.

    bwapy is used when it can be imported; otherwise the bwa executable is
    run on temporary files.
    """

    def __init__(self, bwa_executable, reference, reference_filename):
        """Store the reference, index the file if needed, and try bwapy.

        :param bwa_executable: bwa program used for indexing and fallback
            alignment.
        :param reference: reference sequence, indexed by position.
        :param reference_filename: file handed to bwa and bwapy.
        """
        self.bwa_executable = bwa_executable
        self.reference = reference
        self.reference_filename = reference_filename
        # A heuristic to check if bwa index has been run already
        if not os.path.isfile(self.reference_filename + '.bwt'):
            subprocess.run(
                [self.bwa_executable, 'index', self.reference_filename],
                stderr=subprocess.PIPE,
                check=True)
        self.bwapy_aligner = None
        try:
            from bwapy import BwaAligner
            self.bwapy_aligner = BwaAligner(reference_filename)
        except ImportError:
            sys.stderr.write(
                "Could't import bwapy, will use bwa executable to align reads\n"
            )

    @staticmethod
    def _parse_cigar(cigar):
        """Split a CIGAR string into ``(count, operation)`` tuples."""
        num = 0
        result = []
        for character in cigar:
            if '0' <= character <= '9':
                num = num * 10 + int(character)
            else:
                result.append((num, character))
                num = 0
        return result

    @staticmethod
    def convert_mapping(base_mapping, read):
        """Translate read indices to signal indices.

        Pairs whose read index is missing from
        ``read.sequence_to_signal_mapping`` are dropped.

        :returns: integer array of ``(signal index, reference index)`` rows.
        """
        signal_of = read.sequence_to_signal_mapping
        result = [(signal_of[index_in_read], index_in_reference)
                  for index_in_read, index_in_reference in base_mapping
                  if index_in_read in signal_of]
        # numpy.int was only an alias of the builtin int and has been
        # removed from NumPy; the builtin gives the same dtype.
        return numpy.array(result, dtype=int)

    @staticmethod
    def _remove_if_exists(path):
        """Delete *path*, ignoring a file that is already gone."""
        try:
            os.remove(path)
        except FileNotFoundError:
            pass

    def _align_with_executable(self, read):
        """Align *read* by running ``bwa mem`` on temporary files.

        :returns: ``(cigar, is_reverse_complement, mapped_position)``, or
            None when the first SAM record is unmapped.
        """
        read_fastq_filename = None
        bwa_output_filename = None
        try:
            with tempfile.NamedTemporaryFile(mode='w',
                                             delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.fastq') as file:
                read_fastq_filename = file.name
                file.write(read.fastq)
            # Only the unique name is needed: the file is deleted when the
            # context exits and bwa writes to that path through ``-o``.
            with tempfile.NamedTemporaryFile(delete=True,
                                             prefix='nadavca_tmp',
                                             suffix='.sam') as file:
                bwa_output_filename = file.name
            subprocess.run([
                self.bwa_executable, 'mem', self.reference_filename,
                read_fastq_filename, '-o', bwa_output_filename
            ],
                           stderr=subprocess.PIPE,
                           check=True)
            with simplesam.Reader(open(bwa_output_filename, 'r')) as reader:
                sam = reader.next()
                if not sam.mapped:
                    return None
                # sam.pos is shifted by one to match the index used with
                # the bwapy result.
                return sam.cigar, sam.reverse, sam.pos - 1
        finally:
            # Clean up on every path, including unmapped reads and bwa
            # failures, which previously left the files behind.
            for path in (read_fastq_filename, bwa_output_filename):
                if path is not None:
                    self._remove_if_exists(path)

    def _get_base_alignment(self, read):
        """Align *read* and list the positions where read and reference agree.

        :returns: ``(base_mapping, is_reverse_complement)`` where
            base_mapping is an integer array of
            ``(index in read, index in reference)`` rows, or None when the
            read does not align.
        :raises ValueError: on a CIGAR operation other than S, M, D or I.
        """
        if self.bwapy_aligner:
            alignments = self.bwapy_aligner.align_seq(''.join(read.sequence))
            if len(alignments) == 0:
                return None
            alignment = alignments[0]
            cigar = alignment.cigar
            is_reverse_complement = alignment.orient == '-'
            mapped_position = alignment.pos
        else:
            outcome = self._align_with_executable(read)
            if outcome is None:
                return None
            cigar, is_reverse_complement, mapped_position = outcome

        oriented_read = Genome.reverse_complement(
            read.sequence) if is_reverse_complement else read.sequence

        index_in_read = 0
        index_in_reference = mapped_position
        base_mapping = []
        for num, operation in self._parse_cigar(cigar):
            if operation == 'S':
                index_in_read += num
            elif operation == 'M':
                for _ in range(num):
                    # Only exact matches are recorded.
                    if self.reference[index_in_reference] == oriented_read[
                            index_in_read]:
                        base_mapping.append(
                            (index_in_read, index_in_reference))
                    index_in_read += 1
                    index_in_reference += 1
            elif operation == 'D':
                index_in_reference += num
            elif operation == 'I':
                index_in_read += num
            else:
                raise ValueError(
                    'Unknown cigar operation: {}'.format(operation))

        if is_reverse_complement:
            # Mirror both coordinates and restore ascending order.
            last_read = len(read.sequence) - 1
            last_reference = len(self.reference) - 1
            base_mapping = [(last_read - in_read, last_reference - in_ref)
                            for in_read, in_ref in base_mapping]
            base_mapping.reverse()
        return numpy.array(base_mapping, dtype=int), is_reverse_complement

    def get_signal_alignment(self, read, bandwidth):
        """Build the approximate signal-to-reference alignment of *read*.

        :param read: read providing ``normalized_signal`` and the
            sequence-to-signal mapping.
        :param bandwidth: number of signal samples by which the signal
            range is widened on each side, clamped to the signal.
        :returns: ``ApproximateSignalAlignment``, or None when the read does
            not align or no aligned base maps to the signal.
        """
        base_alignment = self._get_base_alignment(read)
        if base_alignment is None:
            return None
        base_mapping, is_reverse_complement = base_alignment
        signal_mapping = self.convert_mapping(base_mapping, read)
        if len(signal_mapping) == 0:
            # Nothing to anchor the ranges on; report "no alignment".
            return None

        start_in_reference = signal_mapping[0][1]
        end_in_reference = signal_mapping[-1][1] + 1
        signal_mapping[:, 1] -= start_in_reference
        if is_reverse_complement:
            start_in_reference, end_in_reference = \
                len(self.reference) - end_in_reference, \
                len(self.reference) - start_in_reference
        reference_range = (start_in_reference, end_in_reference)

        start_in_signal = signal_mapping[0][0]
        end_in_signal = signal_mapping[-1][0] + 1
        extended_start_in_signal = max(0, start_in_signal - bandwidth)
        extended_end_in_signal = min(len(read.normalized_signal),
                                     end_in_signal + bandwidth)
        signal_range = (extended_start_in_signal, extended_end_in_signal)
        signal_mapping[:, 0] -= extended_start_in_signal

        start_in_read_sequence = base_mapping[0][0]
        end_in_read_sequence = base_mapping[-1][0] + 1
        read_sequence_range = (start_in_read_sequence, end_in_read_sequence)

        return ApproximateSignalAlignment(
            alignment=signal_mapping,
            signal_range=signal_range,
            reference_range=reference_range,
            read_sequence_range=read_sequence_range,
            reverse_complement=is_reverse_complement)
def __init__(self, index, nm=0.1):
    """Load the bwa index and remember the acceptance threshold.

    :param index: bwa index path passed to ``BwaAligner``.
    :param nm: stored on ``self.NM``.
    """
    # CIGAR patterns capturing the lengths of M and S operations.
    match_pattern = '([0-9]+)M'
    clip_pattern = '([0-9]+)S'
    self.matched = re.compile(match_pattern)
    self.clipped = re.compile(clip_pattern)
    self.aligner = BwaAligner(index, options='-k 12 -B 3')
    self.NM = nm
    self.aligned = None
def __init__(self, bam, index):
    """Open the BAM file and load the bwa index.

    :param bam: path handed to ``pysam.Samfile``.
    :param index: bwa index path handed to ``BwaAligner``.
    """
    bam_handle = pysam.Samfile(bam)
    self.bam = bam_handle
    bwa_options = '-k 12'
    self.aligner = BwaAligner(index, options=bwa_options)
def load_index():
    """Create the module-level ``aligner`` from the module-level ``index``.

    Progress is printed before and after the index is loaded.
    """
    global aligner
    start_message = 'Loading Index: ' + index
    print(start_message)
    aligner = BwaAligner(index)
    print('Index Loaded')