Beispiel #1
0
    def __init__(self,
                 index,
                 *args,
                 clean=False,
                 bwa_opts='-x ont2d',
                 **kwargs):
        """Set up the bwa mem alignment service backed by the python binding.

        :param index: bwa index base path, or list thereof.
        :param clean: accepted by the signature; this initialiser does not
            read it.
        :param bwa_opts: command line options handed to bwa mem.

        :raises ImportError: if ``BwaAligner`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index, self.bwa_opts = index, bwa_opts

        # The attribute exists (as None) even when the guard below raises.
        self.aligner = None
        if BwaAligner is None:
            message = '{} requires BwaAligner which could not be imported.'
            raise ImportError(message.format(self.__class__.__name__))
        self.aligner = BwaAligner(index, options=bwa_opts)
        self.logger.info('bwa service started.')
Beispiel #2
0
 def __init__(self):
     """Open the hard-coded BAM file and build the aligner for the hard-coded index.

     Also compiles the two CIGAR patterns (``M`` and ``S`` run lengths) and
     starts with no stored alignments.
     """
     bam_path = '/stor/work/Lambowitz/cdw2854/cfNA/tgirt_map/merged_bam/dedup/unfragmented.chrM_filter.bam'
     index_path = '/stor/work/Lambowitz/ref/Ecoli/BL21_DE3.fa'
     self.bam = pysam.Samfile(bam_path)
     self.aligner = BwaAligner(index_path, options='-k 12')
     self.matched, self.clipped = (
         re.compile(pattern) for pattern in ('([0-9]+)M', '([0-9]+)S'))
     self.alignments = None
Beispiel #3
0
    def align(self, sequence):
        """Run bwa mem on a single base sequence.

        The aligner is rebuilt from ``self.index`` and ``self.bwa_opts``
        whenever it is currently ``None``.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        debug_message = "Aligning sequence of length {}."
        self.logger.debug(debug_message.format(len(sequence)))
        hits = self.aligner.align_seq(sequence)
        return hits
Beispiel #4
0
 def __init__(self, reference_path, config, log_filename=None):
     """Build the aligner and cache the logarithm of each config value.

     Every config entry named below is stored on the instance as
     ``log_<key>``. When ``log_filename`` is truthy the file is opened for
     writing and kept in ``self.log_file``; otherwise that attribute is None.
     """
     self.aligner = BwaAligner(reference_path)
     config_keys = (
         'SNP_insert',
         'non_SNP_insert',
         'SNP_delete',
         'non_SNP_delete',
         'SNP_false_good',
         'non_SNP_good',
         'SNP_true_substitute',
         'SNP_false_substitute',
         'non_SNP_substitute',
     )
     for key in config_keys:
         setattr(self, 'log_' + key, np.log(config[key]))
     self.log_file = None
     if not log_filename:
         return
     self.log_file = open(log_filename, 'w')
Beispiel #5
0
class ecoli_mapper():
    """Re-align reads from a BAM region with bwa and report the aligned fraction."""

    # Paths used when the caller does not supply its own (the values that
    # were previously hard-coded in __init__).
    DEFAULT_BAM = '/stor/work/Lambowitz/cdw2854/cfNA/tgirt_map/merged_bam/dedup/unfragmented.chrM_filter.bam'
    DEFAULT_INDEX = '/stor/work/Lambowitz/ref/Ecoli/BL21_DE3.fa'

    def __init__(self, bam=None, index=None):
        """Open the BAM file and load the bwa index.

        :param bam: path of the BAM file to read; ``None`` selects
            ``DEFAULT_BAM``.
        :param index: bwa index path; ``None`` selects ``DEFAULT_INDEX``.
        """
        bam = self.DEFAULT_BAM if bam is None else bam
        index = self.DEFAULT_INDEX if index is None else index
        self.bam = pysam.Samfile(bam)
        self.aligner = BwaAligner(index, options='-k 12')
        self.matched = re.compile('([0-9]+)M')
        self.clipped = re.compile('([0-9]+)S')
        self.alignments = None

    def ecoli_map(self, chrom, start, end):
        """Return the fraction of reads in the region with an acceptable hit.

        Every read fetched from ``chrom:start-end`` is aligned; the raw
        alignment lists are kept in ``self.alignments``. A read counts as
        aligned when at least one of its hits passes ``filter_bad_cigar``.

        :returns: aligned reads / fetched reads as a float, or ``0.0`` when
            the region contains no reads.
        """
        aligned = 0
        total = 0
        self.alignments = []
        for aln in self.bam.fetch(chrom, start, end):
            total += 1
            alns = self.aligner.align_seq(aln.query_sequence)
            self.alignments.append(alns)
            if any(self.filter_bad_cigar(hit) for hit in alns):
                aligned += 1
        if total == 0:
            # An empty region used to fail on the unbound loop counter.
            return 0.0
        return float(aligned) / total

    def filter_bad_cigar(self, aln):
        """Tell whether a hit is good enough to count.

        A hit passes when its soft-clipped bases are fewer than 20% of its
        matched (``M``) bases and its ``NM`` value is below 3. A hit with no
        matched bases at all never passes.
        """
        clipped_base = sum(map(int, self.clipped.findall(aln.cigar)))
        mapped_base = sum(map(int, self.matched.findall(aln.cigar)))
        if mapped_base == 0:
            # Avoid dividing by zero for CIGARs without any M operation.
            return False
        return (float(clipped_base) / mapped_base) < 0.2 and aln.NM < 3
Beispiel #6
0
 def __init__(self,
              in_vcf,
              reference,
              out_vcf="/dev/stdout",
              min_length=50,
              threshold=0.8,
              min_distance=10,
              anno_hits=0):
     """Store the remapping settings and build the aligner for ``reference``."""
     # Input and output locations.
     self.in_vcf, self.out_vcf = in_vcf, out_vcf
     self.reference = reference
     # Settings are stored verbatim on the instance.
     self.min_length, self.min_distance = min_length, min_distance
     self.threshold = threshold
     self.anno_hits = anno_hits
     # No header has been prepared yet.
     self.n_header = None
     self.aligner = BwaAligner(reference, options="-a")
Beispiel #7
0
class BwapyServe(rpc.AttrHandler):
    """RPC handler exposing bwa mem alignment through the bwapy binding."""

    def __init__(self,
                 index,
                 *args,
                 clean=False,
                 bwa_opts='-x ont2d',
                 **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param clean: accepted by the signature; not read by this class.
        :param bwa_opts: command line options for bwa mem.

        :raises ImportError: if ``BwaAligner`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index
        self.bwa_opts = bwa_opts

        # Defined up front so the attribute exists even if the guard raises.
        self.aligner = None
        if BwaAligner is None:
            raise ImportError(
                '{} requires BwaAligner which could not be imported.'.format(
                    self.__class__.__name__))
        # Report the options through the logger instead of printing to stdout.
        self.logger.debug('bwa options: %s', self.bwa_opts)
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')

    def _clean_index(self):
        """Drop the reference to the aligner object."""
        self.logger.info('Cleaning alignment proxy.')
        self.aligner = None

    def __del__(self):
        # __del__ also runs for objects whose __init__ failed before the
        # logger was assigned; skip the cleanup in that case.
        if hasattr(self, 'logger'):
            self._clean_index()

    @rpc.method
    def clean_index(self):
        """Destroy the aligner object, which will cleanup the index in memory."""
        self._clean_index()

    @rpc.method
    async def align(self, sequence):
        """Align a base sequence.

        Declared with ``async def``: the ``asyncio.coroutine`` decorator used
        previously no longer exists in Python 3.11 and later.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            # The aligner may have been dropped by clean_index(); rebuild it.
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.debug("Aligning sequence of length {}.".format(
            len(sequence)))
        results = self.aligner.align_seq(sequence)
        self.logger.info("Aligned sequence of {} bases with {} hits.".format(
            len(sequence), len(results)))
        return results
Beispiel #8
0
    def __init__(self, bwa_executable, reference, reference_filename, references_dict=None):
        """Store the inputs, make sure a bwa index exists and try to load bwapy.

        ``bwa index`` is run on ``reference_filename`` when no ``.bwt`` file
        sits next to it. If bwapy cannot be imported, ``self.bwapy_aligner``
        stays ``None`` and a notice is written to stderr.
        """
        self.bwa_executable = bwa_executable
        assert not (reference is None and references_dict is None), "one of reference or reference_dict must be not None"
        self.reference = reference
        self.references_dict = references_dict
        self.reference_filename = reference_filename

        # A heuristic: an existing <reference>.bwt means the index was built.
        bwt_path = self.reference_filename + '.bwt'
        if not os.path.isfile(bwt_path):
            index_command = [self.bwa_executable, 'index', self.reference_filename]
            subprocess.run(index_command, stderr=subprocess.PIPE, check=True)

        self.bwapy_aligner = None
        try:
            from bwapy import BwaAligner
            self.bwapy_aligner = BwaAligner(reference_filename, options='-x ont2d')
        except ImportError:
            fallback_notice = "Could't import bwapy, will use bwa executable to align reads\n"
            sys.stderr.write(fallback_notice)
Beispiel #9
0
    def __init__(self, index, *args, map_opts=None, **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param map_opts: command line options for bwa mem as dictionary
            (option letter -> value). ``None`` selects ``{'x': 'ont2d'}``;
            a fresh dict is built per call so no default object is shared.

        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index

        if map_opts is None:
            map_opts = {'x': 'ont2d'}
        # Expand map_opts to a single option string, e.g. "-x ont2d".
        self.bwa_opts = ' '.join(
            '-{} {}'.format(k, v) for k, v in map_opts.items())

        # Defined first so the attribute exists even if construction fails.
        self.aligner = None
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')
Beispiel #10
0
class BwapyServe(rpc.AttrHandler):
    """RPC handler exposing bwa mem alignment through the bwapy binding."""

    def __init__(self, index, *args, clean=False, bwa_opts='-x ont2d', **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param clean: accepted by the signature; not read by this class.
        :param bwa_opts: command line options for bwa mem.

        :raises ImportError: if ``BwaAligner`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index
        self.bwa_opts = bwa_opts

        # Defined up front so the attribute exists even if the guard raises.
        self.aligner = None
        if BwaAligner is None:
            raise ImportError(
                '{} requires BwaAligner which could not be imported.'.format(
                    self.__class__.__name__))
        # Report the options through the logger instead of printing to stdout.
        self.logger.debug('bwa options: %s', self.bwa_opts)
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')

    def _clean_index(self):
        """Drop the reference to the aligner object."""
        self.logger.info('Cleaning alignment proxy.')
        self.aligner = None

    def __del__(self):
        # __del__ also runs for objects whose __init__ failed before the
        # logger was assigned; skip the cleanup in that case.
        if hasattr(self, 'logger'):
            self._clean_index()

    @rpc.method
    def clean_index(self):
        """Destroy the aligner object, which will cleanup the index in memory."""
        self._clean_index()

    @rpc.method
    async def align(self, sequence):
        """Align a base sequence.

        Declared with ``async def``: the ``asyncio.coroutine`` decorator used
        previously no longer exists in Python 3.11 and later.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            # The aligner may have been dropped by clean_index(); rebuild it.
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.debug("Aligning sequence of length {}.".format(len(sequence)))
        results = self.aligner.align_seq(sequence)
        self.logger.info("Aligned sequence of {} bases with {} hits.".format(
            len(sequence), len(results)
        ))
        return results
Beispiel #11
0
    def __init__(self, bwa_executable, reference, reference_filename):
        """Store the inputs, make sure a bwa index exists and try to load bwapy.

        ``bwa index`` is run on ``reference_filename`` when no ``.bwt`` file
        sits next to it. If bwapy cannot be imported, ``self.bwapy_aligner``
        stays ``None`` and a notice is written to stderr.
        """
        self.bwa_executable = bwa_executable
        self.reference = reference
        self.reference_filename = reference_filename

        # A heuristic: an existing <reference>.bwt means the index was built.
        bwt_path = self.reference_filename + '.bwt'
        if not os.path.isfile(bwt_path):
            index_command = [
                self.bwa_executable, 'index', self.reference_filename
            ]
            subprocess.run(index_command, stderr=subprocess.PIPE, check=True)

        self.bwapy_aligner = None
        try:
            from bwapy import BwaAligner
            self.bwapy_aligner = BwaAligner(reference_filename)
        except ImportError:
            fallback_notice = (
                "Could't import bwapy, will use bwa executable to align reads\n"
            )
            sys.stderr.write(fallback_notice)
Beispiel #12
0
class BwapyServe(rpc.AttrHandler):
    """RPC handler exposing bwa mem alignment through the bwapy binding."""

    def __init__(self, index, *args, map_opts=None, **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param map_opts: command line options for bwa mem as dictionary
            (option letter -> value). ``None`` selects ``{'x': 'ont2d'}``;
            a fresh dict is built per call so no default object is shared.

        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index

        if map_opts is None:
            map_opts = {'x': 'ont2d'}
        # Expand map_opts to a single option string, e.g. "-x ont2d".
        self.bwa_opts = ' '.join(
            '-{} {}'.format(k, v) for k, v in map_opts.items())

        # Defined first so the attribute exists even if construction fails.
        self.aligner = None
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')

    def _clean_index(self):
        """Drop the reference to the aligner object."""
        self.logger.info('Cleaning alignment proxy.')
        self.aligner = None

    def __del__(self):
        # __del__ also runs for objects whose __init__ failed before the
        # logger was assigned; skip the cleanup in that case.
        if hasattr(self, 'logger'):
            self._clean_index()

    @rpc.method
    def clean_index(self):
        """Destroy the aligner object, which will cleanup the index in memory."""
        self._clean_index()

    @rpc.method
    async def align(self, sequence):
        """Align a base sequence.

        Declared with ``async def``: the ``asyncio.coroutine`` decorator used
        previously no longer exists in Python 3.11 and later.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            # The aligner may have been dropped by clean_index(); rebuild it.
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.debug("Aligning sequence of length {}.".format(
            len(sequence)))
        results = self.aligner.align_seq(sequence)
        self.logger.info("Aligned sequence of {} bases with {} hits.".format(
            len(sequence), len(results)))
        return results
Beispiel #13
0
    def align(self, sequence):
        """Run bwa mem on a single base sequence and log the hit count.

        The aligner is rebuilt from ``self.index`` and ``self.bwa_opts``
        whenever it is currently ``None``.

        :param sequence: sequence to align.

        :returns: the output of bwa mem call.
        """
        if self.aligner is None:
            self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        debug_message = "Aligning sequence of length {}.".format(len(sequence))
        self.logger.debug(debug_message)
        results = self.aligner.align_seq(sequence)
        info_template = "Aligned sequence of {} bases with {} hits."
        self.logger.info(info_template.format(len(sequence), len(results)))
        return results
Beispiel #14
0
def grm_main(cmdargs):
    """
    Entry point of the grm command.

    Builds a graph-genome from the vcf and reference,
    creates the sets of kmers at edges in the graph,
    maps those kmers globally to the reference and
    reports mapping metrics. Exits with status 1 when bwapy is missing.

    Todo:
    - document the bwa package and how to install
    - better column names along with documentation
    """
    if not HASBWALIB:
        logging.error("bwapy isn't available on this machine")
        sys.exit(1)
    args = parse_args(cmdargs)
    m_ranges = (truvari.bed_ranges(args.regions)
                if args.regions else truvari.ref_ranges(args.reference))

    grm_shared.aligner = BwaAligner(args.reference, '-a')
    # One column per (prefix, metric) pair, after the leading "key" column.
    prefixes = ("rup_", "rdn_", "aup_", "adn_")
    metrics = (
        "nhits", "avg_q", "avg_ed", "avg_mat", "avg_mis", "dir_hits",
        "com_hits", "max_q", "max_ed", "max_mat", "max_mis",
        "max_strand", "min_q", "min_ed", "min_mat", "min_mis",
        "min_strand",
    )
    grm_shared.header = ["key"] + [
        prefix + metric for prefix in prefixes for metric in metrics
    ]
    grm_shared.ref_filename = args.reference
    grm_shared.kmersize = args.kmersize
    grm_shared.input = args.input
    grm_shared.min_size = args.min_size
    with multiprocessing.Pool(args.threads, maxtasksperchild=1) as pool:
        logging.info("Processing")
        chunks = pool.imap(process_entries, m_ranges)
        pool.close()
        data = pd.concat(chunks, ignore_index=True)
        logging.info("Saving; df shape %s", data.shape)
        joblib.dump(data, args.output)
        logging.info("Finished grm")
        pool.join()
Beispiel #15
0
def recover_ccs_reads(short_reads, ref_fasta, ss_index, gtf_index,
                      intron_index, is_canonical, out_dir, prefix, threads):
    """Re-scan short reads in a worker pool and append the results to a FASTA file.

    Reads are handed to ``recover_ccs_chunk`` in batches of 250. Each
    returned record is appended to ``<out_dir>/<prefix>.cand_circ.fa`` and
    the per-batch counters are summed.

    :returns: ``defaultdict(int)`` with the summed counters of all batches.
    """
    from bwapy import BwaAligner

    # Second scanning of short reads
    genome = Fasta(ref_fasta)
    bwa_aligner = Aligner(BwaAligner(ref_fasta, options='-x ont2d -T 19'))

    worker_state = (bwa_aligner, genome.contig_len, genome, gtf_index,
                    intron_index, ss_index)
    pool = Pool(threads, env.initializer, worker_state)
    jobs = []
    for batch in grouper(short_reads, 250):
        # None entries in a batch are dropped before dispatch.
        chunk = [read for read in batch if read is not None]
        jobs.append(pool.apply_async(recover_ccs_chunk, (chunk, is_canonical)))
    pool.close()

    prog = ProgressBar()
    prog.update(0)

    reads_count = defaultdict(int)
    record_template = '>{}\t{}\t{}\t{}\t{}\t{}\t{}\n{}\n'
    fasta_path = '{}/{}.cand_circ.fa'.format(out_dir, prefix)
    with open(fasta_path, 'a') as out:
        for finished_cnt, job in enumerate(jobs, 1):
            tmp_cnt, ret = job.get()
            for key, value in tmp_cnt.items():
                reads_count[key] += value

            for rid, cid, strand, exon_tag, ssid, clip, segs, cseq in ret:
                out.write(record_template.format(
                    rid, cid, strand, exon_tag, ssid, clip, segs, cseq))
            prog.update(100 * finished_cnt / len(jobs))

    pool.join()
    prog.update(100)

    return reads_count
Beispiel #16
0
class exogenous_mapper():
    """Decide whether a sequence aligns acceptably to a bwa index."""

    def __init__(self, index, nm = 0.1):
        """Load the bwa index and compile the CIGAR patterns.

        :param index: bwa index path passed to ``BwaAligner``.
        :param nm: upper bound for (soft-clipped bases + NM) / matched bases.
        """
        self.matched = re.compile('([0-9]+)M')
        self.clipped = re.compile('([0-9]+)S')
        self.aligner = BwaAligner(index, options = '-k 12 -B 3')
        self.NM = nm
        self.aligned = None

    def map(self, seq):
        """Align ``seq`` and return 1 if any hit passes the filter, else 0.

        The passing hits are kept in ``self.aligned``.
        """
        # Clear the previous result first so a failing alignment call does
        # not leave stale hits behind.
        self.aligned = None
        self.aligned = [
            aln for aln in self.aligner.align_seq(seq)
            if self.filter_bad_cigar(aln)
        ]
        return 1 if self.aligned else 0

    def filter_bad_cigar(self, aln):
        """Tell whether a hit's clipping plus NM stays within ``self.NM``.

        The ratio is (soft-clipped bases + ``aln.NM``) / matched (``M``)
        bases. A hit with no matched bases at all never passes.
        """
        cigar = aln.cigar
        clipped_base = sum(map(int, self.clipped.findall(cigar)))
        mapped_base = sum(map(int, self.matched.findall(cigar)))
        if mapped_base == 0:
            # Avoid dividing by zero for CIGARs without any M operation.
            return False
        return ((float(clipped_base) + aln.NM) / mapped_base) <= self.NM
Beispiel #17
0
class chrM_mapper():
    """Count strand-matched first-in-pair reads of a region and how many align."""

    def __init__(self, bam, index):
        """Open ``bam`` with pysam and build a BwaAligner for ``index``."""
        self.bam = pysam.Samfile(bam)
        self.aligner = BwaAligner(index, options='-k 12')

    def __align__(self, seq):
        """Return 1 when the aligner reports any alignment for ``seq``, else 0."""
        return int(bool(self.aligner.align_seq(seq)))

    def run_peak(self, chrom, start, end, peak_strand):
        """Tally the reads of one peak.

        Only read-1 records whose strand equals ``peak_strand`` are counted.

        :returns: ``(counted reads, counted reads with an alignment)``.
        """
        total_reads = 0
        aligned_reads = 0
        for record in self.bam.fetch(chrom, start, end):
            if not record.is_read1:
                continue
            record_strand = '-' if record.is_reverse else '+'
            if record_strand != peak_strand:
                continue
            total_reads += 1
            aligned_reads += self.__align__(record.get_forward_sequence())
        return total_reads, aligned_reads
Beispiel #18
0
    def __init__(self, index, *args, clean=False, bwa_opts='-x ont2d', **kwargs):
        """bwa mem alignment server implementation using python binding.

        :param index: bwa index base path, or list thereof.
        :param clean: accepted by the signature; this initialiser does not
            read it.
        :param bwa_opts: command line options for bwa mem.

        :raises ImportError: if ``BwaAligner`` is ``None``.
        """
        super().__init__(*args, **kwargs)
        self.logger = logging.getLogger('BwaServe')
        self.index = index
        self.bwa_opts = bwa_opts

        # Defined up front so the attribute exists even if the guard raises.
        self.aligner = None
        if BwaAligner is None:
            raise ImportError(
                '{} requires BwaAligner which could not be imported.'.format(
                    self.__class__.__name__))
        # Report the options through the logger instead of printing to stdout.
        self.logger.debug('bwa options: %s', self.bwa_opts)
        self.aligner = BwaAligner(self.index, options=self.bwa_opts)
        self.logger.info('bwa service started.')
Beispiel #19
0
# server.py
# from flask import Flask, render_template

# app = Flask(__name__, static_folder="../static/dist", template_folder="../static")

# @app.route("/")
# def index():
#     return render_template("index.html")

# @app.route("/hello")
# def hello():
#     return "Hello World!"

# if __name__ == "__main__":
# app.run()


import pysam
from bwapy import BwaAligner
index = 'path/to/index'  # the path given to bwa index
seq = 'ACGATCGCGATCGA'

# Load the index once, then align the example sequence against it.
aligner = BwaAligner(index)
alignments = aligner.align_seq(seq)
# The closing parenthesis of this print call was missing (SyntaxError).
print('Found {} alignments.'.format(len(alignments)))
for aln in alignments:
    print('  ', aln)
Beispiel #20
0
class Remap():
    """ Class for remapping annotation """
    def __init__(self,
                 in_vcf,
                 reference,
                 out_vcf="/dev/stdout",
                 min_length=50,
                 threshold=0.8,
                 min_distance=10,
                 anno_hits=0):
        """
        Store the settings and load the bwa index of the reference

        :param in_vcf: path of the vcf to annotate
        :param reference: reference path handed to BwaAligner
        :param out_vcf: path the annotated vcf is written to
        :param min_length: minimum entry size for an entry to be remapped
        :param threshold: minimum fraction of the query a hit must cover
        :param min_distance: for deletions, hits on the same chromosome
            closer than this to the entry are ignored
        :param anno_hits: number of top hits reported in REMAPHits
            (0 turns the field off)
        """
        self.in_vcf = in_vcf
        self.reference = reference
        self.out_vcf = out_vcf
        self.min_length = min_length
        self.threshold = threshold
        self.min_distance = min_distance
        self.anno_hits = anno_hits
        self.n_header = None
        self.aligner = BwaAligner(self.reference, options="-a")

    def edit_header(self, header=None):
        """
        Edits and holds on to the header

        When no header is given, a copy of the input vcf's header is used.
        """
        if header is None:
            with pysam.VariantFile(self.in_vcf, 'r') as fh:
                header = fh.header.copy()
        header.add_line(('##INFO=<ID=REMAP,Number=1,Type=String,'
                         'Description="Annotation of alt-seq remapping">'))
        if self.anno_hits:
            header.add_line(('##INFO=<ID=REMAPHits,Number=.,Type=String,'
                             'Description="List of chr:start-end of hits">'))

        self.n_header = header

    def get_end(self, pos, cigar):  # pylint: disable=no-self-use
        """
        Expand a cigar to get the end position and how many bases are clipped

        Returns (pos advanced by every M and D length, total S length)
        """
        soft_bases = 0
        for i in cigmatch.findall(cigar):
            if i[-1] == "S":
                soft_bases += int(i[:-1])
            elif i[-1] in ["M", "D"]:
                pos += int(i[:-1])
        return pos, soft_bases

    def remap_entry(self, entry, threshold=.8):
        """
        Map a sequence and return the information from it

        Returns (label, all_hits) where label is one of "novel", "tandem",
        "partial" or "interspersed" and all_hits is a list of
        (pct_query, hit_string) tuples kept sorted in ascending order
        """
        is_del = truvari.entry_variant_type(entry) == "DEL"
        if is_del:
            seq = str(entry.ref)
        else:
            seq = entry.alts[0]
        seq_len = len(seq)
        all_hits = []
        num_hits = 0
        partial_hits = 0
        close_dist = None
        for aln in self.aligner.align_seq(seq):
            # Take out the 'same spot' alignment for deletions
            dist = abs(aln.pos - entry.pos)
            if is_del and aln.rname == entry.chrom and dist < self.min_distance:
                continue

            # Filter hits below threshold
            end, soft = self.get_end(aln.pos, aln.cigar)
            pct_query = (seq_len - soft) / seq_len
            if pct_query < threshold:
                partial_hits += 1
                continue
            hit = f"{aln.rname}:{aln.pos}-{end}.{int(pct_query*100)}"
            bisect.insort(all_hits, (pct_query, hit))
            num_hits += 1
            if aln.rname != entry.chrom:
                continue
            if close_dist is None or dist < close_dist:
                close_dist = dist

        if num_hits == 0 and partial_hits == 0:
            return "novel", all_hits
        # NOTE(review): a closest distance of exactly 0 is falsy here and so
        # is not reported as "tandem" - confirm whether that is intended.
        if close_dist and close_dist <= len(seq):
            return "tandem", all_hits
        if num_hits == 0 and partial_hits != 0:
            return "partial", all_hits

        return "interspersed", all_hits

    def annotate_entry(self, entry):
        """
        Annotates a single entry with REMAP (and optionally REMAPHits)

        Entries smaller than min_length are returned untouched
        """
        if truvari.entry_size(entry) >= self.min_length:
            entry.translate(self.n_header)
            # Use the threshold configured on the instance; it was previously
            # ignored in favour of remap_entry's own default.
            remap, hits = self.remap_entry(entry, self.threshold)
            entry.info["REMAP"] = remap
            if self.anno_hits and hits:
                entry.info["REMAPHits"] = [
                    _[1] for _ in hits[-self.anno_hits:]
                ]
        return entry

    def annotate_vcf(self):
        """
        Annotates the vcf and writes to new vcf
        """
        with pysam.VariantFile(self.in_vcf) as fh:
            self.edit_header(fh.header.copy())
            # Close the output explicitly so buffered records are flushed
            with pysam.VariantFile(self.out_vcf, 'w',
                                   header=self.n_header) as out:
                for entry in fh:
                    out.write(self.annotate_entry(entry))
Beispiel #21
0
class basecall_model(model):
    """Model that adjusts per-base log-probabilities from a basecall alignment.

    The read's basecall is aligned to the reference through ``BwaAligner``,
    every reference position is classified from that alignment, and a
    configured log value is added to the candidate probabilities of each
    base of interest.
    """

    def __init__(self, reference_path, config, log_filename=None):
        """Load the aligner and cache the logarithm of each config value.

        :param reference_path: path handed to ``BwaAligner``.
        :param config: mapping providing the nine ``SNP_*`` / ``non_SNP_*``
            entries read below; each is passed through ``np.log``.
        :param log_filename: optional path; when truthy it is opened for
            writing and kept in ``self.log_file``.
        """
        self.aligner = BwaAligner(reference_path)
        self.log_SNP_insert = np.log(config['SNP_insert'])
        self.log_non_SNP_insert = np.log(config['non_SNP_insert'])
        self.log_SNP_delete = np.log(config['SNP_delete'])
        self.log_non_SNP_delete = np.log(config['non_SNP_delete'])
        self.log_SNP_false_good = np.log(config['SNP_false_good'])
        self.log_non_SNP_good = np.log(config['non_SNP_good'])
        self.log_SNP_true_substitute = np.log(config['SNP_true_substitute'])
        self.log_SNP_false_substitute = np.log(config['SNP_false_substitute'])
        self.log_non_SNP_substitute = np.log(config['non_SNP_substitute'])
        self.log_file = None
        if log_filename:
            # NOTE(review): this handle is not closed anywhere in this class.
            self.log_file = open(log_filename, 'w')

    def update_probabilities_internal(self, reference, read,
                                      interesting_bases):
        """Align one read and update the log-probabilities of selected bases.

        Always returns ``None``; results are written into the
        ``log_probability`` of each object in ``interesting_bases`` (and to
        the log file, if one is open). Nothing is updated when the aligner
        reports no alignment, or when the read is on the '-' strand but the
        first alignment's orientation is not '-'.

        :param reference: reference sequence, indexed by position.
        :param read: object providing ``basecall`` and ``strand``.
        :param interesting_bases: iterable of objects providing ``id`` (an
            index into ``reference``) and ``log_probability`` (indexed in
            step with the module-level ``alphabet``).
        """
        alignments = self.aligner.align_seq("".join(read.basecall))
        if len(alignments) == 0:
            return
        # Only the first reported alignment is used.
        alignment = alignments[0]
        # NOTE(review): parse_cigar / aligned_length are defined elsewhere;
        # the loop below treats each op as (length, operation letter).
        cigar = parse_cigar(alignment.cigar)
        aligned_reference_length = aligned_length(cigar)
        reference_aligned = []
        corresponding_read_index = [None for i in reference]
        read_aligned = []
        # For '-' strand reads the start is mirrored from the far end of the
        # reference, so the walk below runs in the read's own direction.
        reference_index = alignment.pos if read.strand == '+' else len(
            reference) - alignment.pos - aligned_reference_length
        if read.strand == '-':
            if alignment.orient != '-':
                print("read and aligner don't agree on its orientation")
                return
            cigar = list(reversed(cigar))
        read_index = 0

        # One status letter per reference position.
        UNALIGNED, GOOD, SUBSTITUTION, DELETION, NEAR_INSERTION = 'U', 'G', 'S', 'D', 'I'
        # Everything before the aligned region is unaligned.
        reference_status = [UNALIGNED for i in reference[:reference_index]]

        # Reference indices at which an insertion in the read occurs.
        insertions = []

        # Walk the CIGAR, advancing the read and reference cursors together.
        # reference_aligned / read_aligned collect the gapped alignment but
        # are not read again within this method.
        for op in cigar:
            if op[1] == 'S':
                # Soft clip: consumes read bases only.
                read_index += op[0]
            elif op[1] == 'M':
                reference_aligned += reference[
                    reference_index:reference_index + op[0]]
                for i in range(op[0]):
                    # Remember which read base sits on each reference position.
                    corresponding_read_index[reference_index +
                                             i] = read_index + i
                    if reference[reference_index +
                                 i] == read.basecall[read_index + i]:
                        reference_status.append(GOOD)
                    else:
                        reference_status.append(SUBSTITUTION)
                reference_index += op[0]
                read_aligned += read.basecall[read_index:read_index + op[0]]
                read_index += op[0]
            elif op[1] == 'D':
                # Deletion: consumes reference positions only.
                reference_aligned += reference[
                    reference_index:reference_index + op[0]]
                reference_status += [DELETION for i in range(op[0])]
                reference_index += op[0]
                read_aligned += ['-' for i in range(op[0])]
            elif op[1] == 'I':
                # Insertion: consumes read bases; only its reference index is
                # recorded, no status entry is appended.
                read_aligned += read.basecall[read_index:read_index + op[0]]
                read_index += op[0]
                reference_aligned += ['-' for i in range(op[0])]
                insertions.append(reference_index)
            else:
                raise ValueError("Unknown operation: {}".format(op[1]))
        # Everything after the aligned region is unaligned.
        reference_status += [UNALIGNED for i in reference[reference_index:]]

        # GOOD positions directly next to an insertion are re-labelled.
        for idx in insertions:
            if idx > 0 and reference_status[idx - 1] == GOOD:
                reference_status[idx - 1] = NEAR_INSERTION
            if idx < len(reference) and reference_status[idx] == GOOD:
                reference_status[idx] = NEAR_INSERTION

        for b in interesting_bases:
            if reference_status[b.id] == UNALIGNED:
                # No evidence from this read for an unaligned position.
                pass
            elif reference_status[b.id] == SUBSTITUTION:
                # Three cases per candidate letter: it is the reference base,
                # it is the base the read shows, or it is neither.
                for i, c in enumerate(alphabet):
                    if c == reference[b.id]:
                        b.log_probability[i] += self.log_non_SNP_substitute
                    elif c == read.basecall[corresponding_read_index[b.id]]:
                        b.log_probability[i] += self.log_SNP_true_substitute
                    else:
                        b.log_probability[i] += self.log_SNP_false_substitute
            else:
                # Remaining statuses select a pair of values: one for the
                # reference letter, one for every other letter.
                if_SNP, if_non_SNP = None, None
                if reference_status[b.id] == GOOD:
                    if_SNP = self.log_SNP_false_good
                    if_non_SNP = self.log_non_SNP_good
                elif reference_status[b.id] == NEAR_INSERTION:
                    if_SNP = self.log_SNP_insert
                    if_non_SNP = self.log_non_SNP_insert
                elif reference_status[b.id] == DELETION:
                    if_SNP = self.log_SNP_delete
                    if_non_SNP = self.log_non_SNP_delete
                for i, c in enumerate(alphabet):
                    if c == reference[b.id]:
                        b.log_probability[i] += if_non_SNP
                    else:
                        b.log_probability[i] += if_SNP

            # Logging covers aligned positions of '+' strand reads only:
            # "<id> <status>" plus the read base for G and S statuses.
            if self.log_file and reference_status[
                    b.id] != UNALIGNED and read.strand == '+':
                self.log_file.write('{} {}'.format(b.id,
                                                   reference_status[b.id]))
                if reference_status[b.id] == GOOD or reference_status[
                        b.id] == SUBSTITUTION:
                    self.log_file.write(' {}\n'.format(
                        read.basecall[corresponding_read_index[b.id]]))
                else:
                    self.log_file.write('\n')
Beispiel #22
0
class ApproximateAligner:
    """Align reads to a reference with bwa and derive signal-level mappings.

    The bwapy binding is used when it can be imported; otherwise the bwa
    executable is run on temporary files.
    """

    def __init__(self, bwa_executable, reference, reference_filename):
        """Store the inputs, make sure a bwa index exists and try to load bwapy.

        :param bwa_executable: path or name of the bwa binary.
        :param reference: reference sequence, indexed by position.
        :param reference_filename: file the bwa index is built from.
        :raises subprocess.CalledProcessError: if ``bwa index`` fails.
        """
        self.bwa_executable = bwa_executable
        self.reference = reference
        self.reference_filename = reference_filename

        # A heuristic to check if bwa index has been run already
        if not os.path.isfile(self.reference_filename + '.bwt'):
            subprocess.run(
                [self.bwa_executable, 'index', self.reference_filename],
                stderr=subprocess.PIPE,
                check=True)

        self.bwapy_aligner = None
        try:
            from bwapy import BwaAligner
            self.bwapy_aligner = BwaAligner(reference_filename)
        except ImportError:
            sys.stderr.write(
                "Could't import bwapy, will use bwa executable to align reads\n"
            )

    @staticmethod
    def _parse_cigar(cigar):
        """Split a CIGAR string into a list of ``(length, operation)`` tuples."""
        num = 0
        result = []
        for character in cigar:
            if '0' <= character <= '9':
                num = num * 10 + (ord(character) - ord('0'))
            else:
                result.append((num, character))
                num = 0
        return result

    @staticmethod
    def convert_mapping(base_mapping, read):
        """Translate read-base indices of a mapping into signal indices.

        Pairs whose read index is missing from
        ``read.sequence_to_signal_mapping`` are dropped.

        :returns: integer array of ``(signal index, reference index)`` rows.
        """
        to_signal = read.sequence_to_signal_mapping
        result = [(to_signal[index_in_read], index_in_reference)
                  for index_in_read, index_in_reference in base_mapping
                  if index_in_read in to_signal]
        # The builtin int replaces the numpy.int alias, which newer NumPy
        # releases no longer provide.
        return numpy.array(result, dtype=int)

    def _align_with_bwapy(self, read):
        """Return ``(cigar, is_reverse, position)`` of the first bwapy hit, or None."""
        alignments = self.bwapy_aligner.align_seq(''.join(read.sequence))
        if len(alignments) == 0:
            return None
        alignment = alignments[0]
        return alignment.cigar, alignment.orient == '-', alignment.pos

    def _align_with_executable(self, read):
        """Return ``(cigar, is_reverse, position)`` from ``bwa mem``, or None.

        The read is passed to bwa through temporary files, which are removed
        on every exit path (unmapped read and errors included).
        """
        temp_filenames = []
        try:
            # delete=False: bwa opens both files by name after the handles
            # are closed; the finally block below removes them.
            with tempfile.NamedTemporaryFile(mode='w',
                                             delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.fastq') as file:
                read_fastq_filename = file.name
                temp_filenames.append(read_fastq_filename)
                file.write(read.fastq)

            with tempfile.NamedTemporaryFile(delete=False,
                                             prefix='nadavca_tmp',
                                             suffix='.sam') as file:
                bwa_output_filename = file.name
                temp_filenames.append(bwa_output_filename)

            subprocess.run([
                self.bwa_executable, 'mem', self.reference_filename,
                read_fastq_filename, '-o', bwa_output_filename
            ],
                           stderr=subprocess.PIPE,
                           check=True)
            with simplesam.Reader(open(bwa_output_filename, 'r')) as reader:
                sam = reader.next()
                if not sam.mapped:
                    return None
                # The position is shifted by one to match the bwapy branch.
                return sam.cigar, sam.reverse, sam.pos - 1
        finally:
            for filename in temp_filenames:
                os.remove(filename)

    def _get_base_alignment(self, read):
        """Map read bases to reference positions for exactly matching bases.

        :returns: ``None`` when the read does not align, otherwise a tuple
            ``(mapping, is_reverse_complement)`` where ``mapping`` is an
            integer array of ``(index in read, index in reference)`` rows.
        :raises ValueError: on a CIGAR operation other than S, M, D or I.
        """
        if self.bwapy_aligner:
            hit = self._align_with_bwapy(read)
        else:
            hit = self._align_with_executable(read)
        if hit is None:
            return None
        cigar, is_reverse_complement, mapped_position = hit

        oriented_read = Genome.reverse_complement(
            read.sequence) if is_reverse_complement else read.sequence

        index_in_read = 0
        index_in_reference = mapped_position
        base_mapping = []

        for num, operation in self._parse_cigar(cigar):
            if operation == 'S':
                index_in_read += num
            elif operation == 'M':
                # Only positions where read and reference agree are kept.
                for _ in range(num):
                    if self.reference[index_in_reference] == oriented_read[
                            index_in_read]:
                        base_mapping.append(
                            (index_in_read, index_in_reference))
                    index_in_read += 1
                    index_in_reference += 1
            elif operation == 'D':
                index_in_reference += num
            elif operation == 'I':
                index_in_read += num
            else:
                raise ValueError(
                    'Unknown cigar operation: {}'.format(operation))

        if is_reverse_complement:
            # Mirror both coordinates and restore ascending order.
            base_mapping = [(len(read.sequence) - 1 - in_read,
                             len(self.reference) - 1 - in_reference)
                            for in_read, in_reference in base_mapping]
            base_mapping.reverse()

        return numpy.array(base_mapping, dtype=int), is_reverse_complement

    def get_signal_alignment(self, read, bandwidth):
        """Build the approximate signal-to-reference alignment of a read.

        :param read: read providing ``sequence``, ``normalized_signal`` and
            ``sequence_to_signal_mapping`` (plus ``fastq`` when the bwa
            executable is used).
        :param bandwidth: number of signal samples by which the signal
            range is extended on both sides.
        :returns: an ``ApproximateSignalAlignment``, or ``None`` when the
            read does not align or no matched base has a signal position.
        """
        base_alignment = self._get_base_alignment(read)
        if base_alignment is None:
            return None

        base_mapping, is_reverse_complement = base_alignment
        signal_mapping = self.convert_mapping(base_mapping, read)
        if len(signal_mapping) == 0:
            # Nothing to anchor the ranges on; treat it like an unaligned
            # read instead of failing on the empty array below.
            return None

        start_in_reference = signal_mapping[0][1]
        end_in_reference = signal_mapping[-1][1] + 1
        signal_mapping[:, 1] -= start_in_reference

        if is_reverse_complement:
            start_in_reference, end_in_reference = \
                len(self.reference) - end_in_reference, \
                len(self.reference) - start_in_reference
        reference_range = (start_in_reference, end_in_reference)

        start_in_signal = signal_mapping[0][0]
        end_in_signal = signal_mapping[-1][0] + 1
        extended_start_in_signal = max(0, start_in_signal - bandwidth)
        extended_end_in_signal = min(len(read.normalized_signal),
                                     end_in_signal + bandwidth)
        signal_range = (extended_start_in_signal, extended_end_in_signal)

        signal_mapping[:, 0] -= extended_start_in_signal

        start_in_read_sequence = base_mapping[0][0]
        end_in_read_sequence = base_mapping[-1][0] + 1
        read_sequence_range = (start_in_read_sequence, end_in_read_sequence)

        return ApproximateSignalAlignment(
            alignment=signal_mapping,
            signal_range=signal_range,
            reference_range=reference_range,
            read_sequence_range=read_sequence_range,
            reverse_complement=is_reverse_complement)
Beispiel #23
0
 def __init__(self, index, nm = 0.1):
     """Compile the CIGAR patterns and build the aligner for ``index``.

     ``nm`` is stored as ``self.NM``; ``self.aligned`` starts out as None.
     """
     self.matched, self.clipped = (
         re.compile(pattern) for pattern in ('([0-9]+)M', '([0-9]+)S'))
     self.aligner = BwaAligner(index, options = '-k 12 -B 3')
     self.NM, self.aligned = nm, None
Beispiel #24
0
 def __init__(self, bam, index):
     """Open ``bam`` with pysam and build a BwaAligner for ``index``."""
     bam_handle = pysam.Samfile(bam)
     self.bam = bam_handle
     self.aligner = BwaAligner(index, options='-k 12')
def load_index():
    """Build the module-level ``aligner`` from the module-level ``index``.

    A message is printed to stdout before and after the index is loaded.
    """
    global aligner
    banner = 'Loading Index: ' + index
    print(banner)
    aligner = BwaAligner(index)
    print('Index Loaded')