Esempio n. 1
0
File: bam.py Progetto: rtcz/varlock
 def __init__(self, bam_filename: str, start_index: int, end_index: int):
     """
     Open an indexed BAM file and prepare iteration over a genomic range.

     Iterator includes mapped reads within the range and all unmapped reads.
     BAM index ensures that BAM is sorted and is needed to resolve range.
     Iterator assumes that unplaced alignment can be anywhere in BAM file.

     :param bam_filename: path of the BAM file to read
     :param start_index: iterate from 0-based index inclusive
     :param end_index: iterate to 0-based index inclusive
     :raises IndexError: if the BAM file has no index
     """
     assert start_index <= end_index

     self._bam_file = open_bam(bam_filename, 'rb')
     try:
         if not self._bam_file.has_index():
             raise IndexError('BAM has no index')

         self._fai = FastaIndex.from_bam(self._bam_file)

         # translate both genomic indices into (reference name, position) pairs
         self._start_ref_name, self._start_ref_pos = self._fai.resolve_start_pos(start_index)
         self._end_ref_name, self._end_ref_pos = self._fai.resolve_end_pos(end_index)

         self._start_ref_id = self._fai.ref_id(self._start_ref_name)
         self._end_ref_id = self._fai.ref_id(self._end_ref_name)

         # until_eof=True walks the file in file order and also yields
         # unmapped reads; the resolved bounds above are only stored here
         self._iterator = self._bam_file.fetch(until_eof=True)
     except Exception:
         # the object is never handed to the caller when construction fails,
         # so nobody else could close the handle
         self._bam_file.close()
         raise
Esempio n. 2
0
    def create_vac(self, bam_filename: str, vcf_filename: str,
                   out_vac_filename: str, ref_fasta_filename: str,
                   skip_indels: bool):
        """
        Convert a VCF file into a VAC file.

        BAM and VCF should use same reference genome.
        VCF must contain INFO column with sub-fields AC and AN.

        :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
        :param vcf_filename: filename of the input VCF file
        :param out_vac_filename: filename of the output VAC file
        :param ref_fasta_filename: filename to reference FASTA file;
            when None, no reference is loaded
        :param skip_indels: whether to skip indels and keep only SNPs
        """
        # TODO use fasta index / vcf header instead of BAM header
        verbose = self._verbose

        # the reference FASTA is optional
        if ref_fasta_filename is None:
            ref_fasta = None
        else:
            if verbose:
                print('--- Loading Reference Fasta ---')
            ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

        if verbose:
            print('--- Processing VCF %s ---' % vcf_filename)

        # inputs are opened before the output, so a missing input
        # does not leave an empty VAC file behind
        with pysam.VariantFile(vcf_filename) as vcf_in:
            with open_bam(bam_filename, 'rb') as bam_in:
                with open(out_vac_filename, 'wb') as vac_out:
                    converter = Vac(FastaIndex.from_bam(bam_in), verbose)
                    converter.vcf2vac(vcf_in, vac_out, ref_fasta, skip_indels)
Esempio n. 3
0
    def __init__(self, fai: FastaIndex, mut_p: float, rnd: VeryRandom):
        """
        Pre-sample the genomic indices of the random variants.

        :param fai: index supplying the first and the last genomic index
        :param mut_p: random variant (mutation) probability per genome base
        :param rnd: random source the indices are drawn from
        """
        assert 0 <= mut_p <= 0.001
        self._fai = fai
        self._rnd = rnd

        first = fai.first_index()
        last = fai.last_index()
        span = last - first
        assert span >= 0

        # number of variants implied by the per-base probability
        sample_size = int(mut_p * span)

        # sample random genomic indices from uniform distribution
        sampled = rnd.rand_ints(first, last, sample_size)
        self._indices = np.sort(sampled)
        self._counter = 0
Esempio n. 4
0
    def __init__(self, filename: str, verbose: bool = False):
        """
        Remember the BAM filename and read its header and sequence index.

        :param filename: BAM filename
        :param verbose: verbosity flag, stored on the instance
        """
        self._verbose = verbose
        self._stats = {}
        self._bam_filename = filename
        self._checksum = None

        # the file is opened only long enough to read the header-derived data
        with bam.open_bam(filename, 'rb') as bam_handle:
            self._bam_header = bam_handle.header
            self._fai = FastaIndex.from_bam(bam_handle)
Esempio n. 5
0
File: bam.py Progetto: rtcz/varlock
    def __init__(self, bam_filename: str, start_index: int = None, end_index: int = None):
        """
        Open an indexed BAM file and prepare iteration over its mapped reads.

        Iterates over mapped reads only. With both bounds omitted the whole
        file is fetched. When only one bound is given, the None bound is
        passed on to the FastaIndex resolver as-is.

        :param bam_filename: path of the BAM file to read
        :param start_index: iterate from 0-based index inclusive
        :param end_index: iterate to 0-based index inclusive
        :raises IndexError: if the BAM file has no index
        :raises ValueError: if the start reference comes after the end reference
        """
        if start_index is not None and end_index is not None:
            assert start_index <= end_index

        self._bam_file = open_bam(bam_filename, 'rb')
        try:
            if not self._bam_file.has_index():
                raise IndexError('BAM has no index')

            self._fai = FastaIndex.from_bam(self._bam_file)
            # empty iterator
            self._iterator = iter(())

            self.start_ref_id = None
            self.curr_ref_id = None
            self.end_ref_id = None
            self.counter = 0

            if start_index is None and end_index is None:
                # no range requested: fetch all
                self._iterator = self._bam_file.fetch()
            else:
                self.start_ref_name, self.start_ref_pos = self._fai.resolve_start_pos(start_index)
                self.end_ref_name, self.end_ref_pos = self._fai.resolve_end_pos(end_index)

                if self.start_ref_name == self.end_ref_name:
                    # range lies within one reference: single iterator
                    self._iterator = self._bam_file.fetch(
                        reference=self.start_ref_name,
                        start=self.start_ref_pos,
                        end=self.end_ref_pos
                    )
                else:
                    # range spans several references: multiple iterators.
                    # The iterator stays empty here; presumably the per-reference
                    # iterators are created during iteration (not visible here).
                    self.start_ref_id = self._fai.ref_id(self.start_ref_name)
                    self.curr_ref_id = self.start_ref_id
                    self.end_ref_id = self._fai.ref_id(self.end_ref_name)

                    if self.curr_ref_id > self.end_ref_id:
                        raise ValueError("Start reference has position after end reference.")
        except Exception:
            # the object is never handed to the caller when construction fails,
            # so nobody else could close the handle
            self._bam_file.close()
            raise
Esempio n. 6
0
 def setUp(self):
     """Build the FastaIndex fixture from the SAM test resource."""
     sam_path = 'tests/resources/fasta_index/input.sam'
     with pysam.AlignmentFile(sam_path, "r") as sam_handle:
         fai = FastaIndex.from_bam(sam_handle)
     self._fai = fai
Esempio n. 7
0
    def __init__(self, personal_vcf: str, masked_vcf: str, vac: str,
                 out_dir: str):
        """
        Load personal, masked and population variants and classify their positions.

        Both VCF files are reduced to deep, high-quality PASS records and
        indexed by genomic index. The index sets of the three sources are
        then combined into the membership sets stored on the instance.

        :param personal_vcf: filename of the personal VCF file; its header
            also provides the FastaIndex used for both VCF files
        :param masked_vcf: filename of the masked VCF file
        :param vac: filename of the population VAC file
        :param out_dir: existing directory, stored on the instance
        """
        assert os.path.isdir(out_dir)
        self._out_dir = out_dir

        # close the VCF handle as soon as the index has been built from it
        with pysam.VariantFile(personal_vcf) as vcf_file:
            self._index = FastaIndex.from_vcf(vcf_file)

        self._personal_df = self._load_deep_calls(personal_vcf)
        self._masked_df = self._load_deep_calls(masked_vcf)

        self._population_df = vac2df(vac)
        self._population_df['alt_freq'] = self._population_df.apply(
            lambda row: countstr2altfreq(row['counts'], int(row['ref_id'])),
            axis=1)

        personal = self._personal_ids = set(self._personal_df.index)
        masked = self._masked_ids = set(self._masked_df.index)
        public = self._public_ids = set(self._population_df.index)

        # public and personal, but gone from the masked calls
        self._masked_set = (public & personal) - masked
        # public and personal, and still present in the masked calls
        self._not_masked_set = public & personal & masked
        # public and masked, but absent from the personal calls
        self._introduced_set = (public & masked) - personal
        # personal and masked, but not listed in the population data
        self._not_covered_set = (personal & masked) - public
        # only in the population data
        self._not_found_set = public - personal - masked
        # only in the personal calls
        self._not_covered_hidden_set = personal - public - masked
        # only in the masked calls
        self._not_covered_revealed_set = masked - public - personal

    def _load_deep_calls(self, vcf_filename: str, min_depth: int = 30,
                         min_qual: int = 30):
        """
        Read a VCF into a data frame of deep, high-quality PASS records.

        Requires ``self._index`` to be set. The returned frame is indexed by
        the genomic index of each record and carries the additional columns
        ``depth`` and ``alt_freq``.

        :param vcf_filename: filename of the VCF file to load
        :param min_depth: records must have a depth strictly above this value
        :param min_qual: records must have a QUAL strictly above this value
        :return: the filtered data frame
        """
        df = vcf2df(vcf_filename)
        # copy, so the assignments below act on an independent frame
        # instead of a slice of the unfiltered one
        df = df[df['FILTER'] == 'PASS'].copy()

        # POS - 1: presumably converts the 1-based VCF position into the
        # 0-based position pos2index expects - TODO confirm
        df.index = df.apply(
            lambda row: self._index.pos2index(row['CHROM'], row['POS'] - 1),
            axis=1)

        # position of the column right after FORMAT (presumably the sample
        # column); .iloc makes the positional access explicit instead of
        # relying on integer keys falling back to positions
        sample_col = df.columns.get_loc("FORMAT") + 1
        df['depth'] = df.apply(
            lambda row: depth(row['FORMAT'], row.iloc[sample_col]), axis=1)
        df['alt_freq'] = df.apply(
            lambda row: altfreq(row['FORMAT'], row.iloc[sample_col]), axis=1)

        # this is deep
        return df[(df['depth'] > min_depth) & (df['QUAL'] > min_qual)]
Esempio n. 8
0
 def vac(cls):
     """Create a Vac from the FastaIndex of the ``input.sam`` resource."""
     sam_path = cls.RESOURCE_PATH + 'input.sam'
     with open_bam(sam_path, 'rb') as sam_handle:
         fai = FastaIndex.from_bam(sam_handle)
         return Vac(fai)