def __init__(self, bam_filename: str, start_index: int, end_index: int):
    """
    Open an indexed BAM file and prepare iteration over the whole file.

    :param bam_filename: path of the BAM file to iterate
    :param start_index: iterate from 0-based index inclusive
    :param end_index: iterate to 0-based index inclusive
    :raises IndexError: when the BAM file has no index

    Iterator includes mapped reads within the range and all unmapped reads.
    BAM index ensures that BAM is sorted and is needed to resolve range.
    Iterator assumes that unplaced alignment can be anywhere in BAM file.
    """
    assert start_index <= end_index

    self._bam_file = open_bam(bam_filename, 'rb')
    try:
        if not self._bam_file.has_index():
            raise IndexError('BAM has no index')

        self._fai = FastaIndex.from_bam(self._bam_file)

        # translate the index range into (reference name, position) pairs
        self._start_ref_name, self._start_ref_pos = self._fai.resolve_start_pos(start_index)
        self._end_ref_name, self._end_ref_pos = self._fai.resolve_end_pos(end_index)

        self._start_ref_id = self._fai.ref_id(self._start_ref_name)
        self._end_ref_id = self._fai.ref_id(self._end_ref_name)

        # until_eof=True walks the file to its end, so unmapped reads are
        # returned as well; the range bounds stored above are presumably
        # applied by the iteration code elsewhere in the class
        self._iterator = self._bam_file.fetch(until_eof=True)
    except Exception:
        # do not leave the BAM handle open when construction fails
        self._bam_file.close()
        raise
def create_vac(self, bam_filename: str, vcf_filename: str, out_vac_filename: str, ref_fasta_filename: str, skip_indels: bool):
    """
    Convert a VCF file to a VAC file.

    BAM and VCF should use same reference genome.
    VCF must contain INFO column with sub-fields AC and AN.
    :param bam_filename: filename of the SAM/BAM file, from which the header is extracted
    :param vcf_filename: filename of the input VCF file
    :param out_vac_filename: filename of the output VAC file
    :param ref_fasta_filename: filename to reference FASTA file;
        when None, no reference FASTA is loaded and None is passed on instead
    :param skip_indels: whether to skip indels and keep only SNPs
    """
    # TODO use fasta index / vcf header instead of BAM header

    # load the reference FASTA
    ref_fasta = None
    if ref_fasta_filename is not None:
        if self._verbose:
            print('--- Loading Reference Fasta ---')
        ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

    try:
        # open all files and create the VAC file
        if self._verbose:
            print('--- Processing VCF %s ---' % vcf_filename)

        with pysam.VariantFile(vcf_filename) as vcf_file, \
                open_bam(bam_filename, 'rb') as sam_file, \
                open(out_vac_filename, 'wb') as out_vac_file:
            vac = Vac(FastaIndex.from_bam(sam_file), self._verbose)
            vac.vcf2vac(vcf_file, out_vac_file, ref_fasta, skip_indels)
    finally:
        # the reference FASTA is only needed during the conversion;
        # release its file handle whether or not the conversion succeeded
        if ref_fasta is not None:
            ref_fasta.close()
def __init__(self, filename: str, verbose: bool = False):
    """
    Remember the BAM filename and load its header and FASTA index.

    :param filename: BAM filename
    :param verbose: flag stored on the instance as ``_verbose``
    """
    self._verbose = verbose
    self._stats = dict()
    self._bam_filename = filename

    # the file is opened only long enough to read the header and to
    # create the FastaIndex from it
    with bam.open_bam(filename, 'rb') as handle:
        self._bam_header = handle.header
        self._fai = FastaIndex.from_bam(handle)

    self._checksum = None
def __init__(self, bam_filename: str, start_index: int = None, end_index: int = None):
    """
    Open an indexed BAM file and prepare iteration over mapped reads.

    :param bam_filename: path of the BAM file to iterate
    :param start_index: iterate from 0-based index inclusive
    :param end_index: iterate to 0-based index inclusive
    :raises IndexError: when the BAM file has no index
    :raises ValueError: when the start reference comes after the end reference

    Iterates over mapped reads only.
    When both indices are None, the whole file is fetched.
    """
    if start_index is not None and end_index is not None:
        assert start_index <= end_index

    self._bam_file = open_bam(bam_filename, 'rb')
    try:
        if not self._bam_file.has_index():
            raise IndexError('BAM has no index')

        self._fai = FastaIndex.from_bam(self._bam_file)
        # empty iterator
        self._iterator = iter(())
        self.start_ref_id = None
        self.curr_ref_id = None
        self.end_ref_id = None
        self.counter = 0

        if start_index is None and end_index is None:
            # fetch all
            self._iterator = self._bam_file.fetch()
        else:
            self.start_ref_name, self.start_ref_pos = self._fai.resolve_start_pos(start_index)
            self.end_ref_name, self.end_ref_pos = self._fai.resolve_end_pos(end_index)

            if self.start_ref_name == self.end_ref_name:
                # single iterator
                self._iterator = self._bam_file.fetch(
                    reference=self.start_ref_name,
                    start=self.start_ref_pos,
                    end=self.end_ref_pos
                )
            else:
                # multiple iterators: only the reference ids are recorded here and
                # the iterator stays empty; the per-reference iterators are
                # presumably created by the iteration code elsewhere in the class
                self.start_ref_id = self._fai.ref_id(self.start_ref_name)
                self.curr_ref_id = self.start_ref_id
                self.end_ref_id = self._fai.ref_id(self.end_ref_name)
                if self.curr_ref_id > self.end_ref_id:
                    raise ValueError("Start reference has position after end reference.")
    except Exception:
        # do not leave the BAM handle open when construction fails
        self._bam_file.close()
        raise
def setUp(self):
    """Create the ``_fai`` fixture from the SAM test resource."""
    sam_path = 'tests/resources/fasta_index/input.sam'
    with pysam.AlignmentFile(sam_path, "r") as alignment_file:
        self._fai = FastaIndex.from_bam(alignment_file)
def vac(cls):
    """Return a Vac created from the FastaIndex of the ``input.sam`` resource."""
    sam_path = cls.RESOURCE_PATH + 'input.sam'
    with open_bam(sam_path, 'rb') as alignment_file:
        fasta_index = FastaIndex.from_bam(alignment_file)
        return Vac(fasta_index)