def _to_virtual_offset(self, offset):
    """Translate an offset in uncompressed bytes into the bgzf
    virtual-offset encoding that the bgzf reader understands.
    """
    # Indices of every block whose uncompressed start is at or before the
    # requested offset; the last such block is the one containing it.
    containing = np.flatnonzero(self._data_start <= offset)
    block = self._blocks[containing[-1]]
    start_offset, _block_length, data_start, _data_len = block
    # Virtual offset = (compressed block start, position within the block).
    return make_virtual_offset(start_offset, offset - data_start)
def check_random(self, filename):
    """Check BGZF random access by reading blocks in forward & reverse order"""
    # Reference data: the fully decompressed file content, via plain gzip.
    h = gzip.open(filename, "rb")
    old = h.read()
    h.close()

    # Enumerate the BGZF blocks as (start, raw_len, data_start, data_len).
    h = open(filename, "rb")
    blocks = list(bgzf.BgzfBlocks(h))
    h.close()

    #Forward
    # Read each block in file order via seek + read and rebuild the data.
    new = _empty_bytes_string
    h = bgzf.BgzfReader(filename, "rb")
    self.assertTrue(h.seekable())
    self.assertFalse(h.isatty())
    self.assertEqual(h.fileno(), h._handle.fileno())
    for start, raw_len, data_start, data_len in blocks:
        #print start, raw_len, data_start, data_len
        h.seek(bgzf.make_virtual_offset(start,0))
        data = h.read(data_len)
        self.assertEqual(len(data), data_len)
        #self.assertEqual(start + raw_len, h._handle.tell())
        self.assertEqual(len(new), data_start)
        new += data
    h.close()
    self.assertEqual(len(old), len(new))
    self.assertEqual(old, new)

    #Reverse
    # Same reconstruction, but visiting the blocks in reverse file order.
    new = _empty_bytes_string
    h = bgzf.BgzfReader(filename, "rb")
    for start, raw_len, data_start, data_len in blocks[::-1]:
        #print start, raw_len, data_start, data_len
        h.seek(bgzf.make_virtual_offset(start,0))
        data = h.read(data_len)
        self.assertEqual(len(data), data_len)
        #self.assertEqual(start + raw_len, h._handle.tell())
        new = data + new
    h.close()
    self.assertEqual(len(old), len(new))
    self.assertEqual(old, new)

    #Jump back - non-sequential seeking
    # max_cache=1 forces the reader to re-load blocks rather than serve
    # them from its cache, exercising the seek machinery for real.
    if len(blocks) >= 3:
        h = bgzf.BgzfReader(filename, "rb", max_cache = 1)
        #Seek to a late block in the file,
        #half way into the third last block
        start, raw_len, data_start, data_len = blocks[-3]
        voffset = bgzf.make_virtual_offset(start, data_len // 2)
        h.seek(voffset)
        self.assertEqual(voffset, h.tell())
        data = h.read(1000)
        self.assertTrue(data in old)
        self.assertEqual(old.find(data), data_start + data_len // 2)
        #Now seek to an early block in the file,
        #half way into the second block
        start, raw_len, data_start, data_len = blocks[1]
        h.seek(bgzf.make_virtual_offset(start, data_len // 2))
        voffset = bgzf.make_virtual_offset(start, data_len // 2)
        h.seek(voffset)
        self.assertEqual(voffset, h.tell())
        #Now read all rest of this block and start of next block
        data = h.read(data_len + 1000)
        self.assertTrue(data in old)
        self.assertEqual(old.find(data), data_start + data_len // 2)
        h.close()

    #Check seek/tell at block boundaries
    # Build (virtual offset, real uncompressed offset) pairs at the edges
    # and middle of every block, skipping offsets outside the block.
    v_offsets = []
    for start, raw_len, data_start, data_len in blocks:
        for within_offset in [0, 1, data_len // 2, data_len - 1]:
            if within_offset < 0 or data_len <= within_offset:
                continue
            voffset = bgzf.make_virtual_offset(start, within_offset)
            real_offset = data_start + within_offset
            v_offsets.append((voffset, real_offset))
    # Shuffle so the order of checks does not follow file order.
    shuffle(v_offsets)
    h = bgzf.BgzfReader(filename, "rb", max_cache = 1)
    for voffset, real_offset in v_offsets:
        # Reading real_offset bytes from the start must land tell() exactly
        # on the corresponding virtual offset.
        h.seek(0)
        assert voffset >= 0 and real_offset >= 0
        self.assertEqual(h.read(real_offset), old[:real_offset])
        self.assertEqual(h.tell(), voffset)
    for voffset, real_offset in v_offsets:
        # Seeking straight to a virtual offset must round-trip through tell().
        h.seek(voffset)
        self.assertEqual(h.tell(), voffset)
    h.close()
def calculate_chunks(filename, num_chunks):
    """
    Calculate the boundaries in the BAM file and partition into chunks.

    :param str filename: name of the BAM file
    :param int num_chunks: number of chunks to partition the boundaries into
    :return: a list of ParseRecord tuples describing the chunk boundaries,
        or None if an unexpected error occurred while scanning the file
    :raises ValueError: if num_chunks is not a positive integer
    """
    if num_chunks <= 0:
        raise ValueError("The number of chunks to calculate should be >= 1")

    if num_chunks == 1:
        # A single chunk covers the whole file from offset 0; file_bytes of
        # -1 means "read until EOF".
        pr = ParseRecord(0, 0, 0, 0, -1, 0, 0)
        return [pr]

    try:
        # Collect the compressed start offset and uncompressed length of
        # every BGZF block in the file.
        block_offsets = []
        decompressed_lengths = []
        # NOTE(review): BGZF data is binary; the original opened this handle
        # in text mode ('r') -- confirm FastBgzfBlocks really expects text
        # mode before changing it.  The handle was previously leaked (never
        # closed); the context manager fixes that.
        with open(filename, 'r') as f:
            for values in FastBgzfBlocks(f):
                block_offsets.append(values[0])
                decompressed_lengths.append(values[3])

        # Partition the block starts into num_chunks roughly equal groups.
        div, mod = divmod(len(block_offsets), num_chunks)

        header_size = 0
        partitioned_offsets = [(header_size, 0)]
        # For each interior boundary, advance past the '+' separator and the
        # quality line so a chunk never begins in the middle of a FASTQ
        # record.  The context manager also closes the reader if an
        # exception fires mid-scan (previously it would leak).
        with bgzf.BgzfReader(filename, 'r') as fastq_fh:
            for i in range(1, num_chunks):
                index = div * i + min(i, mod)
                virtual_offset = bgzf.make_virtual_offset(
                    block_offsets[index], 0)
                fastq_fh.seek(virtual_offset)

                line = fastq_fh.readline().strip()
                while line != '+':
                    line = fastq_fh.readline().strip()
                quality_line = fastq_fh.readline()
                virtual_offset = fastq_fh.tell()

                # block start & within block offset
                partitioned_offsets.append(
                    bgzf.split_virtual_offset(virtual_offset))

        # Now convert each boundary into a ParseRecord describing a partial
        # leading read, a span of whole blocks, and a partial trailing read.
        params = []
        for i, offset in enumerate(partitioned_offsets):
            index = block_offsets.index(partitioned_offsets[i][0])

            begin_read_offset = 0
            begin_read_size = 0
            file_offset = 0
            file_bytes = 0
            end_read_offset = 0
            end_read_size = 0

            if i == 0:
                # First chunk: no partial leading read; ends with a partial
                # read into the block holding the next boundary.
                begin_read_offset = 0
                begin_read_size = 0
                file_offset = block_offsets[index]
                file_bytes = partitioned_offsets[i + 1][0] - file_offset
                end_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i + 1][0], 0)
                end_read_size = partitioned_offsets[i + 1][1]
            elif i == num_chunks - 1:
                # Last chunk: partial leading read, then everything to EOF
                # (file_bytes == -1).
                begin_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i][0], partitioned_offsets[i][1])
                begin_read_size = decompressed_lengths[index] - \
                    partitioned_offsets[i][1]
                file_offset = block_offsets[index + 1]
                file_bytes = -1
                end_read_offset = 0
                end_read_size = 0
            else:
                # Interior chunk: partial reads on both ends.
                if offset[1] == 0:
                    # Boundary landed exactly on a BGZF block edge; the
                    # partial-read logic below assumes a mid-block split.
                    print('****************HUH')
                    return
                begin_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i][0], partitioned_offsets[i][1])
                begin_read_size = decompressed_lengths[index] - \
                    partitioned_offsets[i][1]
                file_offset = block_offsets[index + 1]
                file_bytes = partitioned_offsets[i + 1][0] - file_offset
                end_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i + 1][0], 0)
                end_read_size = partitioned_offsets[i + 1][1]

            pr = ParseRecord(header_size, begin_read_offset, begin_read_size,
                             file_offset, file_bytes, end_read_offset,
                             end_read_size)
            params.append(pr)

        return params
    except Exception as e:
        # NOTE(review): swallowing every exception and implicitly returning
        # None hides failures from callers; preserved for compatibility.
        print('calculate_chunks error: {}'.format(str(e)))
def get_sequence(self, contig, start, end, strand=1, all_upper=False):
    """
    Return the genomic DNA sequence spanning [start, end) on contig.

    :param contig: The name of the contig on which the start and end
        coordinates are located
    :param start: The start location of the sequence to be returned (this
        endpoint is included in the interval)
    :param end: The end location of the sequence to be returned (this
        endpoint is not included in the interval)
    :param strand: The DNA strand of the sequence to be returned (-1 for
        negative strand, 1 for positive strand)
    :param all_upper: If true, return the sequence in all uppercase letters.
        Otherwise return lowercase letters for positions that are
        "soft-masked" (see https://genomevolution.org/wiki/index.php/Masked).
    :return: A string of DNA nucleotides of length end-start
    :raises ContigNotFound: if contig is not in the index
    :raises CoordinateOutOfBounds: if start or end fall outside the contig
    :raises InvalidCoordinates: if start >= end
    """
    # Validate the requested interval before touching the FASTA file.
    if contig not in self._index:
        raise ContigNotFound(message='Contig {} not found'.format(contig),
                             requested_contig=contig,
                             valid_contigs=list(self._index.keys()))
    if start < 0:
        raise CoordinateOutOfBounds(
            message='Start coordinate below 0',
            problematic_coordinate=start,
            problem_with_start=True,
            coordinate_too_small=True,
            valid_coordinate_range=(0, self.contig_lengths[contig]),
            current_contig=contig)
    if start > self.contig_lengths[contig]:
        raise CoordinateOutOfBounds(
            message='Start coordinate past end of contig',
            problematic_coordinate=start,
            problem_with_start=True,
            coordinate_too_small=False,
            valid_coordinate_range=(0, self.contig_lengths[contig]),
            current_contig=contig)
    if end > self.contig_lengths[contig]:
        raise CoordinateOutOfBounds(
            message='End coordinate past end of contig',
            problematic_coordinate=end,
            problem_with_start=False,
            coordinate_too_small=False,
            valid_coordinate_range=(0, self.contig_lengths[contig]),
            current_contig=contig)
    if end < 0:
        raise CoordinateOutOfBounds(
            message='End coordinate below 0',
            problematic_coordinate=end,
            problem_with_start=False,
            coordinate_too_small=True,
            valid_coordinate_range=(0, self.contig_lengths[contig]),
            current_contig=contig)
    if start >= end:
        raise InvalidCoordinates(start=start, end=end)

    query_length = end - start

    # Map the text coordinate to its byte position in the uncompressed
    # FASTA stream, then locate the BGZF block containing that byte.
    start_pos_file_distance = self._text_distance_to_file_distance(start)
    start_block = sorted(
        self._index[contig].search(start_pos_file_distance))[0]
    start_block_offset = start_block.data

    verbose_print('Retrieving sequence for {} [{},{}) ...'.format(
        contig, start, end))

    sequence_start_offset = start_pos_file_distance - start_block.begin

    # Read whole lines until at least query_length characters have been
    # gathered.  Accumulate pieces in a list and join once -- the original
    # repeated string += was quadratic for long queries.
    pieces = []
    retrieved_length = 0
    with bgzf.BgzfReader(self.bgzipped_fasta_filename, 'rt') as fasta_file:
        fasta_file.seek(
            bgzf.make_virtual_offset(start_block_offset,
                                     sequence_start_offset))
        while retrieved_length < query_length:
            line = fasta_file.readline().rstrip()
            pieces.append(line)
            retrieved_length += len(line)

    # The last line read may overshoot the interval; trim to exact length.
    trimmed_sequence = ''.join(pieces)[:query_length]
    if all_upper:
        trimmed_sequence = trimmed_sequence.upper()
    if strand == -1:
        return reverse_complement(trimmed_sequence)
    else:
        return trimmed_sequence