Example No. 1
def chunk_fastq_file(fastq_filename, new_filename, parse_rec):
    """
    Create a new FASTQ file from an existing one.

    :param str fastq_filename: the name of the original FASTQ file
    :param str new_filename: the name of the new FASTQ file
    :param ParseRecord parse_rec: the information describing where to extract
    :return: None
    """
    try:
        os.remove(new_filename)
    except OSError:
        # the new file may not exist yet; nothing to remove
        pass

    # copy the header from the original FASTQ file to the new one
    bytes_from_file(fastq_filename, new_filename, 0, parse_rec.header_size)

    if parse_rec.begin_read_offset > 0:
        # if there are reads before a chunk offset, we need to extract them
        b = bgzf.BgzfReader(fastq_filename)
        b2 = bgzf.BgzfWriter(new_filename, mode="a")
        b.seek(parse_rec.begin_read_offset)
        b2.write(b.read(parse_rec.begin_read_size))
        b2.close()

    # grab bgzf chunks from the OLD FASTQ file and append to NEW FASTQ file
    bytes_from_file(fastq_filename, new_filename, parse_rec.file_offset, parse_rec.file_bytes)

    if parse_rec.end_read_offset > 0:
        # if there are reads after a chunk offset, we need to extract them
        b = bgzf.BgzfReader(fastq_filename)
        b2 = bgzf.BgzfWriter(new_filename, mode="a")
        b.seek(parse_rec.end_read_offset)
        b2.write(b.read(parse_rec.end_read_size))
        b2.close()
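(Note: bytes_from_file is not shown in this example. A minimal sketch, assuming it appends `length` raw, still-compressed bytes starting at `offset` from one file to another, and that a negative length means "copy to end of file", as the file_bytes = -1 in Example No. 29 suggests:)

def bytes_from_file(src_name, dst_name, offset, length):
    # append raw bytes [offset, offset + length) of src to dst;
    # a negative length copies everything from offset to end of file
    with open(src_name, "rb") as src, open(dst_name, "ab") as dst:
        src.seek(offset)
        dst.write(src.read() if length < 0 else src.read(length))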
Example No. 2
 def __init__(self, pcons_file):
     try:
         self.pcons_file_handle = bgzf.BgzfReader(pcons_file,
                                                  'r',
                                                  max_cache=5000)
     except IOError as e:
         msg = "Could not read Pcons file!\nI/O error({0}): {1}\n".format(
             e.errno, e.strerror)
         raise FileException(msg)
     self.filename = pcons_file
     try:
         index_file = open(pcons_file + '.idx3', 'rb')
         self.uid_index = pickle.load(index_file)
     except IOError as e:
         msg = "Could not read index file!\nI/O error({0}): {1}\n".format(
             e.errno, e.strerror)
         raise FileException(msg)
     except pickle.UnpicklingError as e:
         msg = "Could not unpickle the index file - possibly wrong format!"
         msg += "\nUnpickling error: {}\n".format(e.message)
         raise FileException(msg)
     except Exception as e:
         msg = "Could not read/unpickle the index file - unknown error! : "
         msg += "{}\n".format(e.__repr__())
         raise FileException(msg)
Example No. 3
  def __build_index__(self):  
    numbytes = 0
    self._bcode_off_map = {}
    num_pe = 0

    if self.fq_path.endswith('.gz'):
      index_name = self.fq_path + "i"
      if not os.path.exists(index_name):
        raise Exception("Only BGZF compression is supported")

      handle = bgzf.BgzfReader(self.fq_path)
    else:
      handle = open(self.fq_path)

    seen_set = set()
    for bcode, reads_iter in groupby(
      util.fastq_iter_pos(handle),
      lambda x: x[0],
    ):
      assert bcode is None or bcode not in seen_set, \
        "fastq {} NOT in barcode sorted order. Ensure reads that share " \
        "barcodes are in a block together".format(self.fq_path)
      seen_set.add(bcode)
      for _, qname, file_pos, lines in reads_iter:
        if bcode is not None and bcode not in self._bcode_off_map:
          self._bcode_off_map[bcode] = file_pos
        num_pe += 1
Example No. 4
    def check_by_char(self, old_file, new_file, old_gzip=False):
        for mode in ["r", "rb"]:
            if old_gzip:
                h = gzip.open(old_file, mode)
            else:
                h = open(old_file, mode)
            old = h.read()
            #Seems gzip can return bytes even if mode="r",
            #perhaps a bug in Python 3.2?
            if "b" in mode:
                old = _as_bytes(old)
            else:
                old = _as_string(old)
            h.close()

            for cache in [1, 10]:
                h = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                temp = []
                while True:
                    char = h.read(1)
                    if not char:
                        break
                    temp.append(char)
                if "b" in mode:
                    new = _empty_bytes_string.join(temp)
                else:
                    new = "".join(temp)
                del temp
                h.close()

                self.assertEqual(len(old), len(new))
                #If bytes vs unicode mismatch, give a short error message:
                self.assertEqual(old[:10], new[:10], \
                                 "%r vs %r, mode %r" % (old[:10], new[:10], mode))
                self.assertEqual(old, new)
Example No. 5
def merge_vcf_split_chr(vcf_file, split_chr_inf):
    split_bed_df = pd.read_csv(split_chr_inf,
                               index_col=3,
                               header=None,
                               names=['chrom', 'start', 'end'],
                               sep='\t')
    merge_chr_size_str = merged_chr_size_inf(split_bed_df)
    vcf_file = PurePath(vcf_file)
    is_gz_file = vcf_file.suffix == '.gz'

    if is_gz_file:
        split_vcf_inf = bgzf.BgzfReader(vcf_file)
    else:
        split_vcf_inf = open(vcf_file)

    chr_header_flag = 1
    # TODO add merge chr command information in vcf header
    for eachline in split_vcf_inf:
        eachline = eachline.strip()
        eachline_inf = eachline.split('\t')
        chrom = eachline_inf[0]
        # split chrom size -> merged chrom size
        if eachline.startswith('##contig='):
            if chr_header_flag:
                print(merge_chr_size_str)
                chr_header_flag = 0
            continue
        elif chrom in split_bed_df.index:
            new_chrom, offset = split_bed_df.loc[chrom, ['chrom', 'start']]
            eachline_inf[0] = new_chrom
            eachline_inf[1] = str(offset + int(eachline_inf[1]))
            eachline = '\t'.join(eachline_inf)
        print(eachline)
    split_vcf_inf.close()
Example No. 6
def _is_sorted_bam(bam):
    """
    Checks if a BAM file is sorted by coordinate.
    """
    with bgzf.BgzfReader(bam, "rb") as fin:
        bam_header = fin.readline().strip()
        return b"SO:coordinate" in bam_header
Example No. 7
    def check_by_line(self, old_file, new_file, old_gzip=False):
        for mode in ["r", "rb"]:
            if old_gzip:
                h = gzip.open(old_file, mode)
            else:
                h = open(old_file, mode)
            old = h.read()
            if "b" in mode:
                old = _as_bytes(old)
            else:
                old = _as_string(old)
            h.close()

            for cache in [1, 10]:
                h = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                if "b" in mode:
                    new = b"".join(line for line in h)
                else:
                    new = "".join(line for line in h)
                h.close()

                self.assertEqual(len(old), len(new))
                self.assertEqual(
                    old[:10], new[:10],
                    "%r vs %r, mode %r" % (old[:10], new[:10], mode))
                self.assertEqual(old, new)
Example No. 8
    def check_by_line(self, old_file, new_file, old_gzip=False):
        if old_gzip:
            with gzip.open(old_file) as handle:
                old = handle.read()
        else:
            with open(old_file, "rb") as handle:
                old = handle.read()
        for mode in ["rb", "r"]:
            if "b" in mode:
                assert isinstance(old, bytes)
            else:
                # BGZF text mode is hard coded as latin1
                # and does not do universal new line mode
                old = old.decode("latin1")

            for cache in [1, 10]:
                with bgzf.BgzfReader(new_file, mode, max_cache=cache) as h:
                    if "b" in mode:
                        new = b"".join(line for line in h)
                    else:
                        new = "".join(line for line in h)

                self.assertEqual(len(old), len(new))
                self.assertEqual(
                    old[:10], new[:10], "%r vs %r, mode %r" % (old[:10], new[:10], mode)
                )
                self.assertEqual(old, new)
Example No. 9
    def check_by_line(self, old_file, new_file, old_gzip=False):
        for mode in ["r", "rb"]:
            if old_gzip:
                h = gzip.open(old_file, mode)
            else:
                h = open(old_file, mode)
            old = h.read()
            #Seems gzip can return bytes even if mode="r",
            #perhaps a bug in Python 3.2?
            if "b" in mode:
                old = _as_bytes(old)
            else:
                old = _as_string(old)
            h.close()

            for cache in [1, 10]:
                h = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                if "b" in mode:
                    new = _empty_bytes_string.join(line for line in h)
                else:
                    new = "".join(line for line in h)
                h.close()

                self.assertEqual(len(old), len(new))
                self.assertEqual(old[:10], new[:10], \
                                 "%r vs %r, mode %r" % (old[:10], new[:10], mode))
                self.assertEqual(old, new)
Example No. 10
    def __init__(self, path, mode='r'):
        """
        Store tabular information tied to genomic locations in a bgzipped file
        Args:
            path (str) : path to file
            mode (str) : mode, r: read, w: write
        """
        self.path = path
        self.index_path = f'{path}.idx'
        self.prev_contig = None
        self.mode = mode
        self.index = {}

        if self.mode == 'w':
            self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
            self.index_handle = open(self.index_path, 'wt')
        elif self.mode == 'r':
            if not os.path.exists(self.path):
                raise ValueError(f'BGZIP file missing at {self.path}')
            self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
            if not os.path.exists(self.index_path):
                raise ValueError(
                    f'BGZIP index file missing at {self.index_path}')
            self.index_handle = open(self.index_path, 'rt')

            for line in self.index_handle:
                contig, start = line.strip().split()
                self.index[contig] = int(start)
        else:
            raise ValueError('Mode can be r or w')
        self.cache = {}
Example No. 11
 def __init__(self, filename, **kwargs):
     h = open(filename, 'rb')
     try:
         self._handle = bgzf.BgzfReader(mode="rb", fileobj=h)
     except ValueError as e:
         assert "BGZF" in str(e)
         #Not a BGZF file
         h.seek(0)
         self._handle = h
Example No. 12
 def __init__(self, filename, format, alphabet):
     h = open(filename, "rb")
     try:
         self._handle = bgzf.BgzfReader(mode="rb", fileobj=h)
     except ValueError as e:
         assert "BGZF" in str(e)
         #Not a BGZF file
         h.seek(0)
         self._handle = h
Example No. 13
 def open(self, fn):
     try:
         from Bio import bgzf
     except ImportError:
         print("Cannot import Bio.bgzf, need to check the installation",
               file=sys.stderr)
     self.handle = bgzf.BgzfReader(fn)
     # read fai
     for ln in myopen(fn + '.fai'):
         fd = ln.strip().split()
         self.index[fd[0].replace('chr', '')] = [int(i) for i in fd[1:]]
     return len(self.index)
Example No. 14
    def check_text_with(self, old_file, new_file):
        """Check text mode using context manager (with statement)"""
        with open(old_file) as h:  # text mode!
            old_line = h.readline()
            old = old_line + h.read()

        with bgzf.BgzfReader(new_file, "r") as h:  # Text mode!
            new_line = h.readline()
            new = new_line + h.read(len(old))

        self.assertEqual(old_line, new_line)
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)
Example No. 15
    def __init__(self, path, mode='r', read_all=False):
        """
        Store tabular information tied to genomic locations in a bgzipped file
        Args:
            path (str) : path to file
            mode (str) : mode, r: read, w: write

            read_all(bool) : when enabled all data is read from the file and the handles are closed
        """
        self.path = path
        self.index_path = f'{path}.idx'
        self.prev_contig = None
        self.mode = mode
        self.index = {}
        self.cache = {}

        if self.mode == 'w':
            self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
            self.index_handle = open(self.index_path, 'wt')
        elif self.mode == 'r':
            if not os.path.exists(self.path):
                raise ValueError(f'BGZIP file missing at {self.path}')
            self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
            if not os.path.exists(self.index_path):
                raise ValueError(
                    f'BGZIP index file missing at {self.index_path}')
            self.index_handle = open(self.index_path, 'rt')

            for line in self.index_handle:
                contig, start = line.strip().split()
                self.index[contig] = int(start)

            if read_all:
                for line in self.bgzf_handle:
                    if len(line) == 0:
                        continue
                    line_contig, line_pos, line_strand, rest = self.read_file_line(
                        line)
                    #print((line_pos, line_strand,rest))
                    if line_contig not in self.cache:
                        self.cache[line_contig] = {}
                    self.cache[line_contig][(line_pos, line_strand)] = rest
                    cpos = line_pos
                self.bgzf_handle.close()
                self.bgzf_handle = None
                self.index_handle.close()
                self.index_handle = None

        else:
            raise ValueError('Mode can be r or w')
Example No. 16
    def check_text(self, old_file, new_file):
        """Check text mode using explicit open/close."""
        with open(old_file) as h:  # text mode!
            old_line = h.readline()
            old = old_line + h.read()

        h = bgzf.BgzfReader(new_file, "r")  # Text mode!
        new_line = h.readline()
        new = new_line + h.read(len(old))
        h.close()

        self.assertEqual(old_line, new_line)
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)
Example No. 17
    def check_text(self, old_file, new_file):
        h = open(old_file) #text mode!
        old_line = h.readline()
        old = old_line + h.read()
        h.close()

        h = bgzf.BgzfReader(new_file, "r") #Text mode!
        new_line = h.readline()
        new = new_line + h.read(len(old))
        h.close()

        self.assertEqual(old_line, new_line)
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)
Example No. 18
def open_haps(pathname):

    ##http://stackoverflow.com/questions/21529163/python-gzipped-fileinput-returns-binary-string-instead-of-text-string/21529243

    ext = os.path.splitext(pathname)[1]
    if ext == '.gz':
        # deliberate stop: plain gzip is not seekable, convert to BGZF first
        # (the original used an undefined name here to force a NameError)
        raise NotImplementedError('convert .gz input to BGZF (.bgz) first')
        f = gzip.open(pathname, 'rt')
    elif ext == '.bgz':
        f = bgzf.BgzfReader(pathname)
    else:
        f = open(pathname)

    return f
Example No. 19
def _open_for_random_access(filename):
    """Open a file in binary mode, spot if it is BGZF format etc (PRIVATE).

    This functionality is used by the Bio.SeqIO and Bio.SearchIO index
    and index_db functions.
    """
    handle = open(filename, "rb")
    from Bio import bgzf
    try:
        return bgzf.BgzfReader(mode="rb", fileobj=handle)
    except ValueError as e:
        assert "BGZF" in str(e)
        # Not a BGZF file after all, rewind to start:
        handle.seek(0)
    return handle
Example No. 20
 def sideEffect(self, filename, *args, **kwargs):
     if self.count <= 1:
         self.test.assertEqual('filename.fasta.bgz', filename)
         self.count += 1
         writerIO = BytesIO()
         writer = bgzf.BgzfWriter(fileobj=writerIO)
         writer.write(b'>id0\nAC\n')
         writer.flush()
         fileobj = BytesIO(writerIO.getvalue())
         fileobj.mode = 'rb'
         return bgzf.BgzfReader(fileobj=fileobj)
     else:
         self.test.fail(
             'Open called too many times. Filename: %r, Args: %r, '
             'Keyword args: %r.' % (filename, args, kwargs))
Example No. 21
  def open(self):
    assert self.f_map is None, "fp map already populated"
    self.f_map = {}

    if self.fq_path.endswith('.gz'):
      index_name = self.fq_path + "i"
      if not os.path.exists(index_name):
        raise Exception("Only BGZF compression is supported")

      handle = bgzf.BgzfReader(self.fq_path)
      self.gzipped = True
    else:
      handle = open(self.fq_path)
      self.gzipped = False

    self.f_map[self.fq_path] = handle
    return self
Example No. 22
def dinucleotide_to_count(logger, dinucleotide_file, count_file):
    logger.info("Counting %s ..." % dinucleotide_file)
    count_map = OrderedDict()
    lineCount = 0
    with bgzf.BgzfReader(dinucleotide_file, "r") as fin:
        for line in fin:
            lineCount += 1
            if lineCount % 100000 == 0:
                logger.info(lineCount)

            parts = line.rstrip().split('\t')
            chrom = parts[0]
            dinucleotide = parts[3]
            count = int(parts[4])
            chrom_map = count_map.setdefault(chrom, {})
            countVec = chrom_map.setdefault(dinucleotide, [0, 0])
            countVec[0] = countVec[0] + count
            countVec[1] = countVec[1] + 1
    write_count_file(logger, count_file, count_map)
Example No. 23
    def read_sites(self, logger, item):
        logger.info("Reading %s ..." % item.dinucleotide_file)
        curSites = {}
        line_count = 0
        with bgzf.BgzfReader(item.dinucleotide_file, "r") as fin:
            for line in fin:
                line_count += 1
                if line_count % 1000000 == 0:
                    logger.info(line_count)

                parts = line.rstrip().split('\t')

                # ignore the chromosome contigs
                if is_contig_reference(parts[0]):
                    continue

                key = "%s_%s_%s" % (parts[0], parts[1], parts[3])
                curSites[key] = parts[4]
        return curSites
Example No. 24
    def __call__(self, string):
        # the special argument "-" means sys.std{in,out}
        if string == '-':
            if 'r' in self._mode:
                return sys.stdin
            elif 'w' in self._mode:
                return sys.stdout
            else:
                raise ValueError('argument "-" with mode %r' % self._mode)

        # all other arguments are used as file names
        try:
            if string[-3:] == ".gz":
                from Bio import bgzf
                if 'r' in self._mode:
                    return bgzf.BgzfReader(string, self._mode)
                elif 'w' in self._mode or 'a' in self._mode:
                    return bgzf.BgzfWriter(string, self._mode)
            else:
                return open(string, self._mode, self._bufsize)
        except OSError as e:
            raise ArgumentTypeError("can't open '%s': %s" % (string, e))
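This opener mirrors argparse.FileType, so presumably it is registered as an argparse type. A hedged usage sketch, where BgzfFileType is a hypothetical name for the class this __call__ belongs to:

import argparse

parser = argparse.ArgumentParser()
# BgzfFileType is a hypothetical stand-in; any '.gz' argument is then
# opened through Bio.bgzf instead of plain open()
parser.add_argument("vcf", type=BgzfFileType("r"))
args = parser.parse_args(["calls.vcf.gz"])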
Example No. 25
    def check_by_char(self, old_file, new_file, old_gzip=False):
        if old_gzip:
            with gzip.open(old_file) as handle:
                old = handle.read()
        else:
            with open(old_file, "rb") as handle:
                old = handle.read()
        for mode in ["rb", "r"]:
            if "b" in mode:
                assert isinstance(old, bytes)
            else:
                # BGZF text mode is hard coded as latin1
                # and does not do universal new line mode
                old = old.decode("latin1")

            for cache in [1, 10]:
                h = bgzf.BgzfReader(new_file, mode, max_cache=cache)
                temp = []
                while True:
                    char = h.read(1)
                    if not char:
                        break
                    temp.append(char)
                if "b" in mode:
                    new = b"".join(temp)
                else:
                    new = "".join(temp)
                del temp
                h.close()

                self.assertEqual(len(old), len(new))
                # If bytes vs unicode mismatch, give a short error message:
                self.assertEqual(
                    old[:10], new[:10], "%r vs %r, mode %r" % (old[:10], new[:10], mode)
                )
                self.assertEqual(old, new)
Example No. 26
    def get_sequence(self, contig, start, end, strand=1, all_upper=False):
        """
        Return the genomic DNA sequence spanning [start, end) on contig.
        :param contig: The name of the contig on which the start and end coordinates are located
        :param start: The start location of the sequence to be returned (this endpoint is included in the interval)
        :param end: The end location of the sequence to be returned (this endpoint is not included in the interval)
        :param strand: The DNA strand of the sequence to be returned (-1 for negative strand, 1 for positive strand)
        :param all_upper: If true, return the sequence in all uppercase letters. Otherwise return lowercase letters
            for positions that are "soft-masked" (see https://genomevolution.org/wiki/index.php/Masked).
        :return: A string of DNA nucleotides of length end-start
        """
        if contig not in self._index:
            raise ContigNotFound(message='Contig {} not found'.format(contig),
                                 requested_contig=contig,
                                 valid_contigs=list(self._index.keys()))
        if start < 0:
            raise CoordinateOutOfBounds(
                message='Start coordinate below 0',
                problematic_coordinate=start,
                problem_with_start=True,
                coordinate_too_small=True,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if start > self.contig_lengths[contig]:
            raise CoordinateOutOfBounds(
                message='Start coordinate past end of contig',
                problematic_coordinate=start,
                problem_with_start=True,
                coordinate_too_small=False,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if end > self.contig_lengths[contig]:
            raise CoordinateOutOfBounds(
                message='End coordinate past end of contig',
                problematic_coordinate=end,
                problem_with_start=False,
                coordinate_too_small=False,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if end < 0:
            raise CoordinateOutOfBounds(
                message='End coordinate below 0',
                problematic_coordinate=end,
                problem_with_start=False,
                coordinate_too_small=True,
                valid_coordinate_range=(0, self.contig_lengths[contig]),
                current_contig=contig)
        if start >= end:
            raise InvalidCoordinates(start=start, end=end)

        query_length = end - start
        start_pos_file_distance = self._text_distance_to_file_distance(start)

        start_block = sorted(
            self._index[contig].search(start_pos_file_distance))[0]
        start_block_offset = start_block.data
        verbose_print('Retrieving sequence for {} [{},{}) ...'.format(
            contig, start, end))

        sequence_start_offset = start_pos_file_distance - start_block.begin

        retrieved_sequence = ''
        with bgzf.BgzfReader(self.bgzipped_fasta_filename, 'rt') as fasta_file:
            fasta_file.seek(
                bgzf.make_virtual_offset(start_block_offset,
                                         sequence_start_offset))
            while len(retrieved_sequence) < query_length:
                retrieved_sequence += fasta_file.readline().rstrip()
        trimmed_sequence = retrieved_sequence[:query_length]

        if all_upper:
            trimmed_sequence = trimmed_sequence.upper()

        if strand == -1:
            return reverse_complement(trimmed_sequence)
        else:
            return trimmed_sequence
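The seek above works because a BGZF virtual offset packs the compressed block's start position and the within-block offset into one integer (block_start << 16 | within_block). A small self-contained check with the real Bio.bgzf helpers:

from Bio import bgzf

# block starting at raw byte 59307 of the .bgz file, byte 42 of its
# decompressed data (values are arbitrary for illustration)
voffset = bgzf.make_virtual_offset(59307, 42)
assert voffset == (59307 << 16) | 42
assert bgzf.split_virtual_offset(voffset) == (59307, 42)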
Example No. 27
    def check_random(self, filename):
        """Check BGZF random access by reading blocks in forward & reverse order"""
        h = gzip.open(filename, "rb")
        old = h.read()
        h.close()

        h = open(filename, "rb")
        blocks = list(bgzf.BgzfBlocks(h))
        h.close()

        #Forward
        new = _empty_bytes_string
        h = bgzf.BgzfReader(filename, "rb")
        self.assertTrue(h.seekable())
        self.assertFalse(h.isatty())
        self.assertEqual(h.fileno(), h._handle.fileno())
        for start, raw_len, data_start, data_len in blocks:
            #print start, raw_len, data_start, data_len
            h.seek(bgzf.make_virtual_offset(start, 0))
            data = h.read(data_len)
            self.assertEqual(len(data), data_len)
            #self.assertEqual(start + raw_len, h._handle.tell())
            self.assertEqual(len(new), data_start)
            new += data
        h.close()
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)

        #Reverse
        new = _empty_bytes_string
        h = bgzf.BgzfReader(filename, "rb")
        for start, raw_len, data_start, data_len in blocks[::-1]:
            #print start, raw_len, data_start, data_len
            h.seek(bgzf.make_virtual_offset(start, 0))
            data = h.read(data_len)
            self.assertEqual(len(data), data_len)
            #self.assertEqual(start + raw_len, h._handle.tell())
            new = data + new
        h.close()
        self.assertEqual(len(old), len(new))
        self.assertEqual(old, new)

        #Jump back - non-sequential seeking
        if len(blocks) >= 3:
            h = bgzf.BgzfReader(filename, "rb", max_cache = 1)
            #Seek to a late block in the file,
            #half way into the third last block
            start, raw_len, data_start, data_len = blocks[-3]
            voffset = bgzf.make_virtual_offset(start, data_len // 2)
            h.seek(voffset)
            self.assertEqual(voffset, h.tell())
            data = h.read(1000)
            self.assertTrue(data in old)
            self.assertEqual(old.find(data), data_start + data_len // 2)
            #Now seek to an early block in the file,
            #half way into the second block
            start, raw_len, data_start, data_len = blocks[1]
            voffset = bgzf.make_virtual_offset(start, data_len // 2)
            h.seek(voffset)
            self.assertEqual(voffset, h.tell())
            #Now read all rest of this block and start of next block
            data = h.read(data_len + 1000)
            self.assertTrue(data in old)
            self.assertEqual(old.find(data), data_start + data_len // 2)
            h.close()

        #Check seek/tell at block boundaries
        v_offsets = []
        for start, raw_len, data_start, data_len in blocks:
            for within_offset in [0, 1, data_len // 2, data_len - 1]:
                if within_offset < 0 or data_len <= within_offset:
                    continue
                voffset = bgzf.make_virtual_offset(start, within_offset)
                real_offset = data_start + within_offset
                v_offsets.append((voffset, real_offset))
        shuffle(v_offsets)
        h = bgzf.BgzfReader(filename, "rb", max_cache = 1)
        for voffset, real_offset in v_offsets:
            h.seek(0)
            assert voffset >= 0 and real_offset >= 0
            self.assertEqual(h.read(real_offset), old[:real_offset])
            self.assertEqual(h.tell(), voffset)
        for voffset, real_offset in v_offsets:
            h.seek(voffset)
            self.assertEqual(h.tell(), voffset)
        h.close()
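For reference, bgzf.BgzfBlocks (used above to enumerate the blocks) takes a handle opened in plain binary mode and yields one (start, raw_length, data_start, data_length) tuple per compressed block. A minimal sketch, with "example.bgz" as a placeholder filename:

from Bio import bgzf

with open("example.bgz", "rb") as handle:
    for start, raw_len, data_start, data_len in bgzf.BgzfBlocks(handle):
        print("block at %i (%i bytes raw) holds decompressed bytes %i-%i"
              % (start, raw_len, data_start, data_start + data_len))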
Example No. 28
def unzip(file_in):
    #orchid_dict = SeqIO.index(file_in, "genbank")
    #print(len(orchid_dict))
    handle = bgzf.BgzfReader(file_in, "r")
    aa = handle.read(100000)
    print(aa)
Example No. 29
def calculate_chunks(filename, num_chunks):
    """
    Calculate the boundaries in the BAM file and partition into chunks.

    :param str filename: name of the BAM file
    :param int num_chunks: number of chunks to partition the boundaries into
    :return: a list of tuples containing the start and end boundaries
    """
    if num_chunks <= 0:
        raise ValueError("The number of chunks to calculate should be >= 1")

    if num_chunks == 1:
        # aln_file = pysam.AlignmentFile(filename)
        # header_size = bgzf.split_virtual_offset(aln_file.tell())[0]
        # aln_file.close()

        pr = ParseRecord(0, 0, 0, 0, -1, 0, 0)
        return [pr]

    try:
        f = open(filename, 'rb')  # binary mode: FastBgzfBlocks reads raw bgzf blocks
        # get all the block start offsets
        block_offsets = []
        decompressed_lengths = []
        i = 0

        for values in FastBgzfBlocks(f):
            block_offsets.append(values[0])
            decompressed_lengths.append(values[3])

            #if i % 50000 == 0:
            #   print  'Block {}'.format(i)
            i += 1

        # partition the starts into manageable chunks
        div, mod = divmod(len(block_offsets), num_chunks)

        fastq_fh = bgzf.BgzfReader(filename, 'r')
        header_size = 0
        partitioned_offsets = [(header_size, 0)]

        for i in range(1, num_chunks):
            index = div * i + min(i, mod)

            virtual_offset = bgzf.make_virtual_offset(block_offsets[index], 0)
            fastq_fh.seek(virtual_offset)
            line = fastq_fh.readline().strip()
            while line != '+':
                line = fastq_fh.readline().strip()
            quality_line = fastq_fh.readline()
            virtual_offset = fastq_fh.tell()

            # block start & within block offset
            partitioned_offsets.append(
                bgzf.split_virtual_offset(virtual_offset))

        fastq_fh.close()

        # now let's calculate beginning and ends
        params = []

        for i, offset in enumerate(partitioned_offsets):
            index = block_offsets.index(partitioned_offsets[i][0])
            begin_read_offset = 0
            begin_read_size = 0
            file_offset = 0
            file_bytes = 0
            end_read_offset = 0
            end_read_size = 0

            if i == 0:
                # first
                begin_read_offset = 0
                begin_read_size = 0
                file_offset = block_offsets[index]
                # print 'file_offset=', file_offset
                file_bytes = partitioned_offsets[i + 1][0] - file_offset
                # print 'file_bytes=', file_bytes
                end_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i + 1][0], 0)
                end_read_size = partitioned_offsets[i + 1][1]
            elif i == num_chunks - 1:
                # last
                begin_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i][0], partitioned_offsets[i][1])
                begin_read_size = decompressed_lengths[index] - \
                                  partitioned_offsets[i][1]
                file_offset = block_offsets[index + 1]
                file_bytes = -1
                end_read_offset = 0
                end_read_size = 0
            else:
                # all others
                if offset[1] == 0:
                    # chunk boundary fell exactly on a bgzf block boundary,
                    # which the offset bookkeeping below does not handle
                    print('unexpected bgzf block boundary in chunk {}'.format(i))
                    return

                begin_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i][0], partitioned_offsets[i][1])
                begin_read_size = decompressed_lengths[index] - \
                                  partitioned_offsets[i][1]
                file_offset = block_offsets[index + 1]
                file_bytes = partitioned_offsets[i + 1][0] - file_offset

                end_read_offset = bgzf.make_virtual_offset(
                    partitioned_offsets[i + 1][0], 0)
                end_read_size = partitioned_offsets[i + 1][1]

            pr = ParseRecord(header_size, begin_read_offset, begin_read_size,
                             file_offset, file_bytes, end_read_offset,
                             end_read_size)
            params.append(pr)

        return params

    except Exception as e:
        print('calculate_chunks error: {}'.format(str(e)))
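Examples No. 1 and No. 29 both rely on a ParseRecord type that is never shown. Judging from the positional ParseRecord(...) calls above, a minimal reconstruction could be a namedtuple; the field order is inferred, so treat this as a guess:

from collections import namedtuple

# hypothetical reconstruction; field order taken from the positional
# ParseRecord(...) calls in calculate_chunks above
ParseRecord = namedtuple('ParseRecord', [
    'header_size', 'begin_read_offset', 'begin_read_size',
    'file_offset', 'file_bytes', 'end_read_offset', 'end_read_size',
])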