Exemple #1
0
    def __init__(self,
                 input_path,
                 ref_path=None,
                 read_requirements=None,
                 parse_aux_fields=False,
                 hts_block_size=None,
                 downsample_fraction=None,
                 random_seed=None):
        """Initializes a NativeSamReader.

        Paths ending in '.tfbam' are delegated to the optional tfbam_lib
        plugin; every other path is opened with sam_reader.SamReader.from_file.
        Note that on the '.tfbam' path ref_path and parse_aux_fields are not
        forwarded to the plugin, and self.header is not set by this method.

        Args:
          input_path: str. A path to a resource containing SAM/BAM/CRAM
            records. Currently supports SAM text format, BAM binary format,
            and CRAM.
          ref_path: optional str or None. Only used for CRAM decoding, and only
            necessary if the UR encoded path in the CRAM itself needs to be
            overridden. If provided, we will tell the CRAM decoder to use this
            FASTA for the reference sequence.
          read_requirements: optional ReadRequirement proto. If not None, this
            proto is used to control which reads are filtered out by the
            reader before they are passed to the client.
          parse_aux_fields: optional bool, defaulting to False. If False, we do
            not parse the auxiliary fields of the SAM/BAM/CRAM records (see SAM
            spec for details). Parsing the aux fields is unnecessary for many
            applications, and adds a significant parsing cost to access. If
            you need these aux fields, set parse_aux_fields to True and these
            fields will be parsed and populate the appropriate Read proto
            fields (e.g., read.info).
          hts_block_size: int or None. If specified, this configures the block
            size of the underlying htslib file object. Larger values (e.g. 1M)
            may be beneficial for reading remote files. If None, the reader
            uses the default htslib block size.
          downsample_fraction: float in the interval [0.0, 1.0] or None. If
            specified as a positive float, the reader will only keep each read
            with probability downsample_fraction, randomly. If None or zero,
            all reads are kept.
          random_seed: None or int. The random seed to use with this sam
            reader, if needed. If None, a fixed random value will be assigned.

        Raises:
          ValueError: If downsample_fraction is neither None nor zero and lies
            outside the interval (0.0, 1.0]. This check is only performed for
            non-'.tfbam' inputs.
          ImportError: If input_path ends with '.tfbam' and the tfbam_lib
            module cannot be imported.
        """
        if input_path.endswith('.tfbam'):
            # Delayed loading of tfbam_lib. Only the import itself is guarded, so
            # an ImportError raised while the plugin builds the reader is not
            # misreported as a missing tfbam_lib module.
            try:
                from tfbam_lib import tfbam_reader  # pylint: disable=g-import-not-at-top
            except ImportError:
                raise ImportError(
                    'tfbam_lib module not found, cannot read .tfbam files.')
            self._reader = tfbam_reader.make_sam_reader(
                input_path,
                read_requirements=read_requirements,
                unused_block_size=hts_block_size,
                downsample_fraction=downsample_fraction,
                random_seed=random_seed)
        else:
            if parse_aux_fields:
                aux_field_handling = reads_pb2.SamReaderOptions.PARSE_ALL_AUX_FIELDS
            else:
                aux_field_handling = reads_pb2.SamReaderOptions.SKIP_AUX_FIELDS

            # We make 0 be a valid value that means "keep all reads" so that proto
            # defaults (=0) do not omit all reads; only non-zero values are
            # range-checked.
            if (downsample_fraction is not None and downsample_fraction != 0 and
                    not 0.0 < downsample_fraction <= 1.0):
                raise ValueError(
                    'downsample_fraction must be in the interval (0.0, 1.0]',
                    downsample_fraction)

            if random_seed is None:
                # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
                random_seed = 2928130004

            self._reader = sam_reader.SamReader.from_file(
                input_path.encode('utf8'),
                # An empty reference path is passed when no override is given.
                ref_path.encode('utf8') if ref_path is not None else '',
                reads_pb2.SamReaderOptions(
                    read_requirements=read_requirements,
                    aux_field_handling=aux_field_handling,
                    # A block size of None is forwarded as 0.
                    hts_block_size=(hts_block_size or 0),
                    downsample_fraction=downsample_fraction,
                    random_seed=random_seed))

            self.header = self._reader.header

        super(NativeSamReader, self).__init__()
Exemple #2
0
def make_sam_reader(reads_source,
                    read_requirements=None,
                    use_index=True,
                    parse_aux_fields=False,
                    hts_block_size=None,
                    downsample_fraction=None,
                    random_seed=None):
  """Creates a SamReader for reads_source.

  This function creates a SAM/BAM reader from reads_source, configured by the
  additional arguments to this function. This returned reader object supports
  the iterate() and query(range) APIs.

  Paths ending in '.tfbam' are delegated to the optional tfbam_lib plugin; in
  that case parse_aux_fields is not forwarded and downsample_fraction is not
  range-checked here.

  Args:
    reads_source: string. A path to a resource containing SAM/BAM records.
      Currently supports SAM text format and BAM binary format.
    read_requirements: optional ReadRequirement proto. If not None, this proto
      is used to control which reads are filtered out by the reader before they
      are passed to the client.
    use_index: optional bool, defaulting to True. If True, we will attempt to
      load an index file for reads_source to enable the query() API call. If
      True an index file must exist. If False, we will not attempt to load an
      index for reads_source, disabling the query() call.
    parse_aux_fields: optional bool. If False, the default, we will not parse
      the auxiliary fields of the SAM/BAM records (see SAM spec for details).
      Parsing the aux fields is often unnecessary for many applications, and
      adds a significant parsing cost to access. If you need these aux fields,
      set parse_aux_fields to True and these fields will be parsed and populate
      the appropriate Read proto fields (e.g., read.info).
    hts_block_size: integer or None.  If None, will use the default htslib
      block size.  Otherwise, will configure the underlying block size of the
      underlying htslib file object.  Larger values (e.g. 1M) may be beneficial
      for reading remote files.
    downsample_fraction: None or float in the interval (0.0, 1.0]. If not None,
      this sam_reader will only keep each read with probability
      downsample_fraction, randomly. A value of 0 is not range-checked and is
      forwarded to the reader options unchanged.
    random_seed: None or int. The random seed to use with this sam reader, if
      needed. If None, a fixed random value will be assigned. An explicit seed
      of 0 is honored as given.

  Returns:
    A sam_reader object. The exact class implementing this API is not specified.

  Raises:
    ValueError: If downsample_fraction is neither None nor zero and lies outside
      the interval (0.0, 1.0].
    ImportError: If reads_source ends with '.tfbam' and the tfbam_lib module
      cannot be imported.
  """
  if reads_source.endswith('.tfbam'):
    # Delay load tfbam_lib. This is a simple plugin mechanism. Only the import
    # itself is guarded, so an ImportError raised while the plugin builds the
    # reader is not misreported as a missing tfbam_lib module.
    try:
      from tfbam_lib import tfbam_reader  # pylint: disable=g-import-not-at-top
    except ImportError:
      raise ImportError('tfbam_lib module not found, cannot read .tfbam files.')
    return tfbam_reader.make_sam_reader(
        reads_source,
        read_requirements=read_requirements,
        use_index=use_index,
        unused_block_size=hts_block_size,
        downsample_fraction=downsample_fraction,
        random_seed=random_seed)

  # Validate the caller's arguments before building any reader options so that
  # bad input fails fast. A falsy fraction (None or 0) skips the range check.
  if downsample_fraction and not 0.0 < downsample_fraction <= 1.0:
    raise ValueError('downsample_fraction must be in the interval (0.0, 1.0]',
                     downsample_fraction)

  # Compare against None explicitly: a truthiness test would silently replace
  # an explicit seed of 0 with the default below.
  if random_seed is None:
    # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
    random_seed = 2928130004

  if parse_aux_fields:
    aux_field_handling = core_pb2.SamReaderOptions.PARSE_ALL_AUX_FIELDS
  else:
    aux_field_handling = core_pb2.SamReaderOptions.SKIP_AUX_FIELDS

  if use_index:
    index_mode = core_pb2.INDEX_BASED_ON_FILENAME
  else:
    index_mode = core_pb2.DONT_USE_INDEX

  return sam_reader_.SamReader.from_file(
      reads_source.encode('utf8'),
      core_pb2.SamReaderOptions(
          read_requirements=read_requirements,
          index_mode=index_mode,
          aux_field_handling=aux_field_handling,
          # A block size of None is forwarded as 0.
          hts_block_size=(hts_block_size or 0),
          downsample_fraction=downsample_fraction,
          random_seed=random_seed))
Exemple #3
0
  def __init__(self, input_path,
               use_index=True,
               read_requirements=None,
               parse_aux_fields=False,
               hts_block_size=None,
               downsample_fraction=None,
               random_seed=None):
    """Initializes a NativeSamReader.

    Paths ending in '.tfbam' are delegated to the optional tfbam_lib plugin;
    every other path is opened with sam_reader.SamReader.from_file. Note that
    on the '.tfbam' path parse_aux_fields is not forwarded to the plugin, and
    self.header is not set by this method.

    Args:
      input_path: string. A path to a resource containing SAM/BAM records.
        Currently supports SAM text format and BAM binary format.
      use_index: optional bool, defaulting to True. If True, we will attempt to
        load an index file for input_path to enable the query() API call. If
        True an index file must exist. If False, we will not attempt to load an
        index for input_path, disabling the query() call.
      read_requirements: optional ReadRequirement proto. If not None, this proto
        is used to control which reads are filtered out by the reader before
        they are passed to the client.
      parse_aux_fields: optional bool. If False, the default, we will not parse
        the auxiliary fields of the SAM/BAM records (see SAM spec for details).
        Parsing the aux fields is often unnecessary for many applications, and
        adds a significant parsing cost to access. If you need these aux fields,
        set parse_aux_fields to True and these fields will be parsed and
        populate the appropriate Read proto fields (e.g., read.info).
      hts_block_size: integer or None.  If None, will use the default htslib
        block size.  Otherwise, will configure the underlying block size of the
        underlying htslib file object.  Larger values (e.g. 1M) may be
        beneficial for reading remote files.
      downsample_fraction: None or float in the interval [0.0, 1.0]. If not
        None or 0.0, the reader will only keep each read with probability
        downsample_fraction, randomly.
      random_seed: None or int. The random seed to use with this sam reader, if
        needed. If None, a fixed random value will be assigned.

    Raises:
      ValueError: If downsample_fraction is neither None nor zero and lies
        outside the interval (0.0, 1.0]. This check is only performed for
        non-'.tfbam' inputs.
      ImportError: If input_path ends with '.tfbam' and the tfbam_lib module
        cannot be imported.
    """
    if input_path.endswith('.tfbam'):
      # Delayed loading of tfbam_lib. Only the import itself is guarded, so an
      # ImportError raised while the plugin builds the reader is not
      # misreported as a missing tfbam_lib module.
      try:
        from tfbam_lib import tfbam_reader  # pylint: disable=g-import-not-at-top
      except ImportError:
        raise ImportError(
            'tfbam_lib module not found, cannot read .tfbam files.')
      self._reader = tfbam_reader.make_sam_reader(
          input_path,
          read_requirements=read_requirements,
          use_index=use_index,
          unused_block_size=hts_block_size,
          downsample_fraction=downsample_fraction,
          random_seed=random_seed)
    else:
      if use_index:
        index_mode = index_pb2.INDEX_BASED_ON_FILENAME
      else:
        index_mode = index_pb2.DONT_USE_INDEX

      if parse_aux_fields:
        aux_field_handling = reads_pb2.SamReaderOptions.PARSE_ALL_AUX_FIELDS
      else:
        aux_field_handling = reads_pb2.SamReaderOptions.SKIP_AUX_FIELDS

      # A falsy fraction (None or 0) skips the range check and is forwarded to
      # the reader options unchanged.
      if downsample_fraction and not 0.0 < downsample_fraction <= 1.0:
        raise ValueError(
            'downsample_fraction must be in the interval (0.0, 1.0]',
            downsample_fraction)

      if random_seed is None:
        # Fixed random seed produced with 'od -vAn -N4 -tu4 < /dev/urandom'.
        random_seed = 2928130004

      self._reader = sam_reader.SamReader.from_file(
          input_path.encode('utf8'),
          reads_pb2.SamReaderOptions(
              read_requirements=read_requirements,
              index_mode=index_mode,
              aux_field_handling=aux_field_handling,
              # A block size of None is forwarded as 0.
              hts_block_size=(hts_block_size or 0),
              downsample_fraction=downsample_fraction,
              random_seed=random_seed))

      self.header = self._reader.header

    super(NativeSamReader, self).__init__()