Exemple #1
0
    def __init__(self,
                 file_name,
                 range_tracker,
                 file_pattern,
                 compression_type,
                 allow_malformed_records,
                 **kwargs):
      """Sets up the text-line stream and the VCF reader for one file.

      Args:
        file_name: Name of the file whose records are read.
        range_tracker: Range tracker passed to ``TextSource.read_records``.
        file_pattern: File pattern used to construct the ``TextSource``.
        compression_type: Compression type forwarded to ``TextSource``.
        allow_malformed_records: Stored on the instance; this constructor
          does not act on it.
        **kwargs: Extra keyword arguments forwarded to ``TextSource``.

      Raises:
        ValueError: Thrown into the line generator (and expected to propagate
          out of it) when ``vcf.Reader`` raises ``SyntaxError`` while reading
          the header.
      """
      self._header_lines = []
      self._last_record = None
      self._file_name = file_name
      self._allow_malformed_records = allow_malformed_records

      # Lines starting with '#' are treated as header lines and handed to
      # self._store_header_lines instead of being emitted as records.
      text_source = TextSource(
          file_pattern,
          0,  # min_bundle_size
          compression_type,
          True,  # strip_trailing_newlines
          coders.StrUtf8Coder(),  # coder
          validate=False,
          header_processor_fns=(lambda x: x.startswith('#'),
                                self._store_header_lines),
          **kwargs)

      self._text_lines = text_source.read_records(self._file_name,
                                                  range_tracker)
      try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
      except SyntaxError:
        # Throw the exception inside the generator to ensure file is properly
        # closed (it's opened inside TextSource.read_records).
        # NOTE: format_exc() must be called without the exception object; its
        # first positional parameter is `limit`, and passing an exception
        # there fails on Python 3 before the ValueError can be built.
        self._text_lines.throw(
            ValueError('An exception was raised when reading header from VCF '
                       'file %s: %s' % (self._file_name,
                                        traceback.format_exc())))
Exemple #2
0
    def __init__(self,
                 file_name,
                 range_tracker,
                 file_pattern,
                 compression_type,
                 allow_malformed_records,
                 **kwargs):
      """Sets up the text-line stream and the VCF reader for one file.

      Args:
        file_name: Name of the file whose records are read.
        range_tracker: Range tracker passed to ``TextSource.read_records``.
        file_pattern: File pattern used to construct the ``TextSource``.
        compression_type: Compression type forwarded to ``TextSource``.
        allow_malformed_records: Stored on the instance; this constructor
          does not act on it.
        **kwargs: Extra keyword arguments forwarded to ``TextSource``.

      Raises:
        ValueError: Thrown into the line generator (and expected to propagate
          out of it) when ``vcf.Reader`` raises ``SyntaxError`` while reading
          the header.
      """
      self._header_lines = []
      self._last_record = None
      self._file_name = file_name
      self._allow_malformed_records = allow_malformed_records

      # Lines starting with '#' are treated as header lines and handed to
      # self._store_header_lines instead of being emitted as records.
      text_source = TextSource(
          file_pattern,
          0,  # min_bundle_size
          compression_type,
          True,  # strip_trailing_newlines
          coders.StrUtf8Coder(),  # coder
          validate=False,
          header_processor_fns=(lambda x: x.startswith('#'),
                                self._store_header_lines),
          **kwargs)

      self._text_lines = text_source.read_records(self._file_name,
                                                  range_tracker)
      try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
      except SyntaxError:
        # Throw the exception inside the generator to ensure file is properly
        # closed (it's opened inside TextSource.read_records).
        # NOTE: format_exc() must be called without the exception object; its
        # first positional parameter is `limit`, and passing an exception
        # there fails on Python 3 before the ValueError can be built.
        self._text_lines.throw(
            ValueError('An exception was raised when reading header from VCF '
                       'file %s: %s' % (self._file_name,
                                        traceback.format_exc())))
Exemple #3
0
    def test_header_processing(self):
        """Header lines go to the header processor, the rest are records.

        Ten lines are written; any line equal to one of the first five is
        matched as a header. The test then checks that the header callback
        received exactly those five lines and that reading the single split
        yields the remaining five.
        """
        file_name, expected_data = write_data(10)
        assert len(expected_data) == 10

        expected_headers = expected_data[:5]
        collected_headers = []

        def is_header(line):
            return line in expected_headers

        def collect_headers(lines):
            collected_headers.extend(lines)

        source = TextSource(
            file_name,
            0,
            CompressionTypes.UNCOMPRESSED,
            True,
            coders.StrUtf8Coder(),
            header_processor_fns=(is_header, collect_headers))

        bundles = list(source.split(desired_bundle_size=100000))
        assert len(bundles) == 1
        bundle = bundles[0]
        tracker = bundle.source.get_range_tracker(bundle.start_position,
                                                  bundle.stop_position)
        records = list(source.read_records(file_name, tracker))

        self.assertCountEqual(expected_headers, collected_headers)
        self.assertCountEqual(expected_data[5:], records)
Exemple #4
0
        def __init__(self, file_name, range_tracker, file_pattern,
                     compression_type, **kwargs):
            """Prepare the text-line stream and the VCF reader for one file.

            Args:
                file_name: Name of the file whose records are read.
                range_tracker: Range tracker passed to
                    ``TextSource.read_records``.
                file_pattern: File pattern used to build the ``TextSource``.
                compression_type: Compression type forwarded to
                    ``TextSource``.
                **kwargs: Extra keyword arguments forwarded to ``TextSource``.

            Raises:
                ValueError: If ``vcf.Reader`` raises ``SyntaxError`` while
                    being constructed from the line generator.
            """
            self._file_name = file_name
            self._last_record = None
            self._header_lines = []

            def is_header_line(line):
                # Lines beginning with '#' are routed to the header processor.
                return line.startswith('#')

            source = TextSource(
                file_pattern,
                0,  # min_bundle_size
                compression_type,
                True,  # strip_trailing_newlines
                coders.StrUtf8Coder(),  # coder
                validate=False,
                header_processor_fns=(is_header_line,
                                      self._store_header_lines),
                **kwargs)
            self._text_lines = source.read_records(self._file_name,
                                                   range_tracker)

            try:
                line_stream = self._create_generator()
                self._vcf_reader = vcf.Reader(fsock=line_stream)
            except SyntaxError as e:
                message = 'Invalid VCF header %s' % str(e)
                raise ValueError(message)
  def test_header_processing(self):
    """Header lines go to the header processor, the rest are records.

    Ten lines are written; any line equal to one of the first five is
    matched as a header. The test then checks that the header callback
    received exactly those five lines and that reading the single split
    yields the remaining five.
    """
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10

    expected_headers = expected_data[:5]
    collected_headers = []

    def is_header(line):
      return line in expected_headers

    def collect_headers(lines):
      collected_headers.extend(lines)

    source = TextSource(
        file_name,
        0,
        CompressionTypes.UNCOMPRESSED,
        True,
        coders.StrUtf8Coder(),
        header_processor_fns=(is_header, collect_headers))

    bundles = list(source.split(desired_bundle_size=100000))
    assert len(bundles) == 1
    bundle = bundles[0]
    tracker = bundle.source.get_range_tracker(bundle.start_position,
                                              bundle.stop_position)
    records = list(source.read_records(file_name, tracker))

    self.assertCountEqual(expected_headers, collected_headers)
    self.assertCountEqual(expected_data[5:], records)