def __init__(self, file_name, range_tracker, file_pattern, compression_type,
             allow_malformed_records, **kwargs):
    """Set up line reading and VCF header parsing for a single file.

    Args:
      file_name: Name of the file to read. Passed to
        ``TextSource.read_records`` and included in error messages.
      range_tracker: Range tracker passed to ``TextSource.read_records``.
      file_pattern: File pattern used to construct the ``TextSource``.
      compression_type: Compression type forwarded to ``TextSource``.
      allow_malformed_records: Stored on the instance as
        ``_allow_malformed_records``; not otherwise used in this method.
      **kwargs: Extra keyword arguments forwarded to ``TextSource``.

    Raises:
      ValueError: If constructing ``vcf.Reader`` raises ``SyntaxError``.
        The message contains the file name and the formatted traceback of
        the original error.
    """
    self._header_lines = []
    self._last_record = None
    self._file_name = file_name
    self._allow_malformed_records = allow_malformed_records

    # Lines starting with '#' are matched as header lines and handed to
    # self._store_header_lines instead of being yielded as records.
    text_source = TextSource(
        file_pattern,
        0,  # min_bundle_size
        compression_type,
        True,  # strip_trailing_newlines
        coders.StrUtf8Coder(),  # coder
        validate=False,
        header_processor_fns=(lambda x: x.startswith('#'),
                              self._store_header_lines),
        **kwargs)

    self._text_lines = text_source.read_records(self._file_name,
                                                range_tracker)
    try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
    except SyntaxError:
        # Throw the exception inside the generator to ensure file is properly
        # closed (it's opened inside TextSource.read_records).
        #
        # traceback.format_exc() takes no exception argument: its first
        # positional parameter is `limit`. Passing the exception there makes
        # Python 3 raise TypeError and hide the ValueError built below.
        self._text_lines.throw(
            ValueError('An exception was raised when reading header from VCF '
                       'file %s: %s' % (self._file_name,
                                        traceback.format_exc())))
def __init__(self, file_name, range_tracker, file_pattern, compression_type,
             allow_malformed_records, **kwargs):
    """Initialize the reader state and parse the VCF header of one file.

    Args:
      file_name: Name of the file to read. Passed to
        ``TextSource.read_records`` and included in error messages.
      range_tracker: Range tracker passed to ``TextSource.read_records``.
      file_pattern: File pattern used to construct the ``TextSource``.
      compression_type: Compression type forwarded to ``TextSource``.
      allow_malformed_records: Stored on the instance as
        ``_allow_malformed_records``; not otherwise used in this method.
      **kwargs: Extra keyword arguments forwarded to ``TextSource``.

    Raises:
      ValueError: If constructing ``vcf.Reader`` raises ``SyntaxError``.
        The message contains the file name and the formatted traceback of
        the original error.
    """
    self._header_lines = []
    self._last_record = None
    self._file_name = file_name
    self._allow_malformed_records = allow_malformed_records

    # Header handling: any line beginning with '#' is passed to
    # self._store_header_lines rather than being returned as a record.
    text_source = TextSource(
        file_pattern,
        0,  # min_bundle_size
        compression_type,
        True,  # strip_trailing_newlines
        coders.StrUtf8Coder(),  # coder
        validate=False,
        header_processor_fns=(lambda x: x.startswith('#'),
                              self._store_header_lines),
        **kwargs)

    self._text_lines = text_source.read_records(self._file_name,
                                                range_tracker)
    try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
    except SyntaxError:
        # Throw the exception inside the generator to ensure file is properly
        # closed (it's opened inside TextSource.read_records).
        #
        # format_exc() must be called without the exception: its first
        # positional parameter is `limit`, and an exception object there
        # triggers a TypeError on Python 3 instead of the ValueError below.
        self._text_lines.throw(
            ValueError('An exception was raised when reading header from VCF '
                       'file %s: %s' % (self._file_name,
                                        traceback.format_exc())))
def test_header_processing(self):
    """Header lines reach the header processor; the rest are read as records.

    The first five written lines are treated as the header. They must all be
    delivered to the header callback, and only the remaining lines may come
    back from ``read_records``.
    """
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10

    expected_headers = expected_data[:5]
    expected_records = expected_data[5:]
    collected_headers = []

    def is_header(line):
        return line in expected_headers

    source = TextSource(
        file_name,
        0,
        CompressionTypes.UNCOMPRESSED,
        True,
        coders.StrUtf8Coder(),
        # (matcher, processor): matched lines are appended to the list.
        header_processor_fns=(is_header, collected_headers.extend))

    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1

    bundle = splits[0]
    range_tracker = bundle.source.get_range_tracker(bundle.start_position,
                                                    bundle.stop_position)
    read_data = list(source.read_records(file_name, range_tracker))

    self.assertCountEqual(expected_headers, collected_headers)
    self.assertCountEqual(expected_records, read_data)
def __init__(self, file_name, range_tracker, file_pattern, compression_type,
             **kwargs):
    """Set up line reading and VCF header parsing for a single file.

    Args:
      file_name: Name of the file to read. Passed to
        ``TextSource.read_records``.
      range_tracker: Range tracker passed to ``TextSource.read_records``.
      file_pattern: File pattern used to construct the ``TextSource``.
      compression_type: Compression type forwarded to ``TextSource``.
      **kwargs: Extra keyword arguments forwarded to ``TextSource``.

    Raises:
      ValueError: If constructing ``vcf.Reader`` raises ``SyntaxError``.
        The message is ``'Invalid VCF header '`` followed by the text of
        the original error.
    """
    self._header_lines = []
    self._last_record = None
    self._file_name = file_name

    # Lines starting with '#' are matched as header lines and handed to
    # self._store_header_lines instead of being yielded as records.
    text_source = TextSource(
        file_pattern,
        0,  # min_bundle_size
        compression_type,
        True,  # strip_trailing_newlines
        coders.StrUtf8Coder(),  # coder
        validate=False,
        header_processor_fns=(lambda x: x.startswith('#'),
                              self._store_header_lines),
        **kwargs)

    self._text_lines = text_source.read_records(self._file_name,
                                                range_tracker)
    try:
        self._vcf_reader = vcf.Reader(fsock=self._create_generator())
    except SyntaxError as e:
        # Raise through the record generator rather than directly, so that a
        # partially consumed generator is unwound and can release whatever
        # it holds open. NOTE(review): assumes read_records returns a
        # generator (supports .throw) - confirm against TextSource.
        # The exception type and message seen by callers are unchanged.
        self._text_lines.throw(
            ValueError('Invalid VCF header %s' % str(e)))
def test_header_processing(self):
    """Check that TextSource separates header lines from data records.

    Ten lines are written; the first five count as header lines. The header
    callback must receive exactly those five, and ``read_records`` must
    return exactly the other five.
    """
    file_name, expected_data = write_data(10)
    assert len(expected_data) == 10

    header_part = expected_data[:5]
    seen_headers = []

    def matches_header(line):
        return line in header_part

    def remember_headers(lines):
        seen_headers.extend(lines)

    processor_fns = (matches_header, remember_headers)
    source = TextSource(file_name, 0, CompressionTypes.UNCOMPRESSED, True,
                        coders.StrUtf8Coder(),
                        header_processor_fns=processor_fns)

    splits = list(source.split(desired_bundle_size=100000))
    assert len(splits) == 1

    only_split = splits[0]
    range_tracker = only_split.source.get_range_tracker(
        only_split.start_position, only_split.stop_position)
    read_data = list(source.read_records(file_name, range_tracker))

    self.assertCountEqual(header_part, seen_headers)
    self.assertCountEqual(expected_data[5:], read_data)