class ReadFromBGZF(beam.PTransform):
    """Reads variants from BGZF."""

    def __init__(self,
                 input_files,
                 representative_header_lines,
                 allow_malformed_records,
                 pre_infer_headers,
                 sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
                 use_1_based_coordinate=False):
        # type: (List[str], List[str], bool, bool, int, bool) -> None
        """Initializes the transform.

        Args:
          input_files: The BGZF file paths to read from.
          representative_header_lines: Header definitions to be used for
            parsing VCF files.
          allow_malformed_records: If true, malformed records from VCF files
            will be returned as `MalformedVcfRecord` instead of failing the
            pipeline.
          pre_infer_headers: If true, drop headers and make sure PySam returns
            the exact data for variants and calls, without type matching.
          sample_name_encoding: Specifies how sample_name is encoded, mainly
            to deal with the same sample_name being used across multiple VCF
            files.
          use_1_based_coordinate: Specifies whether the coordinates should be
            stored in BQ using 0 based exclusive (default) or 1 based
            inclusive coordinates.
        """
        self._input_files = input_files
        self._representative_header_lines = representative_header_lines
        self._allow_malformed_records = allow_malformed_records
        self._pre_infer_headers = pre_infer_headers
        self._sample_name_encoding = sample_name_encoding
        self._use_1_based_coordinate = use_1_based_coordinate

    def _read_records(self, file_path_and_block_tuple):
        # type: (Tuple[str, Block]) -> Iterable[Variant]
        """Reads records from `file_path` in `block`.

        Args:
          file_path_and_block_tuple: A `(file_path, block)` pair. It is taken
            as a single argument and unpacked in the body because tuple
            parameters in a `def` signature are a syntax error on Python 3
            (PEP 3113).

        Yields:
          Each record produced by `vcf_parser.PySamParser` for the block.
        """
        (file_path, block) = file_path_and_block_tuple
        record_iterator = vcf_parser.PySamParser(
            file_path,
            block,
            filesystems.CompressionTypes.GZIP,
            self._allow_malformed_records,
            representative_header_lines=self._representative_header_lines,
            splittable_bgzf=True,
            pre_infer_headers=self._pre_infer_headers,
            sample_name_encoding=self._sample_name_encoding,
            use_1_based_coordinate=self._use_1_based_coordinate)
        for record in record_iterator:
            yield record
def _read_records(self, file_path_and_block_tuple):
    # type: (Tuple[str, Block]) -> Iterable[Variant]
    """Yields the records parsed from one block of a BGZF file.

    Args:
      file_path_and_block_tuple: A `(file_path, block)` pair; both elements
        are forwarded unchanged to `vcf_parser.PySamParser`.

    Yields:
      Every record the parser produces, in the parser's iteration order.
    """
    file_path, block = file_path_and_block_tuple
    # Keyword options are collected first so the positional arguments stand
    # out in the constructor call below.
    parser_options = {
        'representative_header_lines': self._representative_header_lines,
        'splittable_bgzf': True,
        'pre_infer_headers': self._pre_infer_headers,
        'sample_name_encoding': self._sample_name_encoding,
        'use_1_based_coordinate': self._use_1_based_coordinate,
    }
    parser = vcf_parser.PySamParser(
        file_path,
        block,
        filesystems.CompressionTypes.GZIP,
        self._allow_malformed_records,
        **parser_options)
    for parsed_record in parser:
        yield parsed_record
def read_records(self,
                 file_name,  # type: str
                 range_tracker  # type: range_trackers.OffsetRangeTracker
                ):
    # type: (...) -> Iterable[MalformedVcfRecord]
    """Yields the records parsed from `file_name`.

    Args:
      file_name: Path of the file to parse; passed straight to
        `vcf_parser.PySamParser`.
      range_tracker: Range tracker handed to the parser together with the
        file name.

    Yields:
      Every record the parser produces, in the parser's iteration order.
    """
    parser_options = {
        'file_pattern': self._pattern,
        'representative_header_lines': self._representative_header_lines,
        'pre_infer_headers': self._pre_infer_headers,
        'sample_name_encoding': self._sample_name_encoding,
        'use_1_based_coordinate': self._use_1_based_coordinate,
        'buffer_size': self._buffer_size,
        'skip_header_lines': 0,
    }
    parser = vcf_parser.PySamParser(
        file_name,
        range_tracker,
        self._compression_type,
        self._allow_malformed_records,
        **parser_options)
    # Re-yield instead of returning the parser object so that callers always
    # receive a plain generator, whatever iterator type the parser is.
    for parsed_record in parser:
        yield parsed_record