Python _TextSource Examples, apache_beam.io.textio._TextSource Python Examples

Example #1

0

Show file

File: vcf_parser.py Project: vijbon01/gcp-variant-transforms

    def __init__(
            self,
            file_name,  # type: str
            range_tracker,  # type: range_trackers.OffsetRangeTracker
            file_pattern,  # type: str
            compression_type,  # type: str
            allow_malformed_records,  # type: bool
            representative_header_lines=None,  # type:  List[str]
            **kwargs  # type: **str
    ):
        # type: (...) -> None
        """Stores the reader settings and starts reading lines of `file_name`.

        Args:
          file_name: File handed to `read_records()` of the text source.
          range_tracker: Tracker handed to `read_records()` with `file_name`.
          file_pattern: Pattern the underlying `_TextSource` is built from.
          compression_type: Compression type forwarded to `_TextSource`.
          allow_malformed_records: Kept on the instance; not used in this
            method.
          representative_header_lines: Optional header lines. When given,
            header lines found in `file_name` are ignored; refer to the
            _process_header_lines() logic.
          **kwargs: Extra keyword arguments forwarded to `_TextSource`.
        """
        self._file_name = file_name
        self._allow_malformed_records = allow_malformed_records
        self._representative_header_lines = representative_header_lines

        def _is_header_line(line):
            # Blank (whitespace-only) lines and '#'-prefixed lines both count
            # as header lines.
            if not line.strip():
                return True
            return line.startswith('#')

        min_bundle_size = 0
        strip_trailing_newlines = True
        source = textio._TextSource(
            file_pattern,
            min_bundle_size,
            compression_type,
            strip_trailing_newlines,
            coders.StrUtf8Coder(),
            validate=False,
            header_processor_fns=(_is_header_line,
                                  self._process_header_lines),
            **kwargs)

        # Records come from this specific file, not from the whole pattern.
        self._text_lines = source.read_records(self._file_name, range_tracker)

Example #2

0

Show file

 def _create_source(self, path, schema):
     """Returns the source object used to read the export at `path`.

     When `self.use_json_exports` is truthy, a `_TextSource` is built with a
     `_JsonToDictCoder(schema)` coder; otherwise an avro source is created
     with `use_fastavro=True` and `schema` is not used.
     """
     if self.use_json_exports:
         return _TextSource(
             path,
             min_bundle_size=0,
             compression_type=CompressionTypes.UNCOMPRESSED,
             strip_trailing_newlines=True,
             coder=_JsonToDictCoder(schema))
     return _create_avro_source(path, use_fastavro=True)

Example #3

0

Show file

File: vcfio.py Project: sanjaysiddhanti/gcp-variant-transforms

        def __init__(
                self,
                file_name,  # type: str
                range_tracker,  # type: range_trackers.OffsetRangeTracker
                file_pattern,  # type: str
                compression_type,  # type: str
                allow_malformed_records,  # type: bool
                representative_header_lines=None,  # type:  List[str]
                **kwargs  # type: **str
        ):
            # type: (...) -> None
            """Sets up line reading for `file_name` and builds the VCF reader.

            Args:
              file_name: File handed to `read_records()`; also named in the
                error raised for an invalid header.
              range_tracker: Tracker handed to `read_records()`.
              file_pattern: Pattern the underlying `_TextSource` is built
                from.
              compression_type: Compression type forwarded to `_TextSource`.
              allow_malformed_records: Kept on the instance; not used in this
                method.
              representative_header_lines: Optional header lines. When given,
                header lines found in `file_name` are ignored.
              **kwargs: Extra keyword arguments forwarded to `_TextSource`.

            Raises:
              ValueError: If `vcf.Reader` raises `SyntaxError` while being
                constructed.
            """
            # Instance state is initialised before any source is created.
            # NOTE(review): presumably `_store_header_lines` fills
            # `_header_lines` -- confirm against its definition.
            self._header_lines = []
            self._last_record = None
            self._representative_header_lines = representative_header_lines
            self._file_name = file_name
            self._allow_malformed_records = allow_malformed_records

            def _is_header_line(line):
                # Only '#'-prefixed lines are treated as header lines here.
                return line.startswith('#')

            min_bundle_size = 0
            strip_trailing_newlines = True
            source = textio._TextSource(
                file_pattern,
                min_bundle_size,
                compression_type,
                strip_trailing_newlines,
                coders.StrUtf8Coder(),
                validate=False,
                header_processor_fns=(_is_header_line,
                                      self._store_header_lines),
                **kwargs)

            self._text_lines = source.read_records(self._file_name,
                                                   range_tracker)
            try:
                reader = vcf.Reader(fsock=self._create_generator())
            except SyntaxError as e:
                # Surface header problems as ValueError tagged with the file.
                raise ValueError('Invalid VCF header in %s: %s' %
                                 (self._file_name, str(e)))
            self._vcf_reader = reader

Example #4

0

Show file

    def __init__(
            self,
            file_name,  # type: str
            range_tracker,  # type: range_trackers.OffsetRangeTracker
            file_pattern,  # type: str
            compression_type,  # type: str
            allow_malformed_records,  # type: bool
            representative_header_lines=None,  # type:  List[str]
            splittable_bgzf=False,  # type: bool
            pre_infer_headers=False,  # type: bool
            sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
            use_1_based_coordinate=False,  # type: bool
            **kwargs  # type: **str
    ):
        # type: (...) -> None
        """Stores the reader settings and starts reading lines of `file_name`.

        The line source is chosen as follows:
          * `splittable_bgzf` truthy -> `bgzf.BGZFBlockSource`;
          * otherwise, GZIP `compression_type` -> `bgzf.BGZFSource`;
          * otherwise -> `textio._TextSource`.

        Args:
          file_name: File handed to `read_records()`; also the first argument
            of `BGZFBlockSource`.
          range_tracker: Tracker handed to `read_records()` (and to
            `BGZFBlockSource`).
          file_pattern: Pattern used to build `BGZFSource` / `_TextSource`.
          compression_type: Compression type; forwarded to the chosen source
            and compared against `CompressionTypes.GZIP`.
          allow_malformed_records: Kept on the instance; not used in this
            method.
          representative_header_lines: Optional header lines. When given,
            header lines found in `file_name` are ignored; refer to the
            _process_header_lines() logic.
          splittable_bgzf: Selects the `BGZFBlockSource` branch when truthy.
          pre_infer_headers: Kept on the instance; not used in this method.
          sample_name_encoding: Kept on the instance; not used in this method.
          use_1_based_coordinate: Kept on the instance; not used in this
            method.
          **kwargs: Extra keyword arguments forwarded to the chosen source.
        """
        self._file_name = file_name
        self._representative_header_lines = representative_header_lines
        self._allow_malformed_records = allow_malformed_records
        self._pre_infer_headers = pre_infer_headers
        self._sample_name_encoding = sample_name_encoding
        self._use_1_based_coordinate = use_1_based_coordinate

        def _is_header_line(line):
            # Blank (whitespace-only) lines and '#'-prefixed lines both count
            # as header lines.
            if not line.strip():
                return True
            return line.startswith('#')

        # Every source variant receives the same (predicate, handler) pair.
        header_fns = (_is_header_line, self._process_header_lines)

        if splittable_bgzf:
            source = bgzf.BGZFBlockSource(
                file_name,
                range_tracker,
                representative_header_lines,
                compression_type,
                header_processor_fns=header_fns,
                **kwargs)
        else:
            # Both remaining sources are constructed with identical
            # arguments; only the class differs.
            if compression_type == filesystems.CompressionTypes.GZIP:
                source_cls = bgzf.BGZFSource
            else:
                source_cls = textio._TextSource
            min_bundle_size = 0
            strip_trailing_newlines = True
            source = source_cls(
                file_pattern,
                min_bundle_size,
                compression_type,
                strip_trailing_newlines,
                coders.StrUtf8Coder(),
                validate=False,
                header_processor_fns=header_fns,
                **kwargs)

        self._text_lines = source.read_records(self._file_name, range_tracker)