Example #1
0
    def __init__(self,
                 min_bundle_size=0,
                 desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
                 use_fastavro=_use_fastavro(),
                 label='ReadAllFiles'):
        """Initializes ``ReadAllFromAvro``.

    Args:
      min_bundle_size: the minimum size in bytes, to be considered when
        splitting the input into bundles.
      desired_bundle_size: the desired size in bytes, to be considered when
        splitting the input into bundles.
      use_fastavro: when True, the ``fastavro`` library is used for reading.
      label: label applied to the underlying read transform.
    """
        # avro-python3 has a known Python 3 issue (BEAM-6522); nudge users
        # toward fastavro when they opt out of it on Python 3.
        if not use_fastavro and sys.version_info[0] >= 3:
            warnings.warn(
                "Due to a known issue in avro-python3 package, it is "
                "recommended to use fastavro with Beam Avro IO on "
                "Python 3 until BEAM-6522 is addressed.")
        # Factory that builds an Avro source for each matched file.
        make_source = partial(_create_avro_source,
                              min_bundle_size=min_bundle_size,
                              use_fastavro=use_fastavro)
        self._read_all_files = filebasedsource.ReadAllFiles(
            True, CompressionTypes.AUTO, desired_bundle_size, min_bundle_size,
            make_source)

        self.label = label
Example #2
0
    def __init__(self,
                 min_bundle_size=0,
                 desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
                 use_fastavro=True,
                 with_filename=False,
                 label='ReadAllFiles'):
        """Initializes ``ReadAllFromAvro``.

    Args:
      min_bundle_size: the minimum size in bytes, to be considered when
                       splitting the input into bundles.
      desired_bundle_size: the desired size in bytes, to be considered when
                       splitting the input into bundles.
      use_fastavro (bool): when set, use the `fastavro` library for IO, which
        is significantly faster, and is now the default.
      with_filename: If True, returns a Key Value with the key being the file
        name and the value being the actual data. If False, it only returns
        the data.
      label: label applied to the underlying read transform.
    """
        # Factory producing an Avro source per matched file path.
        source_from_file = partial(_create_avro_source,
                                   min_bundle_size=min_bundle_size,
                                   use_fastavro=use_fastavro)
        self._read_all_files = filebasedsource.ReadAllFiles(
            True, CompressionTypes.AUTO, desired_bundle_size, min_bundle_size,
            source_from_file, with_filename)

        self.label = label
Example #3
0
    def __init__(
            self,
            representative_header_lines=None,  # type: List[str]
            desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,  # type: int
            compression_type=CompressionTypes.AUTO,  # type: str
            allow_malformed_records=False,  # type: bool
            pre_infer_headers=False,  # type: bool
            sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
            use_1_based_coordinate=False,  # type: bool
            move_hom_ref_calls=False,  # type: bool
            **kwargs  # type: **str
    ):
        # type: (...) -> None
        """Initialize the :class:`ReadAllFromVcf` transform.

    Args:
      representative_header_lines: Header definitions used when parsing VCF
        files; when supplied, the headers found in the files themselves are
        ignored.
      desired_bundle_size: Desired size of the bundles produced when splitting
        this source. See
        :class:`~apache_beam.io.filebasedsource.FileBasedSource` for details.
      compression_type: How compressed inputs are handled. Typically
        :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`, which detects the
        compression from the file path's extension.
      allow_malformed_records: When true, malformed VCF records are emitted as
        :class:`MalformedVcfRecord` rather than failing the pipeline.
      pre_infer_headers: When true, headers are dropped so PySam returns the
        exact variant/call data without type matching.
      sample_name_encoding: Controls how sample names are encoded, mainly to
        disambiguate the same sample name appearing in multiple VCF files.
      use_1_based_coordinate: Whether coordinates are stored in BQ as 1-based
        inclusive instead of the default 0-based exclusive.
      move_hom_ref_calls: When true, calls with 0 GT are filtered out of the
        call list and their names added to a hom_ref_calls column.
    """
        super().__init__(**kwargs)
        # Factory that builds a VCF source for each matched file.
        make_vcf_source = partial(
            _create_vcf_source,
            representative_header_lines=representative_header_lines,
            compression_type=compression_type,
            allow_malformed_records=allow_malformed_records,
            pre_infer_headers=pre_infer_headers,
            sample_name_encoding=sample_name_encoding,
            use_1_based_coordinate=use_1_based_coordinate,
            move_hom_ref_calls=move_hom_ref_calls)
        self._read_all_files = filebasedsource.ReadAllFiles(
            True,  # splittable
            CompressionTypes.AUTO,
            desired_bundle_size,
            0,  # min_bundle_size
            make_vcf_source)
Example #4
0
  def __init__(self, min_bundle_size=0,
               desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE):
    """Initializes ``ReadAllFromAvro``.

    Args:
      min_bundle_size: the minimum size in bytes, to be considered when
        splitting the input into bundles.
      desired_bundle_size: the desired size in bytes, to be considered when
        splitting the input into bundles.
    """
    # Factory producing an Avro source for each matched file.
    make_source = partial(_create_avro_source, min_bundle_size=min_bundle_size)
    self._read_all_files = filebasedsource.ReadAllFiles(
        True, CompressionTypes.AUTO, desired_bundle_size, min_bundle_size,
        make_source)