def from_source(self,
                source: Source,
                watermark_strategy: WatermarkStrategy,
                source_name: str,
                type_info: TypeInformation = None) -> 'DataStream':
    """
    Adds a data :class:`~pyflink.datastream.connectors.Source` to the environment to get a
    :class:`~pyflink.datastream.DataStream`.

    The result will be either a bounded data stream (that can be processed in a batch way) or
    an unbounded data stream (that must be processed in a streaming way), based on the
    boundedness property of the source.

    This method takes an explicit type information for the produced data stream, so that
    callers can define directly what type/serializer will be used for the produced stream. For
    sources that describe their produced type, the parameter type_info should not be specified
    to avoid specifying the produced type redundantly.

    :param source: The source to add; its Java counterpart is obtained through
                   ``get_java_function()``.
    :param watermark_strategy: The watermark strategy handed to the Java environment together
                               with the source.
    :param source_name: The name passed to the Java environment for this source.
    :param type_info: The type information of the produced stream. Optional; when it is None,
                      no explicit type information is passed to the Java environment.
    :return: The DataStream wrapping the Java data stream returned by ``fromSource``.

    .. versionadded:: 1.13.0
    """
    # Compare against None explicitly instead of relying on truthiness, so that a type
    # information object which happens to evaluate as falsy is still honoured.
    j_type_info = type_info.get_java_type_info() if type_info is not None else None

    j_data_stream = self._j_stream_execution_environment.fromSource(
        source.get_java_function(),
        watermark_strategy._j_watermark_strategy,
        source_name,
        j_type_info)
    return DataStream(j_data_stream=j_data_stream)
def key_by(self,
           key_selector: Union[Callable, KeySelector],
           key_type_info: TypeInformation = None) -> 'KeyedStream':
    """
    Creates a KeyedStream that partitions its operator states by the extracted key.

    A plain callable is wrapped into a KeySelectorFunctionWrapper first. The stream is then
    mapped to ``(key, element)`` pairs typed as a ROW of the key type and the current output
    type, and ``keyBy`` is invoked on the underlying Java stream of that mapped stream.

    :param key_selector: The KeySelector (or callable) used to extract the key of an element.
    :param key_type_info: The type information describing the key type. When it is None,
                          ``Types.PICKLED_BYTE_ARRAY()`` is used.
    :return: The DataStream with partitioned state (i.e. KeyedStream).
    :raises TypeError: If key_selector is neither callable nor a KeySelector.
    """
    selector = key_selector
    if callable(selector):
        selector = KeySelectorFunctionWrapper(selector)
    if not isinstance(selector, (KeySelector, KeySelectorFunctionWrapper)):
        raise TypeError("Parameter key_selector should be a type of KeySelector.")

    jvm = get_gateway().jvm
    j_key_selector_class = \
        jvm.org.apache.flink.datastream.runtime.functions.python.PickledKeySelector

    # Type of the elements currently flowing through this stream, as reported by Java.
    element_type_info = typeinfo._from_java_type(
        self._j_data_stream.getTransformation().getOutputType())

    # Without an explicit key type the key is declared as a pickled byte array, and the
    # Java key selector is told so through its constructor flag.
    key_is_pickled = key_type_info is None
    key_type = Types.PICKLED_BYTE_ARRAY() if key_is_pickled else key_type_info

    keyed_rows = self.map(lambda element: (selector.get_key(element), element),
                          type_info=Types.ROW([key_type, element_type_info]))
    keyed_rows.name(
        jvm.org.apache.flink.python.util.PythonConfigUtil.STREAM_KEY_BY_MAP_OPERATOR_NAME)

    j_keyed_stream = keyed_rows._j_data_stream.keyBy(
        j_key_selector_class(key_is_pickled), key_type.get_java_type_info())
    keyed_stream = KeyedStream(j_keyed_stream, self)
    # Remember the element type from before the (key, element) mapping was applied.
    keyed_stream._original_data_type_info = element_type_info
    return keyed_stream
def from_type_info(type_info: TypeInformation) -> FieldCoder:
    """
    Maps a TypeInformation to the matching FieldCoder.

    Composite type informations (arrays, lists, maps, tuples, rows, external types) are
    resolved recursively through their element/field type informations.

    :param type_info: The type information to find a coder for.
    :return: The coder corresponding to the given type information.
    :raises ValueError: If the type information is not one of the supported kinds.
    """
    if isinstance(type_info, PickledBytesTypeInfo):
        return PickleCoder()

    if isinstance(type_info, BasicTypeInfo):
        return _basic_type_info_mappings[type_info._basic_type]

    if isinstance(type_info, DateTypeInfo):
        return DateCoder()

    if isinstance(type_info, TimeTypeInfo):
        return TimeCoder()

    if isinstance(type_info, TimestampTypeInfo):
        return TimestampCoder(3)

    if isinstance(type_info, PrimitiveArrayTypeInfo):
        elem_info = type_info._element_type
        # A primitive array of BYTE elements gets the dedicated binary coder.
        holds_bytes = (isinstance(elem_info, BasicTypeInfo)
                       and elem_info._basic_type == BasicType.BYTE)
        if holds_bytes:
            return BinaryCoder()
        return PrimitiveArrayCoder(from_type_info(elem_info))

    if isinstance(type_info, (BasicArrayTypeInfo, ObjectArrayTypeInfo)):
        return GenericArrayCoder(from_type_info(type_info._element_type))

    if isinstance(type_info, ListTypeInfo):
        return GenericArrayCoder(from_type_info(type_info.elem_type))

    if isinstance(type_info, MapTypeInfo):
        key_coder = from_type_info(type_info._key_type_info)
        value_coder = from_type_info(type_info._value_type_info)
        return MapCoder(key_coder, value_coder)

    if isinstance(type_info, TupleTypeInfo):
        return TupleCoder(list(map(from_type_info, type_info.get_field_types())))

    if isinstance(type_info, RowTypeInfo):
        field_coders = list(map(from_type_info, type_info.get_field_types()))
        field_names = list(type_info.get_field_names())
        return RowCoder(field_coders, field_names)

    if isinstance(type_info, ExternalTypeInfo):
        # Delegate to the type information wrapped by the external type.
        return from_type_info(type_info._type_info)

    if isinstance(type_info, GenericRecordAvroTypeInfo):
        return AvroCoder(type_info._schema)

    raise ValueError("Unsupported type_info %s." % type_info)
def __init__(self, type_info: TypeInformation):
    """
    Creates the underlying Java ``CsvRowSerializationSchema.Builder``.

    :param type_info: The type information handed to the Java builder; it must be a
                      WrapperTypeInfo.
    :raises TypeError: If type_info is None.
    :raises ValueError: If type_info is not a WrapperTypeInfo.
    """
    if type_info is None:
        raise TypeError("Type information must not be None")
    if not isinstance(type_info, WrapperTypeInfo):
        raise ValueError('type_info must be WrapperTypeInfo')

    j_csv_package = get_gateway().jvm.org.apache.flink.formats.csv
    j_builder_class = j_csv_package.CsvRowSerializationSchema.Builder
    self._j_builder = j_builder_class(type_info.get_java_type_info())
def from_collection(self,
                    collection: List[Any],
                    type_info: TypeInformation = None) -> DataStream:
    """
    Creates a data stream from the given non-empty collection. The type of the data stream is
    that of the elements in the collection.

    Note that this operation will result in a non-parallel data stream source, i.e. a data
    stream source with parallelism one.

    When a type information is given, every element is converted with its
    ``to_internal_type`` before the collection is handed on.

    :param collection: The collection of elements to create the data stream from.
    :param type_info: The TypeInformation for the produced data stream.
    :return: The data stream representing the given collection.
    """
    if type_info is None:
        # No type information: the elements are forwarded untouched.
        return self._from_collection(collection, type_info)

    internal_elements = list(map(type_info.to_internal_type, collection))
    return self._from_collection(internal_elements, type_info)
def add_source(self,
               source_func: SourceFunction,
               source_name: str = 'Custom Source',
               type_info: TypeInformation = None) -> 'DataStream':
    """
    Registers a data source with the streaming topology.

    :param source_func: The user defined source function.
    :param source_name: The name of the data source. Optional.
    :param type_info: The type of the returned stream. Optional; when it is None, no type
                      information is passed to the Java environment.
    :return: The data stream constructed from the source.
    """
    if type_info is None:
        j_type_info = None
    else:
        j_type_info = type_info.get_java_type_info()

    j_env = self._j_stream_execution_environment
    j_stream = j_env.addSource(source_func.get_java_function(), source_name, j_type_info)
    return DataStream(j_data_stream=j_stream)
def __init__(self, type_info: TypeInformation):
    """
    Creates the underlying Java ``CsvRowSerializationSchema.Builder``.

    :param type_info: The type information whose Java counterpart is handed to the Java
                      builder.
    :raises TypeError: If type_info is None.
    """
    if type_info is None:
        raise TypeError("Type information must not be None")

    j_csv_package = get_gateway().jvm.org.apache.flink.formats.csv
    j_builder_class = j_csv_package.CsvRowSerializationSchema.Builder
    self._j_builder = j_builder_class(type_info.get_java_type_info())