def max_travellers_per_destination():
    """Print per-destination-zone counts over one-hour tumbling windows.

    Registers the table through ``create_table_ddl`` using ``dropOffTime``
    as the event-time attribute (watermark lagging by 30 seconds), groups
    the ``TaxiRide`` table by ``destLocationZone`` and a 1-hour tumbling
    window, and prints one row per group containing the zone, the window
    start, the window end and the ``count`` aggregate of ``passengerCount``.

    Note: the aggregate applied is ``count``, not a sum or a maximum.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    watermark = "WATERMARK FOR dropOffTime AS dropOffTime - INTERVAL '30' SECONDS"
    t_env.execute_sql(create_table_ddl(watermark))

    taxi_ride = t_env.from_path('TaxiRide')

    # Project only the needed columns before windowing.
    projected = taxi_ride.select(
        taxi_ride.passengerCount,
        taxi_ride.dropOffTime,
        taxi_ride.destLocationZone)
    hourly = Tumble().over('1.hour').on(taxi_ride.dropOffTime).alias('w')
    grouped = projected.window(hourly).group_by(taxi_ride.destLocationZone, col('w'))
    no_of_travelers_per_dest = grouped.select(
        taxi_ride.destLocationZone,
        col('w').start.alias('start'),
        col('w').end.alias('end'),
        taxi_ride.passengerCount.count.alias('cnt'))

    output_type = Types.ROW_NAMED(
        ['destLocationZone', 'start', 'end', 'cnt'],
        [Types.STRING(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(), Types.LONG()])
    t_env.to_append_stream(no_of_travelers_per_dest, output_type).print()

    env.execute('Max-Travellers-Per-Destination')
def popular_taxi_vendor():
    """Print per-vendor ride counts over sliding windows.

    Runs with parallelism 1. Registers the table through
    ``create_table_ddl`` using ``pickupTime`` as the event-time attribute
    (watermark lagging by 30 seconds), then groups the ``TaxiRide`` table
    by ``vendorId`` and a 15-minute window that slides every 5 minutes.
    Each printed row holds the vendor id, the window start, the window end
    and the ``count`` aggregate of ``vendorId``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    watermark = "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"
    t_env.execute_sql(create_table_ddl(watermark))

    taxi_ride = t_env.from_path('TaxiRide')

    sliding = Slide.over('15.minutes').every('5.minutes').on(taxi_ride.pickupTime).alias('w')
    projected = taxi_ride.select(taxi_ride.vendorId, taxi_ride.pickupTime)
    grouped = projected.window(sliding).group_by(taxi_ride.vendorId, col('w'))
    popular_rides = grouped.select(
        taxi_ride.vendorId,
        col('w').start.alias('start'),
        col('w').end.alias('end'),
        taxi_ride.vendorId.count.alias('cnt'))

    output_type = Types.ROW_NAMED(
        ['vendorId', 'start', 'end', 'cnt'],
        [Types.INT(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(), Types.LONG()])
    t_env.to_append_stream(popular_rides, output_type).print()

    env.execute('Popular-Taxi-Vendor')
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    """Build the ORC fixture: a table row type, its type info, and one row.

    :return: ``(row_type, row_type_info, data)`` where ``row_type`` is the
        ``DataTypes.ROW`` schema, ``row_type_info`` is the ``Types.ROW_NAMED``
        description of the same ten fields, and ``data`` is a single-element
        list holding one ``Row`` with a value for every field.
    """
    field_names = ['char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint', 'double',
                   'date', 'timestamp']

    # Table-API data types, positionally aligned with field_names.
    table_types = [
        DataTypes.CHAR(10),
        DataTypes.VARCHAR(10),
        DataTypes.BYTES(),
        DataTypes.BOOLEAN(),
        DataTypes.DECIMAL(2, 0),
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.DATE().bridged_to('java.sql.Date'),
        DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp'),
    ]
    row_type = DataTypes.ROW([
        DataTypes.FIELD(name, data_type)
        for name, data_type in zip(field_names, table_types)
    ])

    # Type infos for the same fields, in the same order.
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint', 'double',
         'date', 'timestamp'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(),
         Types.BIG_DEC(), Types.INT(), Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(),
         Types.SQL_TIMESTAMP()]
    )

    sample = {
        'char': 'char',
        'varchar': 'varchar',
        'bytes': b'varbinary',
        'boolean': True,
        'decimal': Decimal(1.5),
        'int': 2147483647,
        'bigint': -9223372036854775808,
        'double': 2e-308,
        'date': date(1970, 1, 1),
        'timestamp': datetime(1970, 1, 2, 3, 4, 5, 600000),
    }
    data = [Row(**sample)]
    return row_type, row_type_info, data
def test_sql_timestamp_type_info(self):
    """A (date, time, timestamp) row passes through an identity map intact."""

    def sql_time_row():
        # A fresh type info per use, as separate instances are passed to the
        # source and to the map operator.
        return Types.ROW([Types.SQL_DATE(), Types.SQL_TIME(), Types.SQL_TIMESTAMP()])

    record = (datetime.date(2021, 1, 9),
              datetime.time(12, 0, 0),
              datetime.datetime(2021, 1, 9, 12, 0, 0, 11000))
    ds = self.env.from_collection([record], type_info=sql_time_row())

    mapped = ds.map(lambda x: x, output_type=sql_time_row())
    mapped.add_sink(self.test_sink)
    self.env.execute("test sql timestamp type info")

    results = self.test_sink.get_results()
    # 11000 microseconds show up as the ".011" fraction in the sink output.
    expected = ['+I[2021-01-09, 12:00:00, 2021-01-09 12:00:00.011]']
    self.assertEqual(expected, results)
def pickled_bytes_to_python_converter(data, field_type):
    """Convert pickled field data into a Python value according to its type.

    :param data: For a ``RowTypeInfo`` this is an indexable sequence whose
        first element is skipped and whose remaining elements are the
        per-field payloads. For every other type it is a pickled bytes
        object.
    :param field_type: The type information describing ``data``.
    :return: A tuple of converted fields for rows, a list of converted
        elements for basic/primitive arrays, a ``datetime.time`` for
        SQL_TIME, otherwise whatever ``field_type.from_internal_type``
        returns.
    """
    if isinstance(field_type, RowTypeInfo):
        return tuple(
            pickled_bytes_to_python_converter(field_data, sub_type)
            for field_data, sub_type in zip(data[1:], field_type.get_field_types()))

    # NOTE(review): pickle.loads runs arbitrary code on hostile input;
    # presumably these bytes only ever originate from the job itself - confirm.
    data = pickle.loads(data)

    if field_type == Types.SQL_TIME():
        # The unpickled value is split as a microsecond count into h/m/s/us.
        seconds, microseconds = divmod(data, 10**6)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return datetime.time(hours, minutes, seconds, microseconds)
    elif field_type == Types.SQL_DATE():
        return field_type.from_internal_type(data)
    elif field_type == Types.SQL_TIMESTAMP():
        # Keep the arithmetic in integers: int(data.timestamp() * 10**6) goes
        # through a float and truncates, which can drop one microsecond.
        whole_seconds = int(data.replace(microsecond=0).timestamp())
        return field_type.from_internal_type(whole_seconds * 10**6 + data.microsecond)
    elif field_type == Types.FLOAT():
        return field_type.from_internal_type(ast.literal_eval(data))
    elif is_basic_array_type_info(field_type) or is_primitive_array_type_info(field_type):
        element_type = typeinfo._from_java_type(
            field_type.get_java_type_info().getComponentInfo())
        # Every element is pickled on its own, so convert recursively.
        return [pickled_bytes_to_python_converter(element_bytes, element_type)
                for element_bytes in data]
    else:
        return field_type.from_internal_type(data)
def popular_destination_query():
    """Print destinations whose windowed ride count exceeds a threshold.

    Registers the table through ``create_table_ddl`` using ``pickupTime``
    as the event-time attribute (watermark lagging by 30 seconds), then
    runs a SQL query that counts rides per ``destLocationId`` over a
    15-minute HOP window advancing every 5 minutes and keeps only groups
    whose count is greater than ``args.threshold``. ``args`` is read from
    the enclosing scope.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    watermark = "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"
    t_env.execute_sql(create_table_ddl(watermark))

    # Adjacent literals are concatenated into one statement; only the final
    # fragment is an f-string, carrying the threshold.
    query = (
        "SELECT destLocationId, wstart, wend, cnt "
        "FROM (SELECT destLocationId, "
        "HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart, "
        "HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend, "
        "COUNT(destLocationId) AS cnt "
        "FROM (SELECT pickupTime, destLocationId FROM TaxiRide) "
        "GROUP BY destLocationId, "
        "HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) ) "
        f"WHERE cnt > {args.threshold} "
    )
    results = t_env.sql_query(query)

    output_type = Types.ROW_NAMED(
        ['destLocationId', 'wstart', 'wend', 'cnt'],
        [Types.INT(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(), Types.LONG()])
    t_env.to_append_stream(results, output_type).print()

    env.execute('Popular-Destination')
def test_from_collection_with_data_types(self):
    """Check from_collection with an explicit type_info for scalars and rows."""
    # Case 1: a collection of single objects.
    single_ds = self.env.from_collection(['Hi', 'Hello'], type_info=Types.STRING())
    single_ds.add_sink(self.test_sink)
    self.env.execute("test from collection with single object")
    results = self.test_sink.get_results(False)
    expected = ['Hello', 'Hi']
    self.assertEqual(sorted(expected), sorted(results))

    # Case 2: a collection of multi-field objects given as tuples.
    first_row = (
        1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
        bytearray(b'flink'), 'pyflink',
        datetime.date(2014, 9, 13),
        datetime.time(hour=12, minute=0, second=0, microsecond=123000),
        datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
        [1, 2, 3],
        decimal.Decimal('1000000000000000000.05'),
        decimal.Decimal('1000000000000000000.0599999999999'
                        '9999899999999999'),
    )
    second_row = (
        2, None, 2, True, 43878, 9147483648, 9.87, 2.98936,
        bytearray(b'flink'), 'pyflink',
        datetime.date(2015, 10, 14),
        datetime.time(hour=11, minute=2, second=2, microsecond=234500),
        datetime.datetime(2020, 4, 15, 8, 2, 6, 235000),
        [2, 4, 6],
        decimal.Decimal('2000000000000000000.74'),
        decimal.Decimal('2000000000000000000.061111111111111'
                        '11111111111111'),
    )
    row_type_info = Types.ROW(
        [Types.LONG(), Types.LONG(), Types.SHORT(), Types.BOOLEAN(), Types.SHORT(),
         Types.INT(), Types.FLOAT(), Types.DOUBLE(), Types.PICKLED_BYTE_ARRAY(),
         Types.STRING(), Types.SQL_DATE(), Types.SQL_TIME(), Types.SQL_TIMESTAMP(),
         Types.BASIC_ARRAY(Types.LONG()), Types.BIG_DEC(), Types.BIG_DEC()])
    row_ds = self.env.from_collection([first_row, second_row], type_info=row_type_info)
    row_ds.add_sink(self.test_sink)
    self.env.execute("test from collection with tuple object")
    results = self.test_sink.get_results(False)

    # With user-specified data types the collected result is in row format.
    # In the second row 43878 (SHORT) and 9147483648 (INT) are out of range
    # and are expected back wrapped as -21658 and 557549056.
    expected = [
        '+I[1, null, 1, true, 32767, -2147483648, 1.23, 1.98932, [102, 108, 105, 110, 107], '
        'pyflink, 2014-09-13, 12:00:00, 2018-03-11 03:00:00.123, [1, 2, 3], '
        '1000000000000000000.05, 1000000000000000000.05999999999999999899999999999]',
        '+I[2, null, 2, true, -21658, 557549056, 9.87, 2.98936, [102, 108, 105, 110, 107], '
        'pyflink, 2015-10-14, 11:02:02, 2020-04-15 08:02:06.235, [2, 4, 6], '
        '2000000000000000000.74, 2000000000000000000.06111111111111111111111111111]',
    ]
    self.assertEqual(sorted(expected), sorted(results))
def pickled_bytes_to_python_converter(data, field_type):
    """Convert pickled field data into a Python value according to its type.

    :param data: For a ``RowTypeInfo`` this is an indexable sequence whose
        first element is the row kind encoded as little-endian bytes and
        whose remaining elements are the per-field payloads. For every other
        type it is a pickled bytes object.
    :param field_type: The type information describing ``data``.
    :return: A ``Row`` for row types, a list for array and list types, a
        dict for map types, a ``datetime.time`` for SQL_TIME, otherwise
        whatever ``field_type.from_internal_type`` returns.
    """
    if isinstance(field_type, RowTypeInfo):
        row_kind = RowKind(int.from_bytes(data[0], 'little'))
        fields = [
            pickled_bytes_to_python_converter(field_data, sub_type)
            for field_data, sub_type in zip(data[1:], field_type.get_field_types())]
        return Row.of_kind(row_kind, *fields)

    # NOTE(review): pickle.loads runs arbitrary code on hostile input;
    # presumably these bytes only ever originate from the job itself - confirm.
    data = pickle.loads(data)

    if field_type == Types.SQL_TIME():
        # The unpickled value is split as a microsecond count into h/m/s/us.
        seconds, microseconds = divmod(data, 10**6)
        minutes, seconds = divmod(seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return datetime.time(hours, minutes, seconds, microseconds)
    elif field_type == Types.SQL_DATE():
        return field_type.from_internal_type(data)
    elif field_type == Types.SQL_TIMESTAMP():
        # Keep the arithmetic in integers: int(data.timestamp() * 10**6) goes
        # through a float and truncates, which can drop one microsecond.
        whole_seconds = int(data.replace(microsecond=0).timestamp())
        return field_type.from_internal_type(whole_seconds * 10**6 + data.microsecond)
    elif field_type == Types.FLOAT():
        return field_type.from_internal_type(ast.literal_eval(data))
    elif isinstance(field_type,
                    (BasicArrayTypeInfo, PrimitiveArrayTypeInfo, ObjectArrayTypeInfo)):
        element_type = field_type._element_type
        # Every element is pickled on its own, so convert recursively.
        return [pickled_bytes_to_python_converter(element_bytes, element_type)
                for element_bytes in data]
    elif isinstance(field_type, MapTypeInfo):
        key_type = field_type._key_type_info
        value_type = field_type._value_type_info
        # data[0] and data[1] are paired positionally as keys and values.
        return {
            pickled_bytes_to_python_converter(k, key_type):
                pickled_bytes_to_python_converter(v, value_type)
            for k, v in zip(data[0], data[1])}
    elif isinstance(field_type, ListTypeInfo):
        element_type = field_type.elem_type
        return [pickled_bytes_to_python_converter(element_bytes, element_type)
                for element_bytes in data]
    else:
        return field_type.from_internal_type(data)
def _create_parquet_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    """Build the Parquet fixture: a table row type, its type info, and one row.

    :return: ``(row_type, row_type_info, data)`` where ``row_type`` is the
        ``DataTypes.ROW`` schema, ``row_type_info`` is the ``Types.ROW_NAMED``
        description of the same thirteen fields, and ``data`` is a
        single-element list holding one ``Row`` with a value for every field.
    """
    field_names = ['char', 'varchar', 'binary', 'varbinary', 'boolean', 'decimal', 'int',
                   'bigint', 'double', 'date', 'time', 'timestamp', 'timestamp_ltz']

    # Table-API data types, positionally aligned with field_names.
    table_types = [
        DataTypes.CHAR(10),
        DataTypes.VARCHAR(10),
        DataTypes.BINARY(10),
        DataTypes.VARBINARY(10),
        DataTypes.BOOLEAN(),
        DataTypes.DECIMAL(2, 0),
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.DATE().bridged_to('java.sql.Date'),
        DataTypes.TIME().bridged_to('java.sql.Time'),
        DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp'),
        DataTypes.TIMESTAMP_LTZ(3),
    ]
    row_type = DataTypes.ROW([
        DataTypes.FIELD(name, data_type)
        for name, data_type in zip(field_names, table_types)
    ])

    # Type infos for the same fields, in the same order.
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'binary', 'varbinary', 'boolean', 'decimal', 'int', 'bigint',
         'double', 'date', 'time', 'timestamp', 'timestamp_ltz'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()),
         Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(), Types.BIG_DEC(), Types.INT(),
         Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(), Types.SQL_TIME(),
         Types.SQL_TIMESTAMP(), Types.INSTANT()]
    )

    # Epoch milliseconds for the instant: the UTC seconds of the datetime plus
    # timegm(localtime(0)), scaled to millis, plus the millisecond part.
    utc_moment = datetime.datetime(1970, 2, 3, 4, 5, 6, 700000, tzinfo=pytz.timezone('UTC'))
    utc_seconds = calendar.timegm(utc_moment.utctimetuple())
    local_epoch_seconds = calendar.timegm(time.localtime(0))
    epoch_millis = (utc_seconds + local_epoch_seconds) * 1000 + utc_moment.microsecond // 1000
    timestamp_ltz = Instant.of_epoch_milli(epoch_millis)

    sample = {
        'char': 'char',
        'varchar': 'varchar',
        'binary': b'binary',
        'varbinary': b'varbinary',
        'boolean': True,
        'decimal': Decimal(1.5),
        'int': 2147483647,
        'bigint': -9223372036854775808,
        'double': 2e-308,
        'date': datetime.date(1970, 1, 1),
        'time': datetime.time(1, 1, 1),
        'timestamp': datetime.datetime(1970, 1, 2, 3, 4, 5, 600000),
        'timestamp_ltz': timestamp_ltz,
    }
    data = [Row(**sample)]
    return row_type, row_type_info, data
def test_from_java_type(self):
    """Each type info equals the result of _from_java_type on its Java type info."""
    # Factories rather than instances, so each type info is created right
    # before its own assertion. SQL_DATE is deliberately listed twice.
    type_info_factories = (
        Types.INT,
        Types.SHORT,
        Types.LONG,
        Types.FLOAT,
        Types.DOUBLE,
        Types.CHAR,
        Types.BYTE,
        Types.BIG_INT,
        Types.BIG_DEC,
        Types.SQL_DATE,
        Types.SQL_TIME,
        Types.SQL_TIMESTAMP,
        lambda: Types.ROW([Types.INT(), Types.STRING()]),
        lambda: Types.TUPLE([Types.CHAR(), Types.INT()]),
        lambda: Types.PRIMITIVE_ARRAY(Types.INT()),
        lambda: Types.OBJECT_ARRAY(Types.SQL_DATE()),
        Types.PICKLED_BYTE_ARRAY,
        Types.SQL_DATE,
        lambda: Types.MAP(Types.INT(), Types.STRING()),
        lambda: Types.LIST(Types.INT()),
    )

    for make_type_info in type_info_factories:
        type_info = make_type_info()
        self.assertEqual(type_info, _from_java_type(type_info.get_java_type_info()))