def setUp(self):
    super(MinMaxScalerTest, self).setUp()
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0.0, 3.0]),),
            (Vectors.dense([2.1, 0.0]),),
            (Vectors.dense([4.1, 5.1]),),
            (Vectors.dense([6.1, 8.1]),),
            (Vectors.dense([200., 400.]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([150.0, 90.0]),),
            (Vectors.dense([50.0, 40.0]),),
            (Vectors.dense([100.0, 50.0]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    self.expected_data = [
        Vectors.dense(0.25, 0.1),
        Vectors.dense(0.5, 0.125),
        Vectors.dense(0.75, 0.225)
    ]
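# Illustrative sketch (not part of the original snippet): how the fixtures above
# would typically be exercised. The method name is hypothetical; MinMaxScaler,
# fit/transform, execute_and_collect and get_output_col are reused from the
# other snippets in this section, and the default [0, 1] output range plus
# exact float equality are assumptions.
def test_fit_and_transform_sketch(self):
    scaler = MinMaxScaler()  # assumed default output range [0.0, 1.0]
    model = scaler.fit(self.train_data)
    output = model.transform(self.predict_data)[0]
    field_names = output.get_schema().get_field_names()
    results = [row for row in
               self.t_env.to_data_stream(output).execute_and_collect()]
    # compare the scaled vectors (order-insensitive) against the expected data
    predictions = {row[field_names.index(scaler.get_output_col())]
                   for row in results}
    self.assertEqual(predictions, set(self.expected_data))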
def setUp(self):
    super(OneHotEncoderTest, self).setUp()
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (0.0,),
            (1.0,),
            (2.0,),
            (0.0,),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [Types.DOUBLE()])))
    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (0.0,),
            (1.0,),
            (2.0,),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [Types.DOUBLE()])))
    self.expected_data = {
        0.0: Vectors.sparse(2, [0], [1.0]),
        1.0: Vectors.sparse(2, [1], [1.0]),
        2.0: Vectors.sparse(2, [], [])
    }
    self.estimator = OneHotEncoder().set_input_cols('input').set_output_cols('output')
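# Illustrative sketch (not part of the original snippet): a hypothetical test
# method exercising self.estimator as configured above; fit/transform and
# execute_and_collect are reused from the other snippets in this section.
def test_fit_and_transform_sketch(self):
    model = self.estimator.fit(self.train_data)
    output = model.transform(self.predict_data)[0]
    field_names = output.get_schema().get_field_names()
    for row in self.t_env.to_data_stream(output).execute_and_collect():
        # every distinct input value maps to the expected one-hot sparse vector
        original = row[field_names.index('input')]
        encoded = row[field_names.index('output')]
        self.assertEqual(self.expected_data[original], encoded)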
def setUp(self):
    super(StringIndexerTest, self).setUp()
    self.train_table = self.t_env.from_data_stream(
        self.env.from_collection([
            ('a', 1.0),
            ('b', 1.0),
            ('b', 2.0),
            ('c', 0.0),
            ('d', 2.0),
            ('a', 2.0),
            ('b', 2.0),
            ('b', -1.0),
            ('a', -1.0),
            ('c', -1.0),
        ],
            type_info=Types.ROW_NAMED(
                ['input_col1', 'input_col2'],
                [Types.STRING(), Types.DOUBLE()])))
    self.predict_table = self.t_env.from_data_stream(
        self.env.from_collection([
            ('a', 2.0),
            ('b', 1.0),
            ('e', 2.0),
        ],
            type_info=Types.ROW_NAMED(
                ['input_col1', 'input_col2'],
                [Types.STRING(), Types.DOUBLE()])))
    self.expected_alphabetic_asc_predict_data = [
        Row('a', 2.0, 0, 3),
        Row('b', 1.0, 1, 2),
        Row('e', 2.0, 4, 3)
    ]
def test_fewer_distinct_points_than_cluster(self):
    input = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0.0, 0.1]),),
            (Vectors.dense([0.0, 0.1]),),
            (Vectors.dense([0.0, 0.1]),),
        ],
            type_info=Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])))
    kmeans = KMeans().set_k(2)
    model = kmeans.fit(input)
    output = model.transform(input)[0]
    results = [
        result for result in
        self.t_env.to_data_stream(output).execute_and_collect()
    ]
    field_names = output.get_schema().get_field_names()
    actual_groups = group_features_by_prediction(
        results,
        field_names.index(kmeans.features_col),
        field_names.index(kmeans.prediction_col))
    expected_groups = [{DenseVector([0.0, 0.1])}]
    self.assertEqual(actual_groups, expected_groups)
def setUp(self):
    super(StandardScalerTest, self).setUp()
    self.dense_input = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense(-2.5, 9.0, 1.0),),
            (Vectors.dense(1.4, -5.0, 1.0),),
            (Vectors.dense(2.0, -1.0, -2.0),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    self.expected_res_with_mean = [
        Vectors.dense(-2.8, 8.0, 1.0),
        Vectors.dense(1.1, -6.0, 1.0),
        Vectors.dense(1.7, -2.0, -2.0)
    ]
    self.expected_res_with_std = [
        Vectors.dense(-1.0231819, 1.2480754, 0.5773502),
        Vectors.dense(0.5729819, -0.6933752, 0.5773503),
        Vectors.dense(0.8185455, -0.1386750, -1.1547005)
    ]
    self.expected_res_with_mean_and_std = [
        Vectors.dense(-1.1459637, 1.1094004, 0.5773503),
        Vectors.dense(0.45020003, -0.8320503, 0.5773503),
        Vectors.dense(0.69576368, -0.2773501, -1.1547005)
    ]
    self.expected_mean = [0.3, 1.0, 0.0]
    self.expected_std = [2.4433583, 7.2111026, 1.7320508]
def test_json_row_serialization_deserialization_schema(self):
    jvm = get_gateway().jvm
    jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
    ]
    expected_jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
        "\"ids\":[1,2,3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
    ]
    row_schema = Types.ROW_NAMED(
        ["svt", "ops", "ids"],
        [Types.STRING(),
         Types.ROW_NAMED(['id'], [Types.STRING()]),
         Types.PRIMITIVE_ARRAY(Types.INT())])

    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(row_schema).build()
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_schema).build()
    json_row_serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())
    json_row_deserialization_schema._j_deserialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())

    for i in range(len(jsons)):
        j_row = json_row_deserialization_schema._j_deserialization_schema \
            .deserialize(bytes(jsons[i], encoding='utf-8'))
        result = str(json_row_serialization_schema._j_serialization_schema
                     .serialize(j_row), encoding='utf-8')
        self.assertEqual(expected_jsons[i], result)
def setUp(self):
    super(NaiveBayesTest, self).setUp()
    self.env.set_parallelism(1)
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0, 0.]), 11.),
            (Vectors.dense([1, 0]), 10.),
            (Vectors.dense([1, 1.]), 10.),
        ],
            type_info=Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])))
    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0, 1.]),),
            (Vectors.dense([0, 0.]),),
            (Vectors.dense([1, 0]),),
            (Vectors.dense([1, 1.]),),
        ],
            type_info=Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])))
    self.expected_output = {
        Vectors.dense([0, 1.]): 11.,
        Vectors.dense([0, 0.]): 11.,
        Vectors.dense([1, 0.]): 10.,
        Vectors.dense([1, 1.]): 10.,
    }
    self.estimator = NaiveBayes() \
        .set_smoothing(1.0) \
        .set_features_col('features') \
        .set_label_col('label') \
        .set_prediction_col('prediction') \
        .set_model_type('multinomial')  # type: NaiveBayes
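# Illustrative sketch (not part of the original snippet): a hypothetical check
# of the estimator configured above. The features_col/prediction_col attribute
# access mirrors the KMeans test in this section; fit/transform and
# execute_and_collect are reused from the other snippets.
def test_fit_and_predict_sketch(self):
    model = self.estimator.fit(self.train_data)
    output = model.transform(self.predict_data)[0]
    field_names = output.get_schema().get_field_names()
    features_idx = field_names.index(self.estimator.features_col)
    prediction_idx = field_names.index(self.estimator.prediction_col)
    for row in self.t_env.to_data_stream(output).execute_and_collect():
        # each feature vector should be assigned its expected label
        self.assertEqual(self.expected_output[row[features_idx]],
                         row[prediction_idx])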
def setUp(self):
    super(KNNTest, self).setUp()
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([2.0, 3.0]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([200.1, 300.1]), 2.0),
            (Vectors.dense([200.2, 300.2]), 2.0),
            (Vectors.dense([200.3, 300.3]), 2.0),
            (Vectors.dense([200.4, 300.4]), 2.0),
            (Vectors.dense([200.4, 300.4]), 2.0),
            (Vectors.dense([200.6, 300.6]), 2.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.3, 3.2]), 1.0),
            (Vectors.dense([2.3, 3.2]), 1.0),
            (Vectors.dense([2.8, 3.2]), 3.0),
            (Vectors.dense([300., 3.2]), 4.0),
            (Vectors.dense([2.2, 3.2]), 1.0),
            (Vectors.dense([2.4, 3.2]), 5.0),
            (Vectors.dense([2.5, 3.2]), 5.0),
            (Vectors.dense([2.5, 3.2]), 5.0),
            (Vectors.dense([2.1, 3.1]), 1.0)
        ],
            type_info=Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])))
    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([4.0, 4.1]), 5.0),
            (Vectors.dense([300, 42]), 2.0),
        ],
            type_info=Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])))
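# Illustrative sketch (not part of the original snippet): a hypothetical test
# for the fixtures above. KNN().set_k(4) and the prediction_col attribute are
# assumptions (only KMeans shows these calls in this section); fit/transform
# and execute_and_collect are reused from the other snippets. The predict
# table carries the expected label in its 'label' column, so the prediction
# is compared against it.
def test_fit_and_predict_sketch(self):
    knn = KNN().set_k(4)  # assumed estimator and setter, k chosen for illustration
    model = knn.fit(self.train_data)
    output = model.transform(self.predict_data)[0]
    field_names = output.get_schema().get_field_names()
    for row in self.t_env.to_data_stream(output).execute_and_collect():
        # the predicted label should match the expected label shipped with the row
        self.assertEqual(row[field_names.index('label')],
                         row[field_names.index(knn.prediction_col)])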
def test_max_value_equals_min_value_but_predict_value_not_equals(self):
    train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([40.0, 80.0]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([30.0, 50.0]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    min_max_scaler = MinMaxScaler() \
        .set_min(0.0) \
        .set_max(10.0)
    model = min_max_scaler.fit(train_data)
    result = model.transform(predict_data)[0]
    self.verify_output_result(
        result,
        min_max_scaler.get_output_col(),
        result.get_schema().get_field_names(),
        [Vectors.dense(5.0, 5.0)])
import json

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment


def basic_operations():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(
        collection=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        type_info=Types.ROW_NAMED(["id", "info"], [Types.INT(), Types.STRING()]))

    # map: increment every "tel" field by one
    def update_tel(data):
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter: keep only the record with id == 1, then apply the same map
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by: sum the "tel" values per country
    show(ds.map(lambda data: (json.loads(data.info)['addr']['country'],
                              json.loads(data.info)['tel']))
           .key_by(lambda data: data[0]).sum(1), env)
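# The `show` helper called in basic_operations() is not defined in this
# snippet. A minimal sketch of what it is assumed to do: attach a print sink
# and execute the job. It would normally be defined before basic_operations().
def show(ds, env):
    # print the elements of the data stream and run the pipeline
    ds.print()
    env.execute()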
def setUp(self):
    super(LogisticRegressionTest, self).setUp()
    self.binomial_data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([1, 2, 3, 4]), 0., 1.),
            (Vectors.dense([2, 2, 3, 4]), 0., 2.),
            (Vectors.dense([3, 2, 3, 4]), 0., 3.),
            (Vectors.dense([4, 2, 3, 4]), 0., 4.),
            (Vectors.dense([5, 2, 3, 4]), 0., 5.),
            (Vectors.dense([11, 2, 3, 4]), 1., 1.),
            (Vectors.dense([12, 2, 3, 4]), 1., 2.),
            (Vectors.dense([13, 2, 3, 4]), 1., 3.),
            (Vectors.dense([14, 2, 3, 4]), 1., 4.),
            (Vectors.dense([15, 2, 3, 4]), 1., 5.),
        ],
            type_info=Types.ROW_NAMED(
                ['features', 'label', 'weight'],
                [DenseVectorTypeInfo(), Types.DOUBLE(), Types.DOUBLE()])))
def setUp(self):
    super(VectorAssemblerTest, self).setUp()
    self.input_data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (0, Vectors.dense(2.1, 3.1), 1.0, Vectors.sparse(5, [3], [1.0])),
            (1, Vectors.dense(2.1, 3.1), 1.0,
             Vectors.sparse(5, [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0])),
            (2, None, None, None),
        ],
            type_info=Types.ROW_NAMED(
                ['id', 'vec', 'num', 'sparse_vec'],
                [Types.INT(), DenseVectorTypeInfo(), Types.DOUBLE(),
                 SparseVectorTypeInfo()])))
    self.expected_output_data_1 = Vectors.sparse(
        8, [0, 1, 2, 6], [2.1, 3.1, 1.0, 1.0])
    self.expected_output_data_2 = Vectors.dense(
        2.1, 3.1, 1.0, 0.0, 1.0, 2.0, 3.0, 4.0)
def setUp(self):
    super(KMeansTest, self).setUp()
    self.data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0.0, 0.0]),),
            (Vectors.dense([0.0, 0.3]),),
            (Vectors.dense([0.3, 3.0]),),
            (Vectors.dense([9.0, 0.0]),),
            (Vectors.dense([9.0, 0.6]),),
            (Vectors.dense([9.6, 0.0]),),
        ],
            type_info=Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])))
    self.expected_groups = [
        {DenseVector([0.0, 0.3]), DenseVector([0.3, 3.0]),
         DenseVector([0.0, 0.0])},
        {DenseVector([9.6, 0.0]), DenseVector([9.0, 0.0]),
         DenseVector([9.0, 0.6])}]
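# Illustrative sketch (not part of the original snippet): a hypothetical test
# method using self.data_table and self.expected_groups; it mirrors the calls
# in the "fewer distinct points than clusters" test above, and the method name
# is an assumption.
def test_fit_and_predict_sketch(self):
    kmeans = KMeans().set_k(2)
    model = kmeans.fit(self.data_table)
    output = model.transform(self.data_table)[0]
    field_names = output.get_schema().get_field_names()
    results = [row for row in
               self.t_env.to_data_stream(output).execute_and_collect()]
    actual_groups = group_features_by_prediction(
        results,
        field_names.index(kmeans.features_col),
        field_names.index(kmeans.prediction_col))
    # the six points should split into the two clusters listed in setUp,
    # in either order
    self.assertTrue(actual_groups == self.expected_groups
                    or actual_groups == list(reversed(self.expected_groups)))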
def setUp(self):
    super(BinaryClassificationEvaluatorTest, self).setUp()
    self.input_data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.2, 0.8)),
            (1.0, Vectors.dense(0.3, 0.7)),
            (0.0, Vectors.dense(0.25, 0.75)),
            (0.0, Vectors.dense(0.4, 0.6)),
            (1.0, Vectors.dense(0.35, 0.65)),
            (1.0, Vectors.dense(0.45, 0.55)),
            (0.0, Vectors.dense(0.6, 0.4)),
            (0.0, Vectors.dense(0.7, 0.3)),
            (1.0, Vectors.dense(0.65, 0.35)),
            (0.0, Vectors.dense(0.8, 0.2)),
            (1.0, Vectors.dense(0.9, 0.1))
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction'],
                [Types.DOUBLE(), DenseVectorTypeInfo()])))
    self.input_data_table_score = self.t_env.from_data_stream(
        self.env.from_collection([
            (1, 0.9), (1, 0.8), (1, 0.7),
            (0, 0.75), (0, 0.6), (1, 0.65),
            (1, 0.55), (0, 0.4), (0, 0.3),
            (1, 0.35), (0, 0.2), (1, 0.1)
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction'],
                [Types.INT(), Types.DOUBLE()])))
    self.input_data_table_with_multi_score = self.t_env.from_data_stream(
        self.env.from_collection([
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (0.0, Vectors.dense(0.25, 0.75)),
            (0.0, Vectors.dense(0.4, 0.6)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (0.0, Vectors.dense(0.6, 0.4)),
            (0.0, Vectors.dense(0.7, 0.3)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (0.0, Vectors.dense(0.8, 0.2)),
            (1.0, Vectors.dense(0.9, 0.1))
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction'],
                [Types.DOUBLE(), DenseVectorTypeInfo()])))
    self.input_data_table_with_weight = self.t_env.from_data_stream(
        self.env.from_collection([
            (1.0, Vectors.dense(0.1, 0.9), 0.8),
            (1.0, Vectors.dense(0.1, 0.9), 0.7),
            (1.0, Vectors.dense(0.1, 0.9), 0.5),
            (0.0, Vectors.dense(0.25, 0.75), 1.2),
            (0.0, Vectors.dense(0.4, 0.6), 1.3),
            (1.0, Vectors.dense(0.1, 0.9), 1.5),
            (1.0, Vectors.dense(0.1, 0.9), 1.4),
            (0.0, Vectors.dense(0.6, 0.4), 0.3),
            (0.0, Vectors.dense(0.7, 0.3), 0.5),
            (1.0, Vectors.dense(0.1, 0.9), 1.9),
            (0.0, Vectors.dense(0.8, 0.2), 1.2),
            (1.0, Vectors.dense(0.9, 0.1), 1.0)
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction', 'weight'],
                [Types.DOUBLE(), DenseVectorTypeInfo(), Types.DOUBLE()])))
    self.expected_data = [
        0.7691481137909708, 0.3714285714285714, 0.6571428571428571]
    self.expected_data_m = [
        0.8571428571428571, 0.9377705627705628,
        0.8571428571428571, 0.6488095238095237]
    self.expected_data_w = 0.8911680911680911
    self.eps = 1e-5