Ejemplo n.º 1
0
    def setUp(self):
        """Prepare the training table, prediction table and expected vectors.

        Both tables are built from in-memory collections and expose a single
        dense-vector column named ``input``.
        """
        super(MinMaxScalerTest, self).setUp()

        def to_table(values):
            # Each vector becomes a one-field row for the 'input' column.
            rows = [(Vectors.dense(value), ) for value in values]
            row_type = Types.ROW_NAMED(['input'], [DenseVectorTypeInfo()])
            stream = self.env.from_collection(rows, type_info=row_type)
            return self.t_env.from_data_stream(stream)

        self.train_data = to_table([
            [0.0, 3.0],
            [2.1, 0.0],
            [4.1, 5.1],
            [6.1, 8.1],
            [200.0, 400.0],
        ])

        self.predict_data = to_table([
            [150.0, 90.0],
            [50.0, 40.0],
            [100.0, 50.0],
        ])

        self.expected_data = [
            Vectors.dense(0.25, 0.1),
            Vectors.dense(0.5, 0.125),
            Vectors.dense(0.75, 0.225),
        ]
Ejemplo n.º 2
0
    def setUp(self):
        """Prepare input tables, the expected encodings and the estimator.

        The tables have a single DOUBLE column named ``input``;
        ``expected_data`` maps each input value to a sparse vector.
        """
        super(OneHotEncoderTest, self).setUp()

        def to_table(values):
            # Wrap every scalar in a one-field row for the 'input' column.
            rows = [(value,) for value in values]
            row_type = Types.ROW_NAMED(['input'], [Types.DOUBLE()])
            stream = self.env.from_collection(rows, type_info=row_type)
            return self.t_env.from_data_stream(stream)

        self.train_data = to_table([0.0, 1.0, 2.0, 0.0])
        self.predict_data = to_table([0.0, 1.0, 2.0])

        self.expected_data = {
            0.0: Vectors.sparse(2, [0], [1.0]),
            1.0: Vectors.sparse(2, [1], [1.0]),
            2.0: Vectors.sparse(2, [], []),
        }

        encoder = OneHotEncoder().set_input_cols('input')
        self.estimator = encoder.set_output_cols('output')
Ejemplo n.º 3
0
    def setUp(self):
        super(StringIndexerTest, self).setUp()
        self.train_table = self.t_env.from_data_stream(
            self.env.from_collection([
                ('a', 1.0),
                ('b', 1.0),
                ('b', 2.0),
                ('c', 0.0),
                ('d', 2.0),
                ('a', 2.0),
                ('b', 2.0),
                ('b', -1.0),
                ('a', -1.0),
                ('c', -1.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['input_col1', 'input_col2'],
                    [Types.STRING(), Types.DOUBLE()])))

        self.predict_table = self.t_env.from_data_stream(
            self.env.from_collection([
                ('a', 2.0),
                ('b', 1.0),
                ('e', 2.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['input_col1', 'input_col2'],
                    [Types.STRING(), Types.DOUBLE()])))

        self.expected_alphabetic_asc_predict_data = [
            Row('a', 2.0, 0, 3),
            Row('b', 1.0, 1, 2),
            Row('e', 2.0, 4, 3)
        ]
Ejemplo n.º 4
0
    def test_fewer_distinct_points_than_cluster(self):
        # Fits KMeans with k=2 on three identical points and asserts that the
        # grouped predictions consist of one group holding that single point.
        # A comment is used instead of a docstring so the test runner's
        # reported description of this test does not change.
        point_rows = [(Vectors.dense([0.0, 0.1]), ) for _ in range(3)]

        # Named `input_table` so the builtin `input` is not shadowed.
        input_table = self.t_env.from_data_stream(
            self.env.from_collection(
                point_rows,
                type_info=Types.ROW_NAMED(['features'],
                                          [DenseVectorTypeInfo()])))

        kmeans = KMeans().set_k(2)
        model = kmeans.fit(input_table)
        output = model.transform(input_table)[0]

        # Drain the collected stream into a list.
        results = list(
            self.t_env.to_data_stream(output).execute_and_collect())

        field_names = output.get_schema().get_field_names()
        actual_groups = group_features_by_prediction(
            results,
            field_names.index(kmeans.features_col),
            field_names.index(kmeans.prediction_col))

        expected_groups = [{DenseVector([0.0, 0.1])}]

        self.assertEqual(actual_groups, expected_groups)
Ejemplo n.º 5
0
    def setUp(self):
        """Prepare the dense input table and the expected reference values.

        The table has a single dense-vector column named ``input``. The
        remaining attributes hold the expected vectors, means and standard
        deviations that the tests compare against.
        """
        super(StandardScalerTest, self).setUp()

        dense_rows = [
            (Vectors.dense(-2.5, 9.0, 1.0), ),
            (Vectors.dense(1.4, -5.0, 1.0), ),
            (Vectors.dense(2.0, -1.0, -2.0), ),
        ]
        row_type = Types.ROW_NAMED(['input'], [DenseVectorTypeInfo()])
        dense_stream = self.env.from_collection(dense_rows, type_info=row_type)
        self.dense_input = self.t_env.from_data_stream(dense_stream)

        self.expected_res_with_mean = [
            Vectors.dense(-2.8, 8.0, 1.0),
            Vectors.dense(1.1, -6.0, 1.0),
            Vectors.dense(1.7, -2.0, -2.0),
        ]

        self.expected_res_with_std = [
            Vectors.dense(-1.0231819, 1.2480754, 0.5773502),
            Vectors.dense(0.5729819, -0.6933752, 0.5773503),
            Vectors.dense(0.8185455, -0.1386750, -1.1547005),
        ]

        self.expected_res_with_mean_and_std = [
            Vectors.dense(-1.1459637, 1.1094004, 0.5773503),
            Vectors.dense(0.45020003, -0.8320503, 0.5773503),
            Vectors.dense(0.69576368, -0.2773501, -1.1547005),
        ]

        self.expected_mean = [0.3, 1.0, 0.0]
        self.expected_std = [2.4433583, 7.2111026, 1.7320508]
Ejemplo n.º 6
0
    def test_json_row_serialization_deserialization_schema(self):
        # Round-trips each input JSON string through the row deserialization
        # schema and back through the row serialization schema, then compares
        # the produced text with the corresponding expected JSON string.
        # A comment is used instead of a docstring so the test runner's
        # reported description of this test does not change.
        jvm = get_gateway().jvm
        jsons = [
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
            "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
        ]
        expected_jsons = [
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
            "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
            "\"ids\":[1,2,3]}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
        ]
        # zip() stops at the shorter list, so guard against a silent
        # truncation if the two fixtures ever get out of sync.
        self.assertEqual(len(jsons), len(expected_jsons))

        row_schema = Types.ROW_NAMED(["svt", "ops", "ids"], [
            Types.STRING(),
            Types.ROW_NAMED(['id'], [Types.STRING()]),
            Types.PRIMITIVE_ARRAY(Types.INT())
        ])

        json_row_serialization_schema = JsonRowSerializationSchema.builder() \
            .with_type_info(row_schema).build()
        json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(row_schema).build()

        # The test drives the wrapped Java schema objects directly.
        j_serializer = json_row_serialization_schema._j_serialization_schema
        j_deserializer = json_row_deserialization_schema._j_deserialization_schema

        # Each schema is opened with its own DummyInitializationContext.
        j_serializer.open(
            jvm.org.apache.flink.connector.testutils.formats.
            DummyInitializationContext())
        j_deserializer.open(
            jvm.org.apache.flink.connector.testutils.formats.
            DummyInitializationContext())

        for json_text, expected_json in zip(jsons, expected_jsons):
            j_row = j_deserializer.deserialize(
                bytes(json_text, encoding='utf-8'))
            result = str(j_serializer.serialize(j_row), encoding='utf-8')
            self.assertEqual(expected_json, result)
Ejemplo n.º 7
0
    def setUp(self):
        """Prepare training/prediction tables, expected labels and estimator.

        Parallelism is set to 1 on the execution environment. The training
        table has ``features`` and ``label`` columns; the prediction table
        has only ``features``. ``expected_output`` maps feature vectors to
        the expected value for each of them.
        """
        super(NaiveBayesTest, self).setUp()
        self.env.set_parallelism(1)

        train_rows = [
            (Vectors.dense([0, 0.]), 11.),
            (Vectors.dense([1, 0]), 10.),
            (Vectors.dense([1, 1.]), 10.),
        ]
        train_type = Types.ROW_NAMED(
            ['features', 'label'],
            [DenseVectorTypeInfo(), Types.DOUBLE()])
        self.train_data = self.t_env.from_data_stream(
            self.env.from_collection(train_rows, type_info=train_type))

        predict_rows = [
            (Vectors.dense([0, 1.]), ),
            (Vectors.dense([0, 0.]), ),
            (Vectors.dense([1, 0]), ),
            (Vectors.dense([1, 1.]), ),
        ]
        predict_type = Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])
        self.predict_data = self.t_env.from_data_stream(
            self.env.from_collection(predict_rows, type_info=predict_type))

        self.expected_output = {
            Vectors.dense([0, 1.]): 11.,
            Vectors.dense([0, 0.]): 11.,
            Vectors.dense([1, 0.]): 10.,
            Vectors.dense([1, 1.]): 10.,
        }

        self.estimator = (
            NaiveBayes()
            .set_smoothing(1.0)
            .set_features_col('features')
            .set_label_col('label')
            .set_prediction_col('prediction')
            .set_model_type('multinomial')
        )  # type: NaiveBayes
Ejemplo n.º 8
0
    def setUp(self):
        """Prepare the labelled training table and the prediction table.

        Both tables have a dense-vector ``features`` column and a DOUBLE
        ``label`` column.
        """
        super(KNNTest, self).setUp()

        def to_table(samples):
            # samples: (feature values, label) pairs, kept in the given order.
            rows = [(Vectors.dense(features), label)
                    for features, label in samples]
            row_type = Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])
            stream = self.env.from_collection(rows, type_info=row_type)
            return self.t_env.from_data_stream(stream)

        self.train_data = to_table([
            ([2.0, 3.0], 1.0),
            ([2.1, 3.1], 1.0),
            ([200.1, 300.1], 2.0),
            ([200.2, 300.2], 2.0),
            ([200.3, 300.3], 2.0),
            ([200.4, 300.4], 2.0),
            ([200.4, 300.4], 2.0),
            ([200.6, 300.6], 2.0),
            ([2.1, 3.1], 1.0),
            ([2.1, 3.1], 1.0),
            ([2.1, 3.1], 1.0),
            ([2.1, 3.1], 1.0),
            ([2.3, 3.2], 1.0),
            ([2.3, 3.2], 1.0),
            ([2.8, 3.2], 3.0),
            ([300., 3.2], 4.0),
            ([2.2, 3.2], 1.0),
            ([2.4, 3.2], 5.0),
            ([2.5, 3.2], 5.0),
            ([2.5, 3.2], 5.0),
            ([2.1, 3.1], 1.0),
        ])

        self.predict_data = to_table([
            ([4.0, 4.1], 5.0),
            ([300, 42], 2.0),
        ])
Ejemplo n.º 9
0
    def test_max_value_equas_min_value_but_predict_value_not_equals(self):
        # Fits a MinMaxScaler (set_min(0.0), set_max(10.0)) on a table holding
        # a single vector, transforms a table holding a different single
        # vector, and verifies the output column against
        # Vectors.dense(5.0, 5.0).
        def single_row_table(values):
            # One-row table with a single dense-vector column named 'input'.
            row_type = Types.ROW_NAMED(['input'], [DenseVectorTypeInfo()])
            stream = self.env.from_collection(
                [(Vectors.dense(values), )], type_info=row_type)
            return self.t_env.from_data_stream(stream)

        train_data = single_row_table([40.0, 80.0])
        predict_data = single_row_table([30.0, 50.0])

        min_max_scalar = MinMaxScaler().set_min(0.0).set_max(10.0)

        model = min_max_scalar.fit(train_data)
        result = model.transform(predict_data)[0]

        output_col = min_max_scalar.get_output_col()
        field_names = result.get_schema().get_field_names()
        self.verify_output_result(result, output_col, field_names,
                                  [Vectors.dense(5.0, 5.0)])
Ejemplo n.º 10
0
def basic_operations():
    """Run map, filter and key-by/sum examples over a small in-memory stream.

    The source stream has rows of ``(id, info)`` where ``info`` is a JSON
    string. Each derived stream is passed to ``show`` together with the
    execution environment.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(collection=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                             type_info=Types.ROW_NAMED(
                                 ["id", "info"],
                                 [Types.INT(), Types.STRING()]))

    # map
    def update_tel(data):
        """Return ``(id, info)`` with the ``tel`` field of ``info`` incremented."""
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by
    def country_and_tel(data):
        """Return ``(country, tel)`` extracted from the JSON ``info`` field."""
        # Parse the payload once instead of once per extracted field.
        info = json.loads(data.info)
        return info['addr']['country'], info['tel']

    show(ds.map(country_and_tel).key_by(lambda data: data[0]).sum(1), env)
Ejemplo n.º 11
0
 def setUp(self):
     """Prepare the table with features, label and weight columns.

     ``features`` is a dense-vector column; ``label`` and ``weight`` are
     DOUBLE columns.
     """
     super(LogisticRegressionTest, self).setUp()

     # (feature values, label, weight) triples, kept in their given order.
     samples = [
         ([1, 2, 3, 4], 0., 1.),
         ([2, 2, 3, 4], 0., 2.),
         ([3, 2, 3, 4], 0., 3.),
         ([4, 2, 3, 4], 0., 4.),
         ([5, 2, 3, 4], 0., 5.),
         ([11, 2, 3, 4], 1., 1.),
         ([12, 2, 3, 4], 1., 2.),
         ([13, 2, 3, 4], 1., 3.),
         ([14, 2, 3, 4], 1., 4.),
         ([15, 2, 3, 4], 1., 5.),
     ]
     rows = [(Vectors.dense(features), label, weight)
             for features, label, weight in samples]
     row_type = Types.ROW_NAMED(
         ['features', 'label', 'weight'],
         [DenseVectorTypeInfo(), Types.DOUBLE(), Types.DOUBLE()])

     self.binomial_data_table = self.t_env.from_data_stream(
         self.env.from_collection(rows, type_info=row_type))
Ejemplo n.º 12
0
    def setUp(self):
        """Prepare the mixed-type input table and two expected vectors.

        The table has columns ``id`` (INT), ``vec`` (dense vector), ``num``
        (DOUBLE) and ``sparse_vec`` (sparse vector). The last row has
        ``None`` in every column except ``id``.
        """
        super(VectorAssemblerTest, self).setUp()

        input_rows = [
            (0,
             Vectors.dense(2.1, 3.1),
             1.0,
             Vectors.sparse(5, [3], [1.0])),
            (1,
             Vectors.dense(2.1, 3.1),
             1.0,
             Vectors.sparse(5, [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0])),
            (2, None, None, None),
        ]
        field_names = ['id', 'vec', 'num', 'sparse_vec']
        field_types = [
            Types.INT(),
            DenseVectorTypeInfo(),
            Types.DOUBLE(),
            SparseVectorTypeInfo(),
        ]
        input_stream = self.env.from_collection(
            input_rows, type_info=Types.ROW_NAMED(field_names, field_types))
        self.input_data_table = self.t_env.from_data_stream(input_stream)

        self.expected_output_data_1 = Vectors.sparse(
            8, [0, 1, 2, 6], [2.1, 3.1, 1.0, 1.0])
        self.expected_output_data_2 = Vectors.dense(
            2.1, 3.1, 1.0, 0.0, 1.0, 2.0, 3.0, 4.0)
Ejemplo n.º 13
0
 def setUp(self):
     """Prepare the feature table and the two expected groups of points.

     The table has a single dense-vector column named ``features``.
     ``expected_groups`` is a list of two sets of ``DenseVector``.
     """
     super(KMeansTest, self).setUp()

     points = [
         [0.0, 0.0],
         [0.0, 0.3],
         [0.3, 3.0],
         [9.0, 0.0],
         [9.0, 0.6],
         [9.6, 0.0],
     ]
     rows = [(Vectors.dense(point), ) for point in points]
     row_type = Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])
     self.data_table = self.t_env.from_data_stream(
         self.env.from_collection(rows, type_info=row_type))

     first_group = {
         DenseVector([0.0, 0.3]),
         DenseVector([0.3, 3.0]),
         DenseVector([0.0, 0.0]),
     }
     second_group = {
         DenseVector([9.6, 0.0]),
         DenseVector([9.0, 0.0]),
         DenseVector([9.0, 0.6]),
     }
     self.expected_groups = [first_group, second_group]
Ejemplo n.º 14
0
    def setUp(self):
        """Prepare four input tables and the expected reference values.

        Tables created, in order:
          * ``input_data_table``: DOUBLE ``label`` plus a dense-vector
            ``rawPrediction``.
          * ``input_data_table_score``: INT ``label`` plus a DOUBLE
            ``rawPrediction``.
          * ``input_data_table_with_multi_score``: same schema as the first
            table, with repeated ``rawPrediction`` values.
          * ``input_data_table_with_weight``: the first schema plus a DOUBLE
            ``weight`` column.
        """
        super(BinaryClassificationEvaluatorTest, self).setUp()

        def to_table(rows, field_names, field_types):
            stream = self.env.from_collection(
                rows, type_info=Types.ROW_NAMED(field_names, field_types))
            return self.t_env.from_data_stream(stream)

        # (label, first vector component, second vector component)
        vector_samples = [
            (1.0, 0.1, 0.9),
            (1.0, 0.2, 0.8),
            (1.0, 0.3, 0.7),
            (0.0, 0.25, 0.75),
            (0.0, 0.4, 0.6),
            (1.0, 0.35, 0.65),
            (1.0, 0.45, 0.55),
            (0.0, 0.6, 0.4),
            (0.0, 0.7, 0.3),
            (1.0, 0.65, 0.35),
            (0.0, 0.8, 0.2),
            (1.0, 0.9, 0.1),
        ]
        self.input_data_table = to_table(
            [(label, Vectors.dense(first, second))
             for label, first, second in vector_samples],
            ['label', 'rawPrediction'],
            [Types.DOUBLE(), DenseVectorTypeInfo()])

        # (integer label, scalar score)
        score_rows = [
            (1, 0.9),
            (1, 0.8),
            (1, 0.7),
            (0, 0.75),
            (0, 0.6),
            (1, 0.65),
            (1, 0.55),
            (0, 0.4),
            (0, 0.3),
            (1, 0.35),
            (0, 0.2),
            (1, 0.1),
        ]
        self.input_data_table_score = to_table(
            score_rows,
            ['label', 'rawPrediction'],
            [Types.INT(), Types.DOUBLE()])

        # (label, first vector component, second vector component)
        multi_score_samples = [
            (1.0, 0.1, 0.9),
            (1.0, 0.1, 0.9),
            (1.0, 0.1, 0.9),
            (0.0, 0.25, 0.75),
            (0.0, 0.4, 0.6),
            (1.0, 0.1, 0.9),
            (1.0, 0.1, 0.9),
            (0.0, 0.6, 0.4),
            (0.0, 0.7, 0.3),
            (1.0, 0.1, 0.9),
            (0.0, 0.8, 0.2),
            (1.0, 0.9, 0.1),
        ]
        self.input_data_table_with_multi_score = to_table(
            [(label, Vectors.dense(first, second))
             for label, first, second in multi_score_samples],
            ['label', 'rawPrediction'],
            [Types.DOUBLE(), DenseVectorTypeInfo()])

        # (label, first vector component, second vector component, weight)
        weighted_samples = [
            (1.0, 0.1, 0.9, 0.8),
            (1.0, 0.1, 0.9, 0.7),
            (1.0, 0.1, 0.9, 0.5),
            (0.0, 0.25, 0.75, 1.2),
            (0.0, 0.4, 0.6, 1.3),
            (1.0, 0.1, 0.9, 1.5),
            (1.0, 0.1, 0.9, 1.4),
            (0.0, 0.6, 0.4, 0.3),
            (0.0, 0.7, 0.3, 0.5),
            (1.0, 0.1, 0.9, 1.9),
            (0.0, 0.8, 0.2, 1.2),
            (1.0, 0.9, 0.1, 1.0),
        ]
        self.input_data_table_with_weight = to_table(
            [(label, Vectors.dense(first, second), weight)
             for label, first, second, weight in weighted_samples],
            ['label', 'rawPrediction', 'weight'],
            [Types.DOUBLE(), DenseVectorTypeInfo(), Types.DOUBLE()])

        self.expected_data = [
            0.7691481137909708,
            0.3714285714285714,
            0.6571428571428571,
        ]

        self.expected_data_m = [
            0.8571428571428571,
            0.9377705627705628,
            0.8571428571428571,
            0.6488095238095237,
        ]

        self.expected_data_w = 0.8911680911680911

        self.eps = 1e-5