def test_coll_df_operations(self):
        """Verify split/append_id/summary build the expected nodes and roles."""
        from odps.ml.nodes import transform_nodes as tnodes

        # split(0.75) yields two frames whose field roles are identical.
        parts = self.df.split(0.75)
        self.assertEqual(2, len(parts))
        self.assertEqual(_df_roles(parts[0]), _df_roles(parts[1]))
        node = adapter_from_df(parts[0])._bind_node
        self.assertEqual("Split", node.code_name)
        self.assertEqual(0.75, node.parameters["fraction"])

        # append_id adds a role-less ID column alongside the feature columns.
        with_id = self.df.append_id()
        expected_roles = {
            "category": "FEATURE",
            "petal_length": "FEATURE",
            "petal_width": "FEATURE",
            "sepal_width": "FEATURE",
            "sepal_length": "FEATURE",
            "append_id": "",
        }
        self.assertEqual(_df_roles(with_id), expected_roles)
        node = adapter_from_df(with_id)._bind_node
        self.assertEqual("AppendID", node.code_name)
        self.assertEqual("append_id", node.parameters["IDColName"])

        # The summary adapter must bind to a SummaryNode.
        summary_node = self.df._create_summary_adapter()._bind_node
        self.assertIsInstance(summary_node, tnodes.SummaryNode)
# Example #2
# 0
    def assertFieldsEqual(self, ds1, ds2, func=repr):
        """Assert that two field collections match after normalization.

        Each argument may be a CollectionExpr (resolved through its adapter),
        a DFAdapter, or a plain field list; MLField items are projected
        through *func* (repr by default) before comparison.
        """
        def normalize(obj):
            if isinstance(obj, CollectionExpr):
                obj = adapter_from_df(obj)
            if isinstance(obj, DFAdapter):
                obj = obj._fields
            if not obj:
                return []
            if isinstance(obj[0], MLField):
                return [func(f) for f in obj]
            return obj

        return self.assertEqual(normalize(ds1), normalize(ds2))
# Example #3
# 0
    def mock_action(self, sources, output_desc=1, msg='', action=None):
        """Wire *sources* into a MockNode and return its mock output(s).

        sources: a single DataFrame/model object, or an iterable of them.
        output_desc: an int (that many DATA outputs) or a string where each
            'd' character maps to a DATA port and any other char to MODEL.
        msg, action: forwarded verbatim to the MockNode constructor.
        Returns a single output object when exactly one output port is
        produced, otherwise a list of outputs.
        """
        try:
            from odps.ml import PmmlModel
        except ImportError:
            # When PmmlModel is unavailable, model outputs degrade to
            # DataFrame copies (see the PmmlModel-is-None check below).
            PmmlModel = None

        if not isinstance(sources, Iterable):
            sources = [
                sources,
            ]

        # One input port per source: DataFrames feed DATA, others MODEL.
        input_types = [
            PortType.DATA if isinstance(o, CollectionExpr) else PortType.MODEL
            for o in sources
        ]

        # Resolve DataFrames to their DFAdapter; models pass through as-is.
        source_objs = [
            adapter_from_df(s) if isinstance(s, CollectionExpr) else s
            for s in sources
        ]
        uplinks = [
            adapter for adapter in source_objs
            if isinstance(adapter, DFAdapter)
        ]

        if isinstance(output_desc, six.integer_types):
            output_types = [PortType.DATA for _ in range(output_desc)]
        else:
            # 'd' -> DATA port; any other character -> MODEL port.
            output_types = [
                PortType.DATA if ch == 'd' else PortType.MODEL
                for ch in output_desc
            ]

        merge_node = MockNode(msg, action, input_types, output_types)
        odps = None
        for idx, o in enumerate(source_objs):
            # Input ports are named input1..inputN (1-based).
            o._link_node(merge_node, 'input%d' % (1 + idx))
            odps = o._odps
        outputs = []
        for idx, out_type in enumerate(output_types):
            if out_type == PortType.DATA or PmmlModel is None:
                # Copy the first DataFrame source and bind a fresh adapter
                # to this output port so downstream links keep working.
                new_df = six.next(s for s in sources
                                  if isinstance(s, CollectionExpr)).copy()
                DFAdapter(odps,
                          merge_node.outputs['output%d' % (1 + idx)],
                          new_df,
                          uplink=uplinks)
                outputs.append(new_df)
            else:
                outputs.append(
                    PmmlModel(odps,
                              port=merge_node.outputs['output%d' % (1 + idx)]))
        if len(output_types) == 1:
            return outputs[0]
        else:
            return outputs
    def test_sample(self):
        """Each sampling variant should route to its matching node type."""
        def bound_node(frame):
            return adapter_from_df(frame)._bind_node

        by_count = self.df.sample(n=20)
        self.assertIsInstance(by_count, DataFrame)
        self.assertEqual("RandomSample", bound_node(by_count).code_name)

        by_frac = self.df.sample(frac=0.5)
        self.assertIsInstance(by_frac, DataFrame)
        self.assertEqual("RandomSample", bound_node(by_frac).code_name)

        # Weighted sampling records the weight column as probCol.
        weighted = self.df.sample(frac=0.5, weights=self.df.sepal_length)
        self.assertIsInstance(weighted, DataFrame)
        w_node = bound_node(weighted)
        self.assertEqual("WeightedSample", w_node.code_name)
        self.assertEqual("sepal_length", w_node.parameters["probCol"])

        stratified = self.df.sample(frac={"Iris-setosa": 0.5}, strata="category")
        self.assertIsInstance(stratified, DataFrame)
        self.assertEqual("StratifiedSample", bound_node(stratified).code_name)
    def testSimpleJoin(self):
        """A chained join of three mock tables binds one source-less node."""
        def make_expr(table_name, columns, col_types):
            # Build a CollectionExpr over a mock table with the given schema.
            schema = Schema.from_lists(columns, col_types)
            table = MockTable(name=table_name, schema=schema)
            return CollectionExpr(_source_data=table, _schema=schema)

        left = make_expr('pyodps_test_expr_table', ['name', 'id'],
                         [types.string, types.int64])
        middle = make_expr('pyodps_test_expr_table1', ['id', 'value'],
                           [types.int64, types.string])
        right = make_expr('pyodps_test_expr_table2', ['value', 'num'],
                          [types.string, types.float64])

        joined = left.join(middle).join(right)
        bind_node = adapter_from_df(joined)._bind_node
        self.assertEqual(0, len(bind_node.inputs))
        self.assertEqual(1, len(bind_node.outputs))
    def testSimpleJoin(self):
        """Joining three mock tables yields a node with no inputs, one output."""
        table_specs = [
            ('pyodps_test_expr_table', ['name', 'id'],
             [types.string, types.int64]),
            ('pyodps_test_expr_table1', ['id', 'value'],
             [types.int64, types.string]),
            ('pyodps_test_expr_table2', ['value', 'num'],
             [types.string, types.float64]),
        ]
        exprs = []
        for tname, col_names, col_types in table_specs:
            schema = Schema.from_lists(col_names, col_types)
            exprs.append(CollectionExpr(
                _source_data=MockTable(name=tname, schema=schema),
                _schema=schema))

        df = exprs[0].join(exprs[1]).join(exprs[2])
        node = adapter_from_df(df)._bind_node
        self.assertEqual(len(node.inputs), 0)
        self.assertEqual(len(node.outputs), 1)
# Example #7
# 0
    def mock_action(self, sources, output_desc=1, msg='', action=None):
        """Link *sources* into a MockNode and return its mock output(s).

        sources: a single DataFrame/model object, or an iterable of them.
        output_desc: an int (that many DATA outputs) or a string where each
            'd' character maps to a DATA port and any other char to MODEL.
        msg, action: forwarded verbatim to the MockNode constructor.
        Returns one output object when a single output port is produced,
        otherwise a list of outputs.
        """
        try:
            from odps.ml import PmmlModel
        except ImportError:
            # Without PmmlModel, model outputs fall back to DataFrames
            # (see the PmmlModel-is-None check below).
            PmmlModel = None

        if not isinstance(sources, Iterable):
            sources = [sources, ]

        # One input port per source: DataFrames feed DATA, others MODEL.
        input_types = [PortType.DATA if isinstance(o, CollectionExpr) else PortType.MODEL for o in sources]

        # Resolve DataFrames to their DFAdapter; model objects pass through.
        source_objs = [adapter_from_df(s) if isinstance(s, CollectionExpr) else s for s in sources]
        uplinks = [adapter for adapter in source_objs if isinstance(adapter, DFAdapter)]

        if isinstance(output_desc, six.integer_types):
            output_types = [PortType.DATA for _ in range(output_desc)]
        else:
            # 'd' -> DATA port; any other character -> MODEL port.
            output_types = [PortType.DATA if ch == 'd' else PortType.MODEL for ch in output_desc]

        merge_node = MockNode(msg, action, input_types, output_types)
        odps = None
        for idx, o in enumerate(source_objs):
            # Input ports are named input1..inputN (1-based).
            o._link_node(merge_node, 'input%d' % (1 + idx))
            odps = o._odps
        outputs = []
        for idx, out_type in enumerate(output_types):
            if out_type == PortType.DATA or PmmlModel is None:
                # Build a fresh mock table matching the first DataFrame
                # source's schema and bind an adapter to this output port.
                schema = df_schema_to_odps_schema(six.next(s for s in sources if isinstance(s, CollectionExpr)).schema)
                new_df = DataFrame(DFAdapter._build_mock_table('mock_table', schema, self.odps))
                DFAdapter(odps, merge_node.outputs['output%d' % (1 + idx)], new_df, uplink=uplinks)
                outputs.append(new_df)
            else:
                outputs.append(PmmlModel(odps, port=merge_node.outputs['output%d' % (1 + idx)]))
        if len(output_types) == 1:
            return outputs[0]
        else:
            return outputs
# Example #8
# 0
 def _add_case(self, case):
     """Append *case* to the cases of the node bound to this DataFrame.

     Returns self so calls can be chained fluently.
     """
     bound_node = adapter_from_df(self)._bind_node
     bound_node.cases.append(case)
     return self
 def action(df):
     """Record entry, run the node bound to *df*, then record completion."""
     call_seq.append('B')
     bound_node = adapter_from_df(df)._bind_node
     self.ml_context._run(bound_node, self.odps)
     call_seq.append('A')
# Example #10
# 0
def _get_bind_port(obj):
    """Return *obj*'s bound port, resolving CollectionExprs via their adapter."""
    if not isinstance(obj, CollectionExpr):
        return obj._bind_port
    return adapter_from_df(obj)._bind_port
def _df_key_value(df):
    """Map each field name to the repr of its KV config ('' when unset)."""
    return {
        f.name: repr(f.kv_config) if f.kv_config else ""
        for f in adapter_from_df(df).fields
    }
def _df_continuity(df):
    """Map each field name to its continuity enum member name."""
    return {f.name: f.continuity.name for f in adapter_from_df(df).fields}
def _df_roles(df):
    """Map each field name to a comma-joined string of its role names."""
    return {
        f.name: ",".join(r.name for r in f.role)
        for f in adapter_from_df(df).fields
    }
# Example #14
# 0
    def test_operations(self):
        """Exercise every field-metadata operation, checking that each one
        changes only the mocked target and leaves the source DF untouched."""
        df1 = self.get_table1_df()
        df1_ep = adapter_from_df(df1)
        # Deep-copy the source fields so later asserts can prove the
        # operations never mutate the inputs in place.
        src_fields1 = copy.deepcopy([f for f in adapter_from_df(df1)._fields])
        df2 = self.get_table2_df()
        df2_ep = adapter_from_df(df2)
        src_fields2 = copy.deepcopy([f for f in adapter_from_df(df2)._fields])

        # BatchRoleOperation: add WEIGHT to both named columns at once.
        target = self.mock_action(df1)
        self.exec_op(BatchRoleOperation(['col11', 'col12'], FieldRole.WEIGHT, True), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, [set([FieldRole.FEATURE, FieldRole.WEIGHT]), ] * 2, lambda f: f.role)

        # ExcludeFieldsOperation: excluded column loses all roles.
        target = self.mock_action(df1)
        self.exec_op(ExcludeFieldsOperation(['col12', ]), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, [set([FieldRole.FEATURE, ]), set()], lambda f: f.role)

        # SingletonRoleOperation: per-column role assignment.
        target = self.mock_action(df1)
        self.exec_op(SingletonRoleOperation({'col11': FieldRole.WEIGHT, 'col12': FieldRole.LABEL}), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target,
                               [set([FieldRole.FEATURE, FieldRole.WEIGHT]), set([FieldRole.FEATURE, FieldRole.LABEL])],
                               lambda f: f.role)

        # FieldContinuityOperation: True -> CONTINUOUS, False -> DISCRETE.
        target = self.mock_action(df1)
        self.exec_op(FieldContinuityOperation(dict(col11=True, col12=False, col13=True)), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, [FieldContinuity.CONTINUOUS, FieldContinuity.DISCRETE, FieldContinuity.CONTINUOUS],
                               lambda f: f.continuity)
        self.assertEqual(adapter_from_df(target)._fields[-1].name, 'col13')
        # NOTE(review): the literal type 'expected' below looks like a
        # placeholder — confirm this is the intended field type.
        self.assertEqual(adapter_from_df(target)._fields[-1].type, 'expected')

        # FieldKVConfigOperation: attach KV configs per column.
        target = self.mock_action(df1)
        kv_config_vals = [KVConfig(':', ','), KVConfig('_', '+'), KVConfig('*', '%')]
        kv_config = dict(zip(['col11', 'col12', 'col13'], kv_config_vals))
        self.exec_op(FieldKVConfigOperation(kv_config), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, kv_config_vals, lambda f: f.kv_config)
        self.assertEqual(adapter_from_df(target)._fields[-1].name, 'col13')
        # NOTE(review): literal 'expected' type again — see note above.
        self.assertEqual(adapter_from_df(target)._fields[-1].type, 'expected')

        # StaticFieldChangeOperation (replace): fields are swapped wholesale.
        target = self.mock_action(df1)
        self.exec_op(StaticFieldChangeOperation([MLField('col13', 'bigint', FieldRole.FEATURE),
                                                 MLField('col14', 'bigint', FieldRole.FEATURE)]), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, ['col13', 'col14'], lambda f: f.name)

        # StaticFieldChangeOperation (append): new fields follow originals.
        target = self.mock_action(df1)
        self.exec_op(StaticFieldChangeOperation([MLField('col13', 'bigint', FieldRole.FEATURE),
                                                 MLField('col14', 'bigint', FieldRole.FEATURE)],
                                                is_append=True), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, ['col11', 'col12', 'col13', 'col14'], lambda f: f.name)

        def test_generator(params, fields):
            # Generator used by ProgrammaticFieldChangeOperation: checks the
            # params/fields it receives, then emits a field-definition string.
            self.assertDictEqual(params, dict(message='TestMsg'))
            self.assertFieldsEqual(df1, fields[0])
            return 'field1:string:label,field2:bigint'

        # ProgrammaticFieldChangeOperation (replace).
        target = self.mock_action(df1, msg='TestMsg')
        self.exec_op(ProgrammaticFieldChangeOperation(
            functools.partial(test_generator, adapter_from_df(target)._bind_node.parameters, {0: df1_ep.fields}),
            is_append=False), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, [
            MLField('field1', 'string', FieldRole.LABEL, FieldContinuity.DISCRETE),
            MLField('field2', 'bigint', FieldRole.FEATURE, FieldContinuity.CONTINUOUS),
        ])

        # ProgrammaticFieldChangeOperation (append).
        target = self.mock_action(df1, msg='TestMsg')
        self.exec_op(ProgrammaticFieldChangeOperation(
            functools.partial(test_generator, adapter_from_df(target)._bind_node.parameters, {0: df1_ep.fields}),
            is_append=True), [df1, ], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(target, df1_ep.fields + [
            MLField('field1', 'string', FieldRole.LABEL, FieldContinuity.DISCRETE),
            MLField('field2', 'bigint', FieldRole.FEATURE, FieldContinuity.CONTINUOUS),
        ])

        # MergeFieldsOperation without renaming: selected minus excluded.
        sel_cols = {0: [f.name for f in df1_ep.fields], 1: [df2_ep.fields[0].name, ]}
        exc_cols = {0: [], 1: [df2_ep.fields[1].name, ]}
        target = self.mock_action([df1, df2])
        self.exec_op(MergeFieldsOperation(False, sel_cols, exc_cols), [df1, df2], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(df2, src_fields2)
        self.assertFieldsEqual(target, df1_ep.fields + [df2_ep.fields[0], ])

        # MergeFieldsOperation with renaming: names get t<idx>_ prefixes.
        target = self.mock_action([df1, df2])
        new_table_names = ['t0_%s' % f.name for f in df1_ep.fields] + ['t1_%s' % df2_ep.fields[0].name, ]
        self.exec_op(MergeFieldsOperation(True, sel_cols, exc_cols), [df1, df2], target)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertFieldsEqual(df2, src_fields2)
        self.assertFieldsEqual(target, new_table_names, lambda f: f.name)

        # SetPartitionOperation: partitions land on the target only.
        target = self.mock_action(df1)
        part_def = PartitionSelection('part1=1,part2=2')
        self.exec_op(SetPartitionOperation(part_def), [df1, ], target)
        self.assertEqual(df1_ep.partitions, None)
        self.assertFieldsEqual(df1, src_fields1)
        self.assertEqual(repr(part_def), repr(adapter_from_df(target).partitions))
        self.assertFieldsEqual(target, src_fields1)
# Example #15
# 0
 def exec_op(op, dfs, target):
     """Execute *op* over the adapters of *dfs*, writing into *target*'s adapter."""
     source_adapters = [adapter_from_df(frame) for frame in dfs]
     op.execute(source_adapters, adapter_from_df(target))
def _get_bind_port(obj):
    """Fetch the `_bind_port` of *obj*, unwrapping CollectionExprs first."""
    holder = adapter_from_df(obj) if isinstance(obj, CollectionExpr) else obj
    return holder._bind_port