def test_init(transformer_class):
    t1 = transformer_class
    t2 = transformer_class
    ts = TransformerSequence([t1, t2])
    array = list(ts.transformers())
    assert len(array) == 2
    assert array[0] == t1
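The `transformer_class` fixture is not shown on this page. A minimal sketch of what it could provide, assuming a pass-through subclass of primrose's AbstractTransformer (the import path, class name, and fixture body are assumptions, not the project's actual fixture):

import pytest
from primrose.base.transformer import AbstractTransformer  # module path assumed

class NoopTransformer(AbstractTransformer):
    """Pass-through transformer used only for testing."""

    def fit(self, data):
        pass  # nothing to learn

    def transform(self, data):
        return data  # return the input unchanged

    def fit_transform(self, data):
        # defined explicitly in case the base class declares it abstract
        self.fit(data)
        return self.transform(data)

@pytest.fixture
def transformer_class():
    # the tests use the fixture value directly as a sequence element,
    # so it yields an instance rather than the class itself
    return NoopTransformer()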
Example 2
    def init_pipeline(self):
        """create the pipeline's TransformerSequence

        Returns:
            a TransformerSequence

        """
        ts = TransformerSequence()
        ts.add(
            ImplicitCategoricalTransform(self.node_config["target_variable"]))
        return ts
Example 3
    def init_pipeline(self):
        """create the pipeline's TransformerSequence

        Returns:
            a TransformerSequence

        """
        ts = TransformerSequence()
        # Note: this is a transformer that does not strictly adhere to the Transformer interface.
        # It takes in a *list* of data frames, not a single data frame, and returns a single data frame.
        ts.add(LeftJoinDataCombiner(self.node_config["join_key"]))
        return ts
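As the note says, the combiner folds a list of frames into one. A sketch of that behavior on pandas frames, assuming successive left joins on the configured key (LeftJoinDataCombiner's actual implementation is not shown here):

from functools import reduce
import pandas as pd

def left_join_combine(frames, join_key):
    """Fold a list of DataFrames into one via successive left joins on join_key."""
    return reduce(
        lambda left, right: left.merge(right, on=join_key, how="left"),
        frames,
    )

users = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
scores = pd.DataFrame({"id": [1, 2], "score": [0.3, 0.9]})
combined = left_join_combine([users, scores], "id")  # single frame out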
Example 4
    def init_pipeline(self):
        """create the pipeline's TransformerSequence

        Returns:
            a TransformerSequence

        """
        self.transformer_sequence = TransformerSequence()

        for transformer in self.node_config["transformer_sequence"]:
            p = self._instantiate_transformer(transformer)
            self.transformer_sequence.add(p)

        return self.transformer_sequence
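For context, a hypothetical config entry this loop would consume; each list item is handed to _instantiate_transformer, whose expected shape is shown in Example 10 (the dotted class path below is made up):

node_config = {
    "transformer_sequence": [
        {
            "class": "mypackage.transformers.ColumnScaler",  # hypothetical dotted path
            "columns": ["x1", "x2"],  # extra keys become constructor arguments
        }
    ]
}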
Example 5
    def init_pipeline(self):
        """Initialize the pipeline if no pipeline object is found in the upstream data objects

        Returns:
            TransformerSequence

        """
        return TransformerSequence()
Example 6
    def init_pipeline(self):
        """create the pipeline's TransformerSequence

        Returns:
            a TransformerSequence

        """
        ts = TransformerSequence()

        for operation in self.node_config["operations"]:

            args = operation.get("args", None)
            columns = operation.get("columns", None)

            p = self._instantiate_preprocessor(operation["class"], args,
                                               columns)
            ts.add(p)

        return ts
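A hypothetical "operations" entry for this loop; "args" and "columns" are optional and default to None when absent (the class names here are made up):

node_config = {
    "operations": [
        {"class": "MinMaxScaler", "args": {"feature_range": (0, 1)}, "columns": ["x1"]},
        {"class": "MeanImputer", "columns": ["x2"]},  # no "args": defaults to None
    ]
}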
Example 7
def test_add(transformer_class):
    ts = TransformerSequence()
    assert len(ts.sequence) == 0
    t = transformer_class
    ts.add(t)
    assert len(ts.sequence) == 1

    with pytest.raises(Exception) as e:
        ts.add(0)
    assert "Transformer needs to extend AbstractTransformer" in str(e)

    ts.add(transformer_class)
    ts.add(transformer_class)

    array = list(ts.transformers())
    assert len(array) == 3
    assert array[0] == t
Example 8
def test_run():
    class TestPipeline(AbstractPipeline):
        def transform(self, data_object):
            logging.info("TRANSFORM CALLED")
            return data_object

        def fit_transform(self, data_object):
            logging.info("FIT_TRANSFORM CALLED")
            return self.transform(data_object)

        @staticmethod
        def necessary_config(node_config):
            return set(["is_training"])

    NodeFactory().register("TestPipeline", TestPipeline)

    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                }
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "TestPipeline",
                    "is_training": True
                }
            },
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    reference_file_path = "test/minimal.csv"
    corpus = pd.read_csv(reference_file_path)

    reader = CsvReader(configuration, "myreader")

    data_object = DataObject(configuration)
    data_object.add(reader, corpus)

    pipeline = TestPipeline(configuration, "mypipeline")

    with LogCapture() as l:
        pipeline.run(data_object)
    l.check(
        (
            "root",
            "INFO",
            "No upstream TransformerSequence found. Creating new TransformerSequence...",
        ),
        ("root", "INFO", "FIT_TRANSFORM CALLED"),
        ("root", "INFO", "TRANSFORM CALLED"),
    )

    data_object.add(reader, TransformerSequence(), "tsequence")
    with LogCapture() as l:
        pipeline.run(data_object)
    l.check(
        (
            "root",
            "INFO",
            "Upstream TransformerSequence found, initializing pipeline...",
        ),
        ("root", "INFO", "FIT_TRANSFORM CALLED"),
        ("root", "INFO", "TRANSFORM CALLED"),
    )

    config["implementation_config"]["pipeline_config"]["mypipeline"][
        "is_training"] = False
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    reader = CsvReader(configuration, "myreader")
    data_object = DataObject(configuration)
    data_object.add(reader, corpus)
    pipeline = TestPipeline(configuration, "mypipeline")
    with LogCapture() as l:
        pipeline.run(data_object)
    l.check(
        (
            "root",
            "INFO",
            "No upstream TransformerSequence found. Creating new TransformerSequence...",
        ),
        ("root", "INFO", "TRANSFORM CALLED"),
    )
Example 9
def test_execute_pipeline():
    class TestTransformer(AbstractTransformer):
        def fit(self, data):
            logging.info("Transfer FIT CALLED")

        def transform(self, data):
            logging.info("Transfer TRANSFORM CALLED")
            return data

        def fit_transform(self, data):
            logging.info("Transfer FIT_TRANSFORM CALLED")
            self.fit(data)
            return self.transform(data)

    class TestPipeline2(AbstractPipeline):
        def transform(self, data_object):
            return data_object

        @staticmethod
        def necessary_config(node_config):
            return set(["is_training"])

    NodeFactory().register("TestPipeline2", TestPipeline2)

    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                }
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "TestPipeline",
                    "is_training": True
                }
            },
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    reference_file_path = "test/minimal.csv"
    corpus = pd.read_csv(reference_file_path)

    reader = CsvReader(configuration, "myreader")

    data_object = DataObject(configuration)
    data_object.add(reader, corpus)

    sequence = TransformerSequence()
    sequence.add(TestTransformer())
    data_object.add(reader, sequence, "tsequence")

    pipeline = TestPipeline2(configuration, "mypipeline")

    with pytest.raises(Exception) as e:
        pipeline.execute_pipeline(corpus, PipelineModeType.FIT)
    assert "run() must be called to extract/create a TransformerSequence" in str(
        e)

    pipeline.run(data_object)

    with pytest.raises(Exception) as e:
        pipeline.execute_pipeline(corpus, "JUNK")
    assert "mode must be of type PipelineModeType Enum object." in str(e)

    with LogCapture() as l:
        pipeline.execute_pipeline(corpus, PipelineModeType.FIT)
    l.check(("root", "INFO", "Transfer FIT CALLED"))

    with LogCapture() as l:
        pipeline.execute_pipeline(corpus, PipelineModeType.FIT_TRANSFORM)
    l.check(
        ("root", "INFO", "Transfer FIT_TRANSFORM CALLED"),
        ("root", "INFO", "Transfer FIT CALLED"),
        ("root", "INFO", "Transfer TRANSFORM CALLED"),
    )
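The assertions and log order above constrain execute_pipeline fairly tightly. A sketch of a dispatch consistent with them; the enum values and the getattr dispatch are assumptions, while the two error messages are copied from the test:

from enum import Enum

class PipelineModeType(Enum):
    FIT = "fit"
    TRANSFORM = "transform"
    FIT_TRANSFORM = "fit_transform"

def execute_pipeline(self, data, mode):
    # run() must have populated the sequence first
    if getattr(self, "transformer_sequence", None) is None:
        raise Exception("run() must be called to extract/create a TransformerSequence")
    if not isinstance(mode, PipelineModeType):
        raise Exception("mode must be of type PipelineModeType Enum object.")
    for transformer in self.transformer_sequence.transformers():
        result = getattr(transformer, mode.value)(data)  # fit / transform / fit_transform
        if mode is not PipelineModeType.FIT:
            data = result  # fit() returns nothing, so keep data as-is in FIT mode
    return data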
Example 10
class TransformerPipeline(TrainTestSplit):
    @staticmethod
    def necessary_config(node_config):
        """Return the necessary configuration keys for the TransformerPipeline object

        Returns:
            set of keys

        """
        return set(["transformer_sequence"])

    @staticmethod
    def optional_config(node_config):
        """Return the optional configuration keys for the TransformerPipeline object

        Returns:
            set of keys

        """
        return TrainTestSplit.necessary_config(node_config)

    def init_pipeline(self):
        """create the pipeline's TransformerSequence

        Returns:
            a TransformerSequence

        """
        self.transformer_sequence = TransformerSequence()

        for transformer in self.node_config["transformer_sequence"]:
            p = self._instantiate_transformer(transformer)
            self.transformer_sequence.add(p)

        return self.transformer_sequence

    @staticmethod
    def _instantiate_transformer(transformer):
        """Import and validate user-defined transformer either from primrose or a custom codebase

        Args:
            transformer (dict): config entry whose "class" key is the dotted path to an AbstractTransformer subclass

        Returns:
            AbstractTransformer

        """
        classname = transformer["class"]
        path_sequence = classname.split(".")
        target_class_name = path_sequence.pop(-1)
        module = importlib.import_module(".".join(path_sequence))

        try:
            t = getattr(module, target_class_name)
        except AttributeError:
            raise Exception(
                f'Transformer {target_class_name} not found in {".".join(path_sequence)} module'
            )

        class_args = {k: v for k, v in transformer.items() if k != "class"}
        params = [p for p in inspect.signature(t).parameters]
        t_args = [class_args.pop(p) for p in params if p in class_args.keys()]

        return t(*t_args, **class_args)
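To illustrate the argument split at the end: keys that match constructor parameter names are popped in signature order and passed positionally; anything left over is passed as keyword arguments. A hypothetical entry (the class path is made up):

transformer = {
    "class": "mypackage.transformers.ScalerTransformer",
    # if the constructor is __init__(self, columns, copy=True, **kwargs):
    "columns": ["x1", "x2"],  # matches a parameter name -> positional via t_args
    "verbose": True,          # no matching parameter -> passed through **class_args
}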