Example #1
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        table_name = str(sql.getTableName())

        # Honor IF NOT EXISTS / OR REPLACE if the table already exists
        if table_name in context.tables:
            if sql.getIfNotExists():
                return
            elif not sql.getReplace():
                raise RuntimeError(
                    f"A table with the name {table_name} is already present."
                )

        kwargs = convert_sql_kwargs(sql.getKwargs())

        logger.debug(
            f"Creating new table with name {table_name} and parameters {kwargs}"
        )

        # Optional parameters: the file format (normalized to lowercase)
        # and whether to persist the created table in memory
        format = kwargs.pop("format", None)
        if format:  # pragma: no cover
            format = format.lower()
        persist = kwargs.pop("persist", False)

        try:
            location = kwargs.pop("location")
        except KeyError:
            raise AttributeError("Parameters must include a 'location' parameter.")

        context.create_table(
            table_name, location, format=format, persist=persist, **kwargs
        )
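
This method is the plugin behind dask-sql's CREATE TABLE ... WITH (...) statement: it validates the name, extracts the keyword options, and forwards everything to Context.create_table. A minimal usage sketch of the SQL it handles (the table name and file path are made up; only the 'location' parameter is mandatory):

    from dask_sql import Context

    c = Context()
    # Registers "my_table" from the file at the given location; any
    # remaining WITH options are forwarded to Context.create_table()
    c.sql("""
        CREATE TABLE my_table WITH (
            location = '/tmp/data.csv',
            format = 'csv',
            persist = True
        )
    """)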
Example #2
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        select = sql.getSelect()
        schema_name, experiment_name = context.fqn(sql.getExperimentName())
        kwargs = convert_sql_kwargs(sql.getKwargs())

        if experiment_name in context.schema[schema_name].experiments:
            if sql.getIfNotExists():
                return
            elif not sql.getReplace():
                raise RuntimeError(
                    f"An experiment with the name {experiment_name} is already present."
                )

        logger.debug(
            f"Creating Experiment {experiment_name} from query {select} with options {kwargs}"
        )
        model_class = None
        automl_class = None
        experiment_class = None
        if "model_class" in kwargs:
            model_class = kwargs.pop("model_class")
            # when model class was provided, must provide experiment_class also for tuning
            if "experiment_class" not in kwargs:
                raise ValueError(
                    f"Parameters must include a 'experiment_class' parameter for tuning {model_class}."
                )
            experiment_class = kwargs.pop("experiment_class")
        elif "automl_class" in kwargs:
            automl_class = kwargs.pop("automl_class")
        else:
            raise ValueError(
                "Parameters must include a 'model_class' or 'automl_class' parameter."
            )
        target_column = kwargs.pop("target_column", "")
        tune_fit_kwargs = kwargs.pop("tune_fit_kwargs", {})
        parameters = kwargs.pop("tune_parameters", {})
        experiment_kwargs = kwargs.pop("experiment_kwargs", {})
        automl_kwargs = kwargs.pop("automl_kwargs", {})
        logger.info(parameters)

        select_query = context._to_sql_string(select)
        training_df = context.sql(select_query)
        if not target_column:
            raise ValueError(
                "Unsupervised algorithms cannot be tuned automatically; "
                "consider providing a 'target_column' parameter."
            )
        non_target_columns = [
            col for col in training_df.columns if col != target_column
        ]
        X = training_df[non_target_columns]
        y = training_df[target_column]

        if model_class and experiment_class:
            try:
                ModelClass = import_class(model_class)
            except ImportError:
                raise ValueError(
                    f"Cannot import model {model_class}. Make sure you spelled it correctly and have installed all packages."
                )
            try:
                ExperimentClass = import_class(experiment_class)
            except ImportError:
                raise ValueError(
                    f"Cannot import tuner {experiment_class}. Make sure you spelled it correctly and have installed all packages."
                )

            try:
                from dask_ml.wrappers import ParallelPostFit
            except ImportError:  # pragma: no cover
                raise ValueError(
                    "dask_ml must be installed to use automl and tune hyperparameters"
                )

            model = ModelClass()

            search = ExperimentClass(model, {**parameters},
                                     **experiment_kwargs)
            logger.info(tune_fit_kwargs)
            search.fit(X, y, **tune_fit_kwargs)
            df = pd.DataFrame(search.cv_results_)
            df["model_class"] = model_class

            context.register_model(
                experiment_name,
                ParallelPostFit(estimator=search.best_estimator_),
                X.columns,
                schema_name=schema_name,
            )

        if automl_class:
            try:
                AutoMLClass = import_class(automl_class)
            except ImportError:
                raise ValueError(
                    f"Cannot import automl model {automl_class}. Make sure you spelled it correctly and have installed all packages."
                )

            try:
                from dask_ml.wrappers import ParallelPostFit
            except ImportError:  # pragma: no cover
                raise ValueError(
                    "dask_ml must be installed to use automl and tune hyperparameters"
                )

            automl = AutoMLClass(**automl_kwargs)
            # Note: this pulls the full dataset onto a single machine and
            # should be avoided if the data doesn't fit in memory
            automl.fit(X.compute(), y.compute())
            df = (
                pd.DataFrame(automl.evaluated_individuals_)
                .T.reset_index()
                .rename({"index": "models"}, axis=1)
            )

            context.register_model(
                experiment_name,
                ParallelPostFit(estimator=automl.fitted_pipeline_),
                X.columns,
                schema_name=schema_name,
            )

        context.register_experiment(experiment_name,
                                    experiment_results=df,
                                    schema_name=schema_name)
        cc = ColumnContainer(df.columns)
        dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
        return dc
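
This handler backs CREATE EXPERIMENT: it runs either a hyperparameter search (model_class plus experiment_class) or an AutoML fit (automl_class) over the result of the SELECT, registers the best estimator as a model under the experiment's name, and returns the trial results as a table. A sketch of the tuning variant, assuming dask-ml and scikit-learn are installed and the query yields a 'target' column (all names here are illustrative):

    c.sql("""
        CREATE EXPERIMENT my_exp WITH (
            model_class = 'sklearn.linear_model.LogisticRegression',
            experiment_class = 'dask_ml.model_selection.GridSearchCV',
            tune_parameters = (C = ARRAY [0.1, 1.0, 10.0]),
            target_column = 'target'
        ) AS (
            SELECT x, y, target FROM my_data
        )
    """)

Because the method returns the search's cv_results_ as a DataContainer, the statement itself yields a table of trials that can be inspected like any other query result.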
Example #3
    def convert(
        self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
    ) -> DataContainer:
        select = sql.getSelect()
        model_name = str(sql.getModelName())
        kwargs = convert_sql_kwargs(sql.getKwargs())

        if model_name in context.models:
            if sql.getIfNotExists():
                return
            elif not sql.getReplace():
                raise RuntimeError(
                    f"A model with the name {model_name} is already present."
                )

        logger.debug(
            f"Creating model {model_name} from query {select} with options {kwargs}"
        )

        try:
            model_class = kwargs.pop("model_class")
        except KeyError:
            raise ValueError("Parameters must include a 'model_class' parameter.")

        target_column = kwargs.pop("target_column", "")
        wrap_predict = kwargs.pop("wrap_predict", False)
        wrap_fit = kwargs.pop("wrap_fit", False)
        fit_kwargs = kwargs.pop("fit_kwargs", {})

        try:
            ModelClass = import_class(model_class)
        except ImportError:
            raise ValueError(
                f"Cannot import model {model_class}. Make sure you spelled it correctly and have installed all packages."
            )

        model = ModelClass(**kwargs)
        if wrap_fit:
            # Incremental wraps estimators that implement partial_fit so
            # they can be trained batch-wise on dask collections
            from dask_ml.wrappers import Incremental

            model = Incremental(estimator=model)

        if wrap_predict:
            # ParallelPostFit parallelizes prediction (but not training)
            # across the partitions of a dask dataframe
            from dask_ml.wrappers import ParallelPostFit

            model = ParallelPostFit(estimator=model)

        select_query = context._to_sql_string(select)
        training_df = context.sql(select_query)

        if target_column:
            non_target_columns = [
                col for col in training_df.columns if col != target_column
            ]
            X = training_df[non_target_columns]
            y = training_df[target_column]
        else:
            X = training_df
            y = None

        model.fit(X, y, **fit_kwargs)
        context.register_model(model_name, model, X.columns)
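
This is a schema-less variant of the CREATE MODEL handler: after the known options are popped, every remaining WITH option becomes a constructor argument of the chosen model class, and the fitted model is registered for later use in prediction queries. A minimal sketch of the corresponding SQL (names are illustrative):

    c.sql("""
        CREATE MODEL my_model WITH (
            model_class = 'sklearn.linear_model.LogisticRegression',
            target_column = 'target',
            wrap_predict = True
        ) AS (
            SELECT x, y, target FROM my_data
        )
    """)

An extra option such as max_iter = 500 in the WITH list would end up as ModelClass(max_iter=500), since leftover kwargs are passed straight to the constructor.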
Example #4
    def convert(self, sql: "org.apache.calcite.sql.SqlNode",
                context: "dask_sql.Context") -> DataContainer:
        select = sql.getSelect()
        schema_name, model_name = context.fqn(sql.getModelName())
        kwargs = convert_sql_kwargs(sql.getKwargs())

        if model_name in context.schema[schema_name].models:
            if sql.getIfNotExists():
                return
            elif not sql.getReplace():
                raise RuntimeError(
                    f"A model with the name {model_name} is already present.")

        logger.debug(
            f"Creating model {model_name} from query {select} with options {kwargs}"
        )

        try:
            model_class = kwargs.pop("model_class")
        except KeyError:
            raise ValueError(
                "Parameters must include a 'model_class' parameter.")

        target_column = kwargs.pop("target_column", "")
        wrap_predict = kwargs.pop("wrap_predict", False)
        wrap_fit = kwargs.pop("wrap_fit", False)
        fit_kwargs = kwargs.pop("fit_kwargs", {})

        select_query = context._to_sql_string(select)
        training_df = context.sql(select_query)

        if target_column:
            non_target_columns = [
                col for col in training_df.columns if col != target_column
            ]
            X = training_df[non_target_columns]
            y = training_df[target_column]
        else:
            X = training_df
            y = None

        try:
            ModelClass = import_class(model_class)
        except ImportError:
            raise ValueError(
                f"Cannot import model {model_class}. Make sure you spelled it correctly and have installed all packages."
            )

        model = ModelClass(**kwargs)
        if wrap_fit:
            try:
                from dask_ml.wrappers import Incremental
            except ImportError:  # pragma: no cover
                raise ValueError("Wrapping requires dask-ml to be installed.")

            model = Incremental(estimator=model)

        if wrap_predict:
            try:
                from dask_ml.wrappers import ParallelPostFit
            except ImportError:  # pragma: no cover
                raise ValueError("Wrapping requires dask-ml to be installed.")

            # When `wrap_predict` is set to True we train on a single-partition
            # frame, because this is only useful for models that are not
            # dask-distributed. Training via a delayed fit ensures that we
            # don't have to transfer the data back to the client for training.

            X_d = X.repartition(npartitions=1).to_delayed()
            if y is not None:
                y_d = y.repartition(npartitions=1).to_delayed()
            else:
                # Pair every X partition with None so the zip below works
                # for unsupervised models as well
                y_d = [None] * len(X_d)

            delayed_model = [
                delayed(model.fit)(x_p, y_p, **fit_kwargs)
                for x_p, y_p in zip(X_d, y_d)
            ]
            model = delayed_model[0].compute()
            model = ParallelPostFit(estimator=model)

        else:
            model.fit(X, y, **fit_kwargs)
        context.register_model(model_name,
                               model,
                               X.columns,
                               schema_name=schema_name)
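
The wrap_predict branch above is the notable part: the frame is collapsed to a single partition and the fit runs inside a dask.delayed task on the cluster, so only the fitted estimator travels back to the client. A standalone sketch of that trick, assuming dask and scikit-learn are installed (the data and model are made up):

    import dask.dataframe as dd
    import pandas as pd
    from dask import delayed
    from sklearn.linear_model import LogisticRegression

    df = dd.from_pandas(
        pd.DataFrame({"x": range(100), "y": [i % 2 for i in range(100)]}),
        npartitions=4,
    )

    # Collapse to one partition so to_delayed() yields a single task
    X_d = df[["x"]].repartition(npartitions=1).to_delayed()
    y_d = df["y"].repartition(npartitions=1).to_delayed()

    # sklearn's fit() returns the estimator itself, so computing the
    # delayed call hands back a trained model without first moving the
    # training data to the client
    model = delayed(LogisticRegression().fit)(X_d[0], y_d[0]).compute()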