def run(file, meta, features, response, family, outpath, predict):
    """
    Fit a gradient boosting machine (GBM).

    Reads column metadata, fits a GBM of the given `family` on `file`,
    and writes the fit to `outpath`. If `predict` names an existing file,
    predictions for it are computed and written as well.

    NOTE(review): the original docstring said "generalized linear
    regression", but the code constructs a GBM — corrected to match.
    """
    import pathlib

    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_and_transmute, read_column_info
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outpath = drop_suffix(outpath, "/")
    set_logger(as_logfile(outpath))
    with SparkSession() as spark:
        try:
            meta, features = read_column_info(meta, features)
            data = read_and_transmute(spark, file, features, response)
            fl = GBM(spark, response, features, family)
            fl = fl.fit(data)
            fl.write(outpath)
            if pathlib.Path(predict).exists():
                pre_data = read_and_transmute(
                    spark, predict, features, drop=False)
                pre_data = fl.predict(pre_data)
                # NOTE(review): predictions are written to the same
                # `outpath` as the fitted model — confirm `write` does
                # not clobber the model output.
                pre_data.write(outpath)
        except Exception as e:
            # `logger` is presumed to be a module-level logger — the
            # lazy %-arg form avoids eager string formatting.
            logger.error("Some error: %s", e)
def run(factors, file, features, outpath):
    """
    Fit a factor analysis to a data set.

    Reads the feature column info, fits a factor analysis with `factors`
    latent factors on `file`, and writes the transformed data to
    `outpath`.
    """
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_and_transmute, read_info
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outpath = drop_suffix(outpath, "/")
    set_logger(as_logfile(outpath))
    with SparkSession() as spark:
        try:
            features = read_info(features)
            data = read_and_transmute(
                spark, file, features, assemble_features=False)
            fl = FactorAnalysis(spark, factors, features)
            trans = fl.fit_transform(data)
            trans.write(outpath)
        except Exception as e:
            # `logger` is presumed to be a module-level logger.
            # Lazy %-args (consistent with the GBM driver) instead of
            # eager "...".format(str(e)).
            logger.error("Some error: %s", e)
def run(clusters, file, features, outpath):
    """
    Fit a k-means clustering to a data set.

    Reads the feature column info, fits a k-means model with `clusters`
    centers on `file`, and writes the fit to `outpath`.
    """
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_and_transmute, read_info
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outfolder = drop_suffix(outpath, "/")
    # Fix: log under the suffix-stripped path, as the other drivers do
    # (original passed the unstripped `outpath` to as_logfile).
    set_logger(as_logfile(outfolder))
    with SparkSession() as spark:
        try:
            features = read_info(features)
            data = read_and_transmute(spark, file, features)
            fit = KMeans(spark, clusters, features)
            fit = fit.fit(data, outfolder)
            fit.write(data, outfolder)
        except Exception as e:
            # `logger` is presumed to be a module-level logger; lazy
            # %-args for consistency with the other drivers.
            logger.error("Some error: %s", e)