def run(file, meta, features, response, family, outpath, predict):
    """
    Fit a ``GBM`` model to a data set and optionally predict with it.

    The fitted model is written to ``outpath``. If ``predict`` is a non-empty
    path that exists, that data set is read, passed to the fitted model's
    ``predict`` method and the result is written to ``outpath`` as well.

    Any exception raised while reading, fitting, predicting or writing is
    logged through the module-level ``logger`` and is not re-raised.

    :param file: input data set, forwarded to ``read_and_transmute``
    :param meta: meta column description, resolved by ``read_column_info``
    :param features: feature column description, resolved by
        ``read_column_info``
    :param response: response column, forwarded to ``read_and_transmute``
        and ``GBM``
    :param family: family argument forwarded to ``GBM``
    :param outpath: output location; it is passed through
        ``drop_suffix(outpath, "/")`` before use and also determines the
        log file via ``as_logfile``
    :param predict: path of a data set to predict on; prediction is skipped
        when it is empty, ``None`` or does not exist
    """
    import pathlib

    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_and_transmute, read_column_info
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outpath = drop_suffix(outpath, "/")
    set_logger(as_logfile(outpath))

    with SparkSession() as spark:
        try:
            meta, features = read_column_info(meta, features)
            data = read_and_transmute(spark, file, features, response)
            fl = GBM(spark, response, features, family)
            fl = fl.fit(data)
            fl.write(outpath)
            # An empty path resolves to the current directory (which always
            # exists) and ``None`` makes ``Path`` raise, so require a
            # non-empty value before probing the file system.
            if predict and pathlib.Path(predict).exists():
                pre_data = read_and_transmute(
                    spark, predict, features, drop=False)
                pre_data = fl.predict(pre_data)
                pre_data.write(outpath)
        except Exception as e:
            logger.error("Some error: %s", str(e))
def run(factors, file, features, outpath):
    """
    Run a factor analysis on a data set and write the transformed data.

    The feature description is resolved with ``read_info``, the data set is
    read with ``read_and_transmute`` (without assembling features), a
    ``FactorAnalysis`` is fitted and applied in one step, and the result is
    written to ``outpath``.

    Any exception raised along the way is logged through the module-level
    ``logger`` and is not re-raised.

    :param factors: forwarded to ``FactorAnalysis``
    :param file: input data set, forwarded to ``read_and_transmute``
    :param features: feature description, resolved by ``read_info``
    :param outpath: output location; it is passed through
        ``drop_suffix(outpath, "/")`` before use and also determines the
        log file via ``as_logfile``
    """
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_and_transmute, read_info
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outpath = drop_suffix(outpath, "/")
    logfile = as_logfile(outpath)
    set_logger(logfile)

    with SparkSession() as spark:
        try:
            features = read_info(features)
            frame = read_and_transmute(
                spark, file, features, assemble_features=False)
            analysis = FactorAnalysis(spark, factors, features)
            transformed = analysis.fit_transform(frame)
            transformed.write(outpath)
        except Exception as err:
            message = "Some error: {}".format(str(err))
            logger.error(message)
def run(inpath, outpath, pval):
    """
    Apply ``Outliers`` to a parquet data set and write the result as parquet.

    The data set at ``inpath`` is read with ``read_parquet``, passed to
    ``Outliers.fit_transform`` and the returned data is written with
    ``write_parquet``.

    Any exception raised along the way is logged through the module-level
    ``logger`` and is not re-raised.

    :param inpath: parquet input, forwarded to ``read_parquet``
    :param outpath: output location; it is passed through
        ``drop_suffix(outpath, "/")`` before use and also determines the
        log file via ``as_logfile``
    :param pval: forwarded to ``Outliers``
    """
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_parquet, write_parquet
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outpath = drop_suffix(outpath, "/")
    logfile = as_logfile(outpath)
    set_logger(logfile)

    with SparkSession() as spark:
        try:
            detector = Outliers(spark, pval)
            raw = read_parquet(spark, inpath)
            cleaned = detector.fit_transform(raw)
            write_parquet(cleaned, outpath)
        except Exception as err:
            message = "Some error: {}".format(str(err))
            logger.error(message)
def run(file, output, n):
    """
    Sample from a data set and write the sample as a TSV.

    The data set is read with ``read``, handed to the module-level
    ``sample`` together with ``n``, and the result is written with
    ``write_tsv``.

    Any exception raised while reading, sampling or writing is logged
    through the module-level ``logger`` and is not re-raised.

    :param file: input data set, forwarded to ``read``
    :param output: output location; it is passed through
        ``drop_suffix(output, "/")`` before use and also determines the
        log file via ``as_logfile``
    :param n: forwarded to ``sample``
    """
    # ``as_logfile`` is imported here like in the sibling entry points so the
    # function does not depend on a module-level import being present.
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read, write_tsv
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    output = drop_suffix(output, "/")
    set_logger(as_logfile(output))

    with SparkSession() as spark:
        try:
            data = read(spark, file)
            subsamp = sample(data, n)
            write_tsv(subsamp, output)
        except Exception as e:
            logger.error("Some error: {}".format(str(e)))
def run(clusters, file, features, outpath):
    """
    Fit a kmeans-clustering to a data set.

    The feature description is resolved with ``read_info``, the data set is
    read with ``read_and_transmute``, a ``KMeans`` is fitted and the fit is
    written to the output folder.

    Any exception raised while reading, fitting or writing is logged through
    the module-level ``logger`` and is not re-raised.

    :param clusters: forwarded to ``KMeans``
    :param file: input data set, forwarded to ``read_and_transmute``
    :param features: feature description, resolved by ``read_info``
    :param outpath: output location; it is passed through
        ``drop_suffix(outpath, "/")`` before use and also determines the
        log file via ``as_logfile``
    """
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_info, read_and_transmute
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix

    outfolder = drop_suffix(outpath, "/")
    # Derive the log file from the stripped path, as the other entry points
    # do; using the raw ``outpath`` would carry a trailing "/" into the
    # log file name.
    set_logger(as_logfile(outfolder))

    with SparkSession() as spark:
        try:
            features = read_info(features)
            data = read_and_transmute(spark, file, features)
            fit = KMeans(spark, clusters, features)
            fit = fit.fit(data, outfolder)
            fit.write(data, outfolder)
        except Exception as e:
            logger.error("Some error: {}".format(e))