Example #1
def run(file, meta, features, response, family, outpath, predict):
    """
    Fit a gradient boosting machine.
    """

    import logging
    import pathlib
    from pybda.util.string import drop_suffix
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_and_transmute, read_column_info
    # GBM is assumed to be importable from pybda.gbm; adjust if it lives elsewhere.
    from pybda.gbm import GBM

    logger = logging.getLogger(__name__)

    outpath = drop_suffix(outpath, "/")
    set_logger(as_logfile(outpath))

    with SparkSession() as spark:
        try:
            meta, features = read_column_info(meta, features)
            data = read_and_transmute(spark, file, features, response)
            fl = GBM(spark, response, features, family)
            fl = fl.fit(data)
            fl.write(outpath)
            if pathlib.Path(predict).exists():
                pre_data = read_and_transmute(spark,
                                              predict,
                                              features,
                                              drop=False)
                pre_data = fl.predict(pre_data)
                pre_data.write(outpath)

        except Exception as e:
            logger.error("Some error: %s", str(e))
Example #2
def run(factors, file, features, outpath):
    """
    Fit a factor analysis to a data set.
    """

    import logging
    from pybda.util.string import drop_suffix
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.io.io import read_info, read_and_transmute
    from pybda.io.as_filename import as_logfile
    # FactorAnalysis is assumed to be importable from pybda.factor_analysis.
    from pybda.factor_analysis import FactorAnalysis

    logger = logging.getLogger(__name__)

    outpath = drop_suffix(outpath, "/")
    set_logger(as_logfile(outpath))

    with SparkSession() as spark:
        try:
            features = read_info(features)
            data = read_and_transmute(spark,
                                      file,
                                      features,
                                      assemble_features=False)
            fl = FactorAnalysis(spark, factors, features)
            trans = fl.fit_transform(data)
            trans.write(outpath)
        except Exception as e:
            logger.error("Some error: {}".format(str(e)))
Example #3
def run(inpath, outpath, pval):
    import logging
    from pybda.spark_session import SparkSession
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import read_parquet, write_parquet
    from pybda.util.string import drop_suffix
    from pybda.logger import set_logger
    # Outliers is assumed to be importable from pybda.outliers.
    from pybda.outliers import Outliers

    logger = logging.getLogger(__name__)

    outpath = drop_suffix(outpath, "/")
    set_logger(as_logfile(outpath))

    with SparkSession() as spark:
        try:
            outi = Outliers(spark, pval)
            data = outi.fit_transform(read_parquet(spark, inpath))
            write_parquet(data, outpath)
        except Exception as e:
            logger.error("Some error: {}".format(str(e)))
Example #4
def run(file, output, n):
    import logging
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix
    from pybda.io.as_filename import as_logfile
    from pybda.io.io import write_tsv
    # `sample` is assumed to come from pybda.sampler; adjust if it lives elsewhere.
    from pybda.sampler import sample

    logger = logging.getLogger(__name__)

    output = drop_suffix(output, "/")
    set_logger(as_logfile(output))

    with SparkSession() as spark:
        try:
            from pybda.io.io import read
            data = read(spark, file)
            subsamp = sample(data, n)
            write_tsv(subsamp, output)
        except Exception as e:
            logger.error("Some error: {}".format(str(e)))
Example #5
def run(clusters, file, features, outpath):
    """
    Fit a kmeans-clustering to a data set.
    """

    import logging
    from pybda.io.as_filename import as_logfile
    from pybda.logger import set_logger
    from pybda.spark_session import SparkSession
    from pybda.util.string import drop_suffix
    from pybda.io.io import read_info, read_and_transmute
    # KMeans is assumed to be importable from pybda.kmeans.
    from pybda.kmeans import KMeans

    logger = logging.getLogger(__name__)

    outfolder = drop_suffix(outpath, "/")
    set_logger(as_logfile(outfolder))

    with SparkSession() as spark:
        try:
            features = read_info(features)
            data = read_and_transmute(spark, file, features)
            fit = KMeans(spark, clusters, features)
            fit = fit.fit(data, outfolder)
            fit.write(data, outfolder)
        except Exception as e:
            logger.error("Some error: {}".format(e))
Example #6
def outfiles_no_suffix(self, algorithm):
    """Return the output files of `algorithm` with their '.tsv' suffix stripped."""
    outfiles = self.__tree.outfiles(algorithm)
    outfiles = [drop_suffix(out, '.tsv') for out in outfiles]
    return outfiles
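For context, a small sketch of what the suffix stripping does; the file names are invented:

from pybda.util.string import drop_suffix

outfiles = ["results/kmeans-cluster_sizes.tsv", "results/kmeans-transformed.tsv"]
# drop_suffix is expected to strip the trailing '.tsv', leaving the bare prefixes.
outfiles = [drop_suffix(out, '.tsv') for out in outfiles]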