def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")
    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Ejemplo n.º 2
0
def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init, process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                                    process=map_predictions)),
                    ('group_all', Stage("reduce", init=simple_init, process=reduce_proces, sort=True, combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    measure, acc = [(measure, acc) for measure, acc in result_iterator(job.wait(show=show))][0]
    return measure, acc
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Ejemplo n.º 4
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params
    job.params["fit_params"] = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])][0]
    job.run(name="linsvm_predict", input=dataset.params["data_tag"])

    return job.wait(show=show)
Ejemplo n.º 5
0
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    # define name of a job and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    """
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will calculate number of samples per job.
    """

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be >= 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))
    ]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:
                # there is more than 100 attributes
                samples_per_job = len(x) * -25 / 900.0 + 53  # linear function

        samples[test_id] = x
        if counter == samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
Ejemplo n.º 7
0
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # fit model is loaded from ddfs
        fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # code calculates logarithms to optimize predict phase as opposed to calculation by every mapped.
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            fit_model[iv] = np.nan_to_num(
                np.log(np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))) - fit_model[
                                "prior_log"]
        del (fit_model["iv"])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Ejemplo n.º 8
0
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # initialize thetas to 0 and add intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    J = [0]  # J cost function values for every iteration
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        # job parallelizes mappers and joins them with one reducer
        job.pipeline = [
            ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
            ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration set new thetas
        job.run(name="logreg_fit_iter_%d" % (i + 1), input=dataset.params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for k, v in result_iterator(fitmodel_url):
            if k == "J":  #
                J.append(v)  # save value of J cost function
            else:
                thetas = v  # save new thetas
        if np.abs(J[-2] - J[-1]) < alpha:  # check for convergence
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 9
0
def predict(input, loglikelihoods, ys, splitter=" ", map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name="naive_bayes_predict")
    job.run(
        input=input,
        map_reader=map_reader,
        map=predict_map,
        params=Params(loglikelihoods=loglikelihoods, ys=ys, splitter=splitter),
        clean=False,
    )
    return job.wait()
Ejemplo n.º 10
0
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
Ejemplo n.º 11
0
class DiscoJob():


    def __init__(self,config,map,reduce):
        import config_util

        self.config = config_util.config
        #if the user doesn't specify output, print to stdout
        if not config.get('output_uri') and not config.get('print_to_stdout'):
            config['print_to_stdout'] = True

        for item in config:
            self.config[item] = config[item]

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params()
        for key in self.config:
            self.params.__dict__[key] = self.config[key]

    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     required_modules= ['mongodb_io',
                                        'mongodb_input',
                                        'config_util',
                                        'mongo_util',
                                        'mongodb_output'])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     reduce_output_stream = mongodb_output_stream,
                     required_modules= ['mongodb_io',
                                        'mongodb_input',
                                        'config_util',
                                        'mongo_util',
                                        'mongodb_output'])

            if self.config.get("job_wait",False):
                self.job.wait(show=True)
Ejemplo n.º 12
0
def measure(test_data,
            predictions,
            measure="ca",
            save_results=True,
            show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=test_data.params["input_chain"],
                           init=simple_init,
                           process=map_test_data))]

    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           init=simple_init,
                           input_chain=[task_input_stream, chain_reader],
                           process=map_predictions)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_proces,
                           sort=True,
                           combine=True))]

    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    measure, acc = [
        (measure, acc) for measure, acc in result_iterator(job.wait(show=show))
    ][0]
    return measure, acc
Ejemplo n.º 13
0
def fit(dataset, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    fitmodel_url = job.wait(show=show)
    return {"linreg_fitmodel": fitmodel_url}  # return results url
def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5,
        class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None, save_results=True,
        show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes != None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
        separate_max = separate_max
        if trees_per_chunk > 1 and bootstrap == False:
            raise Exception("Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap.")
        if trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0 and accuracy < 0 or type(
                bootstrap) != bool:
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init,
                        process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 15
0
def measure(input, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=input.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = input.params  # job parameters (dataset object)

    job.run(name="Distribution", input=input.params["data_tag"])
    return job.wait(show=show)  # return results url
Ejemplo n.º 16
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]
    job.params = dataset.params
    job.params["thetas"] = [v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])][0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Ejemplo n.º 17
0
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception(
            "Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"linsvm_fitmodel": fitmodel_url}  # return results url
def _fit_predict(fit_data, samples, tau, save_results, show):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [
        ("split", Stage("map", input_chain=fit_data.params["input_chain"], init=simple_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)),
    ]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
Ejemplo n.º 19
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = dataset.params
    job.params["thetas"] = [
        v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])
    ][0]

    job.run(name="linreg_predict", input=dataset.params["data_tag"])
    return job.wait(show=show)
Ejemplo n.º 20
0
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job parallelizes execution of mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["thetas"] = [
        v for k, v in result_iterator(fitmodel_url["logreg_fitmodel"])
        if k == "thetas"
    ][0]  # thetas are loaded from ddfs

    job.run(name="logreg_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Ejemplo n.º 21
0
def fit(dataset, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           combine=True))]

    job.params = dataset.params
    job.run(name="linreg_fit", input=dataset.params["data_tag"])

    fitmodel_url = job.wait(show=show)
    return {"linreg_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 22
0
    def start(self):
        """Starts the entire process of querying twitter and classification.

        This method is responsible for running MapReduce for feature
        extraction.
        """
        def range_reader(stream, size, url):
           page_num = stream.getvalue()
           # Map readers should return a list of values, so page_num is
           # explicitly converted to an integer and then wrapped into a
           # list. By doing this each mapper instance will get exactly
           # one page number
           # If we don't do this, the mapper API just reads the numbers
           # character by character and we end up fetching the same 10
           # pages: digits 0, 9 all through since each character of a number
           # should be one of these 10 digits.
           return [int(page_num)]

        job = Job()

        inputs = [('raw://%d' % (i)) for i in range(1, self.num_pages)]

        job.run(input=inputs, map=mapper, reduce=reducer,
                map_reader=range_reader, params=Params(
                    query=self.query,
                    trained_vectorizer=self.vectorizer
                    ),
                required_modules=[
                    ('vectorizer', os.path.join(datasettings.PROJECT_ROOT,
                        'analyzer',
                        'vectorizer.py'),),
                    ('models', os.path.join(datasettings.PROJECT_ROOT,
                        'webui', 'fatninja',
                        'models.py'),),
                        ])

        self.feature_vector, self.row_num_to_tweet_id_map = \
            self.vectorizer.build_feature_matrix(job)

        self.classify()
Ejemplo n.º 23
0
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    if dataset.params["y_map"] == []:
        raise Exception("Linear proximal SVM requires a target label mapping parameter.")
    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params
    job.params["nu"] = nu
    job.run(name="linearsvm_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"linsvm_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 24
0
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which aggregates intermediate results and returns a model.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers, sorts intermediate pairs and joins them with one reducer
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           sort=True,
                           combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    # define name of a job and input data urls
    job.run(name="naivebayes_fit", input=dataset.params["data_tag"])
    fitmodel_url = job.wait(show=show)
    return {"naivebayes_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 25
0
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    job.params = dataset.params
    job.params["coeff"] = coeff
    for k, v in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_weighted_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Ejemplo n.º 26
0
def _fit_predict(fit_data, samples, tau, save_results, show):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    job = Job(worker=Worker(save_results=save_results))

    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fit_data.params["input_chain"],
                           init=simple_init,
                           process=map_fit)),
                    ('group_all',
                     Stage("reduce",
                           init=simple_init,
                           process=reduce_fit,
                           sort=True,
                           combine=True))]

    job.params = fit_data.params
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=fit_data.params["data_tag"])
    return job.wait(show=show)
def predict(dataset,
            fitmodel_url,
            voting=False,
            save_results=True,
            show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=simple_init,
               process=map_predict_voting if voting else map_predict_dist))
    ]

    job.params = dataset.params
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        print "Warning: There is no decision trees in forest"
        return []

    job.run(name="distributed_random_forest_predict",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
Ejemplo n.º 28
0
class DiscoJob():

    DEFAULT_CONFIG = {
        "job_output_key" : "_id",
        "job_output_value" : "value",
        "input_uri" : "mongodb://localhost/test.in",
        "output_uri" : "mongodb://localhost/test.out",
        "print_to_stdout": False,
        "job_wait":True,
        "split_size" : 8,
        "split_key" : {"_id" : 1},
        "create_input_splits" : True,
        "use_shards" : False,
        "use_chunks" : True,
        "slave_ok" : False,
        "limit" : 0,
        "skip" : 0,
        "input_key" : None,
        "sort" : None,
        "timeout" : False,
        "fields" : None,
        "query" : {}
    }

    def __init__(self,config,map,reduce):
        self.config = DiscoJob.DEFAULT_CONFIG.copy()
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)

    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     required_modules= ['mongodisco.mongodb_io',
                                        'mongodisco.mongodb_input',
                                        'mongodisco.mongo_util',
                                        'mongodisco.mongodb_output'])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input = do_split(self.config),
                     map = self.map,
                     reduce = self.reduce,
                     params = self.params,
                     map_input_stream = mongodb_input_stream,
                     reduce_output_stream = mongodb_output_stream,
                     required_modules= ['mongodisco.mongodb_io',
                                        'mongodisco.mongodb_input',
                                        'mongodisco.mongo_util',
                                        'mongodisco.mongodb_output'])

            if self.config.get("job_wait",False):
                self.job.wait(show=True)
Ejemplo n.º 29
0
def fit_predict(training_data,
                fitting_data,
                tau=1,
                samples_per_job=0,
                save_results=True,
                show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco
    """
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will calculate number of samples per job.
    """

    try:
        tau = float(tau)
        if tau <= 0:
            raise Exception("Parameter tau should be >= 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=fitting_data.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau**2)  # calculate tau once
    counter = 0

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:
            # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:
                # there is more than 100 attributes
                samples_per_job = len(x) * -25 / 900. + 53  # linear function

        samples[test_id] = x
        if counter == samples_per_job:
            results.append(
                _fit_predict(training_data, samples, tau, save_results, show))
            counter = 0
            samples = {}
        counter += 1

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(
            _fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
def fit(
    dataset,
    trees_per_chunk=3,
    max_tree_nodes=50,
    min_samples_leaf=10,
    min_samples_split=5,
    class_majority=1,
    measure="info_gain",
    k="sqrt",
    accuracy=1,
    random_state=None,
    separate_max=True,
    save_results=True,
    show=False,
):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes != None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        separate_max = separate_max
        accuracy = int(accuracy)

        if (
            trees_per_chunk <= 0
            or min_samples_leaf <= 0
            or min_samples_split <= 0
            or class_majority <= 0
            or accuracy < 0
        ):
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, combine=True)),
    ]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["accuracy"] = accuracy
    job.params["k"] = k
    job.params["seed"] = random_state
    job.params["separate_max"] = separate_max

    job.run(
        name="distributed_weighted_forest_fit",
        input=dataset.params["data_tag"],
        required_files=[path + "decision_tree.py", path + "measures.py", path + "k_medoids.py"],
    )

    fitmodel_url = job.wait(show=show)
    return {"dwf_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 31
0
    

    rest = [(x[:-3],here+x) for x in src_files]

    for x in rest:
        print "import ",x[0]
    print "Making Job"        
    print input_files
    worker = Worker(required_modules=rest,
                    master='http://disco1:8989',
                    input=input_files, 
                    map=mapper,  
                    map_init=mapper_init,
		    sort=False,
		    reduce=disco.worker.classic.func.nop_reduce)
    job = Job(name="GreenSpace",worker=worker)
    
    print "Running Job"
    job.run()
    try:
	    print "Job run"
	    r = job.wait(show=True)
	    print "Done Job"
    finally:
	    httpd.shutdown()

    for status, reason in result_iterator(r):
        print status, reason


Ejemplo n.º 32
0
def estimate(input, ys, splitter=" ", map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name="naive_bayes_estimate")

    job.run(
        input=input,
        map_reader=map_reader,
        map=estimate_map,
        combiner=estimate_combiner,
        reduce=estimate_reduce,
        params=Params(ys=ys, splitter=splitter),
        clean=False,
    )
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}

    # the number of times the classes have been observed.  For
    # example,  if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}

    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == "":
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value

    # counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math

    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] + value[2]) - math.log(value[1] + value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
Ejemplo n.º 33
0
class DiscoJob():

    DEFAULT_CONFIG = {
        "job_output_key": "_id",
        "job_output_value": "value",
        "input_uri": "mongodb://localhost/test.in",
        "output_uri": "mongodb://localhost/test.out",
        "print_to_stdout": False,
        "job_wait": True,
        "split_size": 8,
        "split_key": {
            "_id": 1
        },
        "create_input_splits": True,
        "use_shards": False,
        "use_chunks": True,
        "slave_ok": False,
        "limit": 0,
        "skip": 0,
        "input_key": None,
        "sort": None,
        "timeout": False,
        "fields": None,
        "query": {}
    }

    def __init__(self, config, map, reduce):
        self.config = DiscoJob.DEFAULT_CONFIG.copy()
        self.config.update(config)

        self.map = map
        self.reduce = reduce
        self.job = Job()
        self.params = Params(**self.config)

    def run(self):

        if self.config['print_to_stdout']:

            self.job.run(input=do_split(self.config),
                         map=self.map,
                         reduce=self.reduce,
                         params=self.params,
                         map_input_stream=mongodb_input_stream,
                         required_modules=[
                             'mongodisco.mongodb_io',
                             'mongodisco.mongodb_input',
                             'mongodisco.mongo_util',
                             'mongodisco.mongodb_output'
                         ])
            for key, value in result_iterator(self.job.wait(show=True)):
                print key, value

        else:
            self.job.run(input=do_split(self.config),
                         map=self.map,
                         reduce=self.reduce,
                         params=self.params,
                         map_input_stream=mongodb_input_stream,
                         reduce_output_stream=mongodb_output_stream,
                         required_modules=[
                             'mongodisco.mongodb_io',
                             'mongodisco.mongodb_input',
                             'mongodisco.mongo_util',
                             'mongodisco.mongodb_output'
                         ])

            if self.config.get("job_wait", False):
                self.job.wait(show=True)
Ejemplo n.º 34
0
def fit(dataset,
        trees_per_chunk=1,
        bootstrap=True,
        max_tree_nodes=50,
        min_samples_leaf=10,
        min_samples_split=5,
        class_majority=1,
        separate_max=True,
        measure="info_gain",
        accuracy=1,
        random_state=None,
        save_results=True,
        show=False):
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll
    path = "/".join(
        discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(
            max_tree_nodes) if max_tree_nodes != None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
        separate_max = separate_max
        if trees_per_chunk > 1 and bootstrap == False:
            raise Exception(
                "Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap."
            )
        if trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0 and accuracy < 0 or type(
                bootstrap) != bool:
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split",
         Stage("map",
               input_chain=dataset.params["input_chain"],
               init=map_init,
               process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all',
         Stage("reduce", init=simple_init, process=reduce_fit, combine=True))
    ]

    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
Ejemplo n.º 35
0
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}

    # the number of times the classes have been observed.  For
    # example,  if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}

    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value


#counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] +
                                         value[2]) - math.log(value[1] +
                                                              value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
Ejemplo n.º 36
0
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # fit model is loaded from ddfs
        fit_model = dict(
            (k, v)
            for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            print "There is only one class in training data."
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count(
            "d") > 0:  # if there are discrete features in the model
        # code calculates logarithms to optimize predict phase as opposed to calculation by every mapped.
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            dist = [
                fit_model.pop((y, ) + iv, 0) for y in fit_model["y_labels"]
            ]
            fit_model[iv] = np.nan_to_num(
                np.log(
                    np.true_divide(
                        np.array(dist) + m * fit_model["prior"],
                        np.sum(dist) + m))) - fit_model["prior_log"]
        del (fit_model["iv"])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [("split",
                     Stage("map",
                           input_chain=dataset.params["input_chain"],
                           init=simple_init,
                           process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
Ejemplo n.º 37
0
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of fit model results on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    if dataset.params["y_map"] == []:
        raise Exception(
            "Logistic regression requires a target label mapping parameter.")
    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception(
                "Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # initialize thetas to 0 and add intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    J = [0]  # J cost function values for every iteration
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))
        # job parallelizes mappers and joins them with one reducer
        job.pipeline = [("split",
                         Stage("map",
                               input_chain=dataset.params["input_chain"],
                               init=simple_init,
                               process=map_fit)),
                        ('group_all',
                         Stage("reduce",
                               init=simple_init,
                               process=reduce_fit,
                               combine=True))]

        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration set new thetas
        job.run(name="logreg_fit_iter_%d" % (i + 1),
                input=dataset.params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for k, v in result_iterator(fitmodel_url):
            if k == "J":  #
                J.append(v)  # save value of J cost function
            else:
                thetas = v  # save new thetas
        if np.abs(J[-2] - J[-1]) < alpha:  # check for convergence
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url