def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """
    Function starts the "distributed_weighted_forest_rand_predict" job that
    makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the "dwfr_fitmodel" key)
    coeff - numerical value, stored in job parameters under the "coeff" key
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs or an empty list if the forest of the model is empty

    Raises
    ------
    Exception if the fit model is incorrect or coeff is negative or not numerical
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # validate the arguments before any job object is built
    if "dwfr_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        # NOTE: zero is accepted here although the message says "greater than 0"
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    # directory of the ensemble core modules that are shipped with the job
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    # job consists of a single map stage
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["coeff"] = coeff
    # every key, value pair of the fit model becomes a job parameter
    for k, v in result_iterator(fitmodel_url["dwfr_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # call form of print is valid in Python 2 and Python 3
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_rand_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which
    aggregates intermediate results and returns a model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results on ddfs under the "naivebayes_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # mappers run in parallel, intermediate pairs are sorted and joined with one reducer
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    job.params = params  # job parameters (dataset object)

    # name of the job and input data urls
    job.run(name="naivebayes_fit", input=params["data_tag"])
    return {"naivebayes_fitmodel": job.wait(show=show)}
def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    """
    Function runs two jobs: "ma_parse_testdata" parses the test data and
    "ma_measure_accuracy" joins the parsed test data with predictions and
    reduces them with reduce_ca or reduce_mse.

    Parameters
    ----------
    test_data - dataset object with test data, its id_index parameter should be set
    predictions - urls of predictions, an empty list short-circuits the function
    measure - "ca" or "mse" (presumably classification accuracy and mean squared error - verify against reducers)
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    First (measure, value) pair emitted by the second job or ("No predictions", None) if predictions is an empty list

    Raises
    ------
    Exception if measure is not supported or id_index is not defined
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # first job: parse test data (the job object is created only once)
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init, process=map_test_data))]
    job.params = test_data.params
    job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = job.wait(show=show)

    # second job: reducer depends on the selected measure
    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                                    process=map_predictions)),
                    ('group_all', Stage("reduce", init=simple_init, process=reduce_proces, sort=True, combine=True))]
    job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # distinct names are used so the measure argument is not rebound by the comprehension
    pairs = [(name, value) for name, value in result_iterator(job.wait(show=show))]
    name, value = pairs[0]
    return name, value
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    """
    Function starts the "distributed_random_forest_predict" job that makes
    predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the "drf_fitmodel" key)
    voting - if True mappers use map_predict_voting, otherwise map_predict_dist
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs or an empty list if the forest of the model is empty

    Raises
    ------
    Exception if the fit model is incorrect
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # directory of the ensemble core modules that are shipped with the job
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))
    # job consists of a single map stage
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict_voting if voting else map_predict_dist))]

    job.params = dataset.params  # job parameters (dataset object)
    # every key, value pair of the fit model becomes a job parameter
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # call form of print is valid in Python 2 and Python 3
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linsvm_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    params = dataset.params

    job = Job(worker=Worker(save_results=save_results))
    # only mappers are used, they are executed in parallel
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_predict)
    job.pipeline = [("split", map_stage)]

    job.params = params
    # first value of the fit model results is passed to the mappers
    model_values = [v for _, v in result_iterator(fitmodel_url["linsvm_fitmodel"])]
    job.params["fit_params"] = model_values[0]

    job.run(name="linsvm_predict", input=params["data_tag"])
    return job.wait(show=show)
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    Function reads samples of fitting_data with the "lwlr_read_data" job and
    runs the "lwlr_fit_predict" job on training_data for every batch of samples.

    Parameters
    ----------
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the
    query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will
    calculate number of samples per job.
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    List with one tag url, results of every batch are merged under that tag

    Raises
    ------
    Exception if tau is not a positive number or fitting_data has no id_index set
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            # zero is rejected, so the message states a strict bound
            raise Exception("Parameter tau should be > 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        ("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init, process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:  # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:  # there is more than 100 attributes
                # linear function; the value is truncated to an integer and kept at 1 or more,
                # a fractional or negative value would never trigger the batch below
                samples_per_job = max(1, int(len(x) * -25 / 900.0 + 53))

        samples[test_id] = x
        # batch size is checked after the insert, so every batch has the same size
        if len(samples) >= samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs or an empty list if the model has less than two class labels

    Raises
    ------
    Exception if m is not numerical or the fit model is incorrect
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    try:
        m = float(m)
    except ValueError:
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" in fitmodel_url:
        # fit model is loaded from ddfs
        fit_model = dict((k, v) for k, v in result_iterator(fitmodel_url["naivebayes_fitmodel"]))
        if len(fit_model["y_labels"]) < 2:
            # call form of print is valid in Python 2 and Python 3
            print("There is only one class in training data.")
            return []
    else:
        raise Exception("Incorrect fit model.")

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # code calculates logarithms to optimize predict phase as opposed to calculation by every mapped.
        # NOTE: seterr changes numpy error handling for the whole process and it is not restored
        np.seterr(divide='ignore')
        for iv in fit_model["iv"]:
            # per label values stored under (label,) + iv keys are replaced with one entry under the iv key
            dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
            fit_model[iv] = np.nan_to_num(
                np.log(np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m))) - fit_model[
                "prior_log"]
        del (fit_model["iv"])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model
    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results of the last iteration under the "logreg_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    params = dataset.params

    if params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")

    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
        if max_iterations < 1:
            raise Exception("Parameter max_iterations should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    # thetas start at 0, one additional element is the intercept term
    thetas = np.zeros(len(params["X_indices"]) + 1)

    cost_history = [0]  # values of J cost function, one is appended per iteration
    for iteration in range(1, max_iterations + 1):
        job = Job(worker=Worker(save_results=save_results))

        # mappers run in parallel and are joined with one reducer
        map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
        reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
        job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

        job.params = params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # thetas of the previous iteration

        job.run(name="logreg_fit_iter_%d" % iteration, input=params["data_tag"])

        fitmodel_url = job.wait(show=show)
        for key, value in result_iterator(fitmodel_url):
            if key == "J":
                cost_history.append(value)  # value of J cost function
            else:
                thetas = value  # new thetas

        # convergence: cost changed less than alpha since the previous iteration
        if np.abs(cost_history[-2] - cost_history[-1]) < alpha:
            if show:
                print("Converged at iteration %d" % iteration)
            break

    return {"logreg_fitmodel": fitmodel_url}  # urls of the last executed job
def measure(test_data, predictions, measure="ca", save_results=True, show=False):
    """
    Function runs two jobs: "ma_parse_testdata" parses the test data and
    "ma_measure_accuracy" joins the parsed test data with predictions and
    reduces them with reduce_ca or reduce_mse.

    Parameters
    ----------
    test_data - dataset object with test data, its id_index parameter should be set
    predictions - urls of predictions, an empty list short-circuits the function
    measure - "ca" or "mse" (presumably classification accuracy and mean squared error - verify against reducers)
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    First (measure, value) pair emitted by the second job or ("No predictions", None) if predictions is an empty list

    Raises
    ------
    Exception if measure is not supported or id_index is not defined
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.worker.task_io import task_input_stream, chain_reader

    if measure not in ["ca", "mse"]:
        raise Exception("measure should be ca or mse.")
    if test_data.params["id_index"] == -1:
        raise Exception("ID index should be defined.")

    if predictions == []:
        return "No predictions", None

    # first job parses test data; a single job object is created for it
    parse_job = Job(worker=Worker(save_results=save_results))
    parse_job.pipeline = [("split", Stage("map", input_chain=test_data.params["input_chain"], init=simple_init,
                                          process=map_test_data))]
    parse_job.params = test_data.params
    parse_job.run(name="ma_parse_testdata", input=test_data.params["data_tag"])
    parsed_testdata = parse_job.wait(show=show)

    # reducer of the second job depends on the selected measure
    reduce_proces = reduce_ca if measure == "ca" else reduce_mse

    measure_job = Job(worker=Worker(save_results=save_results))
    measure_job.pipeline = [
        ("split", Stage("map", init=simple_init, input_chain=[task_input_stream, chain_reader],
                        process=map_predictions)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_proces, sort=True, combine=True))]
    measure_job.run(name="ma_measure_accuracy", input=parsed_testdata + predictions)

    # loop variables have their own names, the measure argument is not rebound
    pairs = [(name, value) for name, value in result_iterator(measure_job.wait(show=show))]
    name, value = pairs[0]
    return name, value
def fit(dataset, save_results=True, show=False):
    """
    Function starts the "linreg_fit" job with map_fit mappers and one reduce_fit reduce stage.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the "linreg_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params

    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]
    job.params = params  # job parameters (dataset object)

    job.run(name="linreg_fit", input=params["data_tag"])
    return {"linreg_fitmodel": job.wait(show=show)}  # results url
def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5,
        class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None, save_results=True,
        show=False):
    """
    Function validates parameters and starts the "forest_distributed_decision_trees_fit" job.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    trees_per_chunk - positive integer, it should be 1 if bootstrap is disabled
    bootstrap - boolean, mappers use map_fit_bootstrap if True and map_fit otherwise
    max_tree_nodes - integer or None
    min_samples_leaf - positive integer
    min_samples_split - positive integer
    class_majority - positive number
    separate_max - passed to the job unchanged
    measure - "info_gain" or "mdl"
    accuracy - non-negative integer
    random_state - passed to the job under the "seed" key
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the "fddt_fitmodel" key

    Raises
    ------
    Exception if a parameter is not numerical, is out of range or measure is not supported
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    # directory of the ensemble core modules that are shipped with the job
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        max_tree_nodes = int(max_tree_nodes) if max_tree_nodes is not None else max_tree_nodes
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)

        if not isinstance(bootstrap, bool):
            raise Exception("Parameter bootstrap should be True or False.")
        if trees_per_chunk > 1 and not bootstrap:
            raise Exception("Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap.")
        # every condition is joined with "or": one invalid parameter is enough to reject the call
        if (trees_per_chunk <= 0 or min_samples_leaf <= 0 or class_majority <= 0 or min_samples_split <= 0 or
                accuracy < 0):
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    job = Job(worker=Worker(save_results=save_results))
    # mappers run in parallel and are joined with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init,
                        process=map_fit_bootstrap if bootstrap else map_fit)),
        ('group_all', Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params['seed'] = random_state

    job.run(name="forest_distributed_decision_trees_fit", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
def measure(input, save_results=True, show=False):
    """
    Function starts the "Distribution" job with map_fit mappers and one reduce_fit reduce stage.

    Parameters
    ----------
    input - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of job results
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = input.params

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    job.params = params  # job parameters (dataset object)

    job.run(name="Distribution", input=params["data_tag"])
    results = job.wait(show=show)
    return results  # results url
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts the "linreg_predict" job that makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the "linreg_fitmodel" key)
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    params = dataset.params

    job = Job(worker=Worker(save_results=save_results))
    # job consists of a single map stage
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_predict)
    job.pipeline = [("split", map_stage)]
    job.params = params  # job parameters (dataset object)

    # first value of the fit model results is used as thetas
    model_values = [v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])]
    job.params["thetas"] = model_values[0]

    job.run(name="linreg_predict", input=params["data_tag"])
    return job.wait(show=show)
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results on ddfs under the "linsvm_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params

    if params["y_map"] == []:
        raise Exception("Linear proximal SVM requires a target label mapping parameter.")

    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    job = Job(worker=Worker(save_results=save_results))

    # mappers run in parallel and are joined with one reducer
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    job.params = params  # job parameters (dataset object)
    job.params["nu"] = nu

    job.run(name="linearsvm_fit", input=params["data_tag"])
    return {"linsvm_fitmodel": job.wait(show=show)}  # results url
def _fit_predict(fit_data, samples, tau, save_results, show):
    """
    Function starts the "lwlr_fit_predict" job on fit_data for one batch of samples.

    Parameters
    ----------
    fit_data - dataset object with input urls and other parameters
    samples - batch of samples, stored in job parameters under the "samples" key
    tau - value stored in job parameters under the "tau" key
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of job results
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = fit_data.params

    # mappers run in parallel, intermediate pairs are sorted and joined with one reducer
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    job.params = params  # job parameters (dataset object)
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=params["data_tag"])
    return job.wait(show=show)
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts the "linreg_predict" job that makes predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the "linreg_fitmodel" key)
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    if "linreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    params = dataset.params
    predict_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_predict)

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", predict_stage)]  # mappers only
    job.params = params  # job parameters (dataset object)

    # thetas are the first value of the fit model results
    fit_values = [v for _, v in result_iterator(fitmodel_url["linreg_fitmodel"])]
    job.params["thetas"] = fit_values[0]

    job.run(name="linreg_predict", input=params["data_tag"])
    predictions = job.wait(show=show)
    return predictions
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator

    params = dataset.params

    if params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")
    if "logreg_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    job = Job(worker=Worker(save_results=save_results))

    # only mappers are used, they are executed in parallel
    map_stage = Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_predict)
    job.pipeline = [("split", map_stage)]

    job.params = params  # job parameters (dataset object)

    # thetas are loaded from ddfs, the first value stored under the "thetas" key is used
    stored_thetas = [v for k, v in result_iterator(fitmodel_url["logreg_fitmodel"]) if k == "thetas"]
    job.params["thetas"] = stored_thetas[0]

    job.run(name="logreg_predict", input=params["data_tag"])
    return job.wait(show=show)
def fit(dataset, save_results=True, show=False):
    """
    Function starts the "linreg_fit" job with map_fit mappers and one reduce_fit reduce stage.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the "linreg_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params
    worker = Worker(save_results=save_results)

    job = Job(worker=worker)
    # mappers run in parallel and are joined with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, combine=True)),
    ]
    job.params = params  # job parameters (dataset object)

    job.run(name="linreg_fit", input=params["data_tag"])
    model_urls = job.wait(show=show)
    return {"linreg_fitmodel": model_urls}  # results url
def fit(dataset, nu=0.1, save_results=True, show=False):
    """
    Function starts a job for calculation of model parameters

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    nu - parameter to adjust the classifier
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results on ddfs under the "linsvm_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params

    if params["y_map"] == []:
        raise Exception("Linear proximal SVM requires a target label mapping parameter.")

    try:
        nu = float(nu)
        if nu <= 0:
            raise Exception("Parameter nu should be greater than 0")
    except ValueError:
        raise Exception("Parameter should be numerical.")

    worker = Worker(save_results=save_results)
    job = Job(worker=worker)

    # mappers run in parallel and are joined with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, combine=True)),
    ]

    job.params = params  # job parameters (dataset object)
    job.params["nu"] = nu

    job.run(name="linearsvm_fit", input=params["data_tag"])
    model_urls = job.wait(show=show)
    return {"linsvm_fitmodel": model_urls}  # results url
def fit(dataset, save_results=True, show=False):
    """
    Function builds a model for Naive Bayes. It executes multiple map functions and one reduce function which
    aggregates intermediate results and returns a model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results on ddfs under the "naivebayes_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = dataset.params
    worker = Worker(save_results=save_results)  # save of results to ddfs is set on the worker

    job = Job(worker=worker)
    # mappers run in parallel, intermediate pairs are sorted and joined with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)),
    ]
    job.params = params  # job parameters (dataset object)

    # name of the job and input data urls
    job.run(name="naivebayes_fit", input=params["data_tag"])
    model_urls = job.wait(show=show)
    return {"naivebayes_fitmodel": model_urls}  # results url
def predict(dataset, fitmodel_url, coeff=0.5, save_results=True, show=False):
    """
    Function starts the "distributed_weighted_forest_predict" job that makes
    predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the "dwf_fitmodel" key)
    coeff - numerical value, stored in job parameters under the "coeff" key
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs or an empty list if the forest of the model is empty

    Raises
    ------
    Exception if the fit model is incorrect or coeff is negative or not numerical
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # validate the arguments before any job object is built
    if "dwf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    try:
        coeff = float(coeff)
        # NOTE: zero is accepted here although the message says "greater than 0"
        if coeff < 0:
            raise Exception("Parameter coeff should be greater than 0.")
    except ValueError:
        raise Exception("Parameter coeff should be numerical.")

    # directory of the ensemble core modules that are shipped with the job
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    job = Job(worker=Worker(save_results=save_results))
    # job consists of a single map stage
    job.pipeline = [("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init,
                                    process=map_predict))]

    job.params = dataset.params  # job parameters (dataset object)
    job.params["coeff"] = coeff
    # every key, value pair of the fit model becomes a job parameter
    for k, v in result_iterator(fitmodel_url["dwf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # call form of print is valid in Python 2 and Python 3
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_weighted_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
def _fit_predict(fit_data, samples, tau, save_results, show):
    """
    Function starts the "lwlr_fit_predict" job on fit_data for one batch of samples.

    Parameters
    ----------
    fit_data - dataset object with input urls and other parameters
    samples - batch of samples, stored in job parameters under the "samples" key
    tau - value stored in job parameters under the "tau" key
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of job results
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job

    params = fit_data.params
    worker = Worker(save_results=save_results)

    job = Job(worker=worker)
    # mappers run in parallel, intermediate pairs are sorted and joined with one reducer
    job.pipeline = [
        ("split", Stage("map", input_chain=params["input_chain"], init=simple_init, process=map_fit)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, sort=True, combine=True)),
    ]

    job.params = params  # job parameters (dataset object)
    job.params["tau"] = tau
    job.params["samples"] = samples

    job.run(name="lwlr_fit_predict", input=params["data_tag"])
    batch_urls = job.wait(show=show)
    return batch_urls
def predict(dataset, fitmodel_url, voting=False, save_results=True, show=False):
    """
    Function starts the "distributed_random_forest_predict" job that makes
    predictions to input data with a given model.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the "drf_fitmodel" key)
    voting - if True mappers use map_predict_voting, otherwise map_predict_dist
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls with predictions on ddfs or an empty list if the forest of the model is empty

    Raises
    ------
    Exception if the fit model is incorrect
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import discomll

    # directory of the ensemble core modules that are shipped with the job
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    if "drf_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    # mapper is selected with the voting flag
    process = map_predict_voting if voting else map_predict_dist

    job = Job(worker=Worker(save_results=save_results))
    # job consists of a single map stage
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=process))]

    job.params = dataset.params  # job parameters (dataset object)
    # every key, value pair of the fit model becomes a job parameter
    for k, v in result_iterator(fitmodel_url["drf_fitmodel"]):
        job.params[k] = v

    if len(job.params["forest"]) == 0:
        # call form of print is valid in Python 2 and Python 3
        print("Warning: There is no decision trees in forest")
        return []

    job.run(name="distributed_random_forest_predict", input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py"])

    return job.wait(show=show)
def fit_predict(training_data, fitting_data, tau=1, samples_per_job=0, save_results=True, show=False):
    """
    Function reads samples of fitting_data with the "lwlr_read_data" job and
    runs the "lwlr_fit_predict" job on training_data for every batch of samples.

    Parameters
    ----------
    training_data - training samples
    fitting_data - dataset to be fitted to training data.
    tau - controls how quickly the weight of a training sample falls off with distance of its x(i) from the
    query point x.
    samples_per_job - define a number of samples that will be processed in single mapreduce job. If 0, algorithm will
    calculate number of samples per job.
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    List with one tag url, results of every batch are merged under that tag

    Raises
    ------
    Exception if tau is not a positive number or fitting_data has no id_index set
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    from disco.core import Disco

    try:
        tau = float(tau)
        if tau <= 0:
            # zero is rejected, so the message states a strict bound
            raise Exception("Parameter tau should be > 0.")
    except ValueError:
        raise Exception("Parameter tau should be numerical.")

    if fitting_data.params["id_index"] == -1:
        raise Exception("Predict data should have id_index set.")

    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [("split", Stage("map", input_chain=fitting_data.params["input_chain"], init=simple_init,
                                    process=map_predict))]
    job.params = fitting_data.params
    job.run(name="lwlr_read_data", input=fitting_data.params["data_tag"])

    samples = {}
    results = []
    tau = float(2 * tau ** 2)  # calculate tau once

    for test_id, x in result_iterator(job.wait(show=show)):
        if samples_per_job == 0:  # calculate number of samples per job
            if len(x) <= 100:  # if there is less than 100 attributes
                samples_per_job = 100  # 100 samples is max per on job
            else:  # there is more than 100 attributes
                # linear function; the value is truncated to an integer and kept at 1 or more,
                # a fractional or negative value would never trigger the batch below
                samples_per_job = max(1, int(len(x) * -25 / 900.0 + 53))

        samples[test_id] = x
        # batch size is checked after the insert, so every batch has the same size
        if len(samples) >= samples_per_job:
            results.append(_fit_predict(training_data, samples, tau, save_results, show))
            samples = {}

    if len(samples) > 0:  # if there is some samples left in the the dictionary
        results.append(_fit_predict(training_data, samples, tau, save_results, show))

    # merge results of every iteration into a single tag
    ddfs = Disco().ddfs
    ddfs.tag(job.name, [[list(ddfs.blobs(tag))[0][0]] for tag in results])

    return ["tag://" + job.name]
def fit(dataset, trees_per_chunk=3, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5, class_majority=1,
        measure="info_gain", k="sqrt", accuracy=1, random_state=None, separate_max=True, save_results=True,
        show=False):
    """
    Function validates parameters and starts the "distributed_weighted_forest_fit" job.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    trees_per_chunk - positive integer
    max_tree_nodes - integer or None
    min_samples_leaf - positive integer
    min_samples_split - positive integer
    class_majority - positive number
    measure - "info_gain" or "mdl"
    k - passed to the job unchanged
    accuracy - non-negative integer
    random_state - passed to the job under the "seed" key
    separate_max - passed to the job unchanged
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary with urls of fit model results under the "dwf_fitmodel" key
    """
    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    # directory of the ensemble core modules that are shipped with the job
    package_dir = discomll.__file__.split("/")[:-1]
    path = "/".join(package_dir + ["ensemble", "core", ""])

    try:
        trees_per_chunk = int(trees_per_chunk)
        if max_tree_nodes != None:
            max_tree_nodes = int(max_tree_nodes)
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)

        all_valid = (trees_per_chunk > 0 and min_samples_leaf > 0 and min_samples_split > 0 and
                     class_majority > 0 and accuracy >= 0)
        if not all_valid:
            raise Exception("Parameters should be greater than 0.")
    except ValueError:
        raise Exception("Parameters should be numerical.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    params = dataset.params

    job = Job(worker=Worker(save_results=save_results))
    # mappers run in parallel and are joined with one reducer
    map_stage = Stage("map", input_chain=params["input_chain"], init=map_init, process=map_fit)
    reduce_stage = Stage("reduce", init=simple_init, process=reduce_fit, combine=True)
    job.pipeline = [("split", map_stage), ("group_all", reduce_stage)]

    job.params = params  # job parameters (dataset object)
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["accuracy"] = accuracy
    job.params["k"] = k
    job.params["seed"] = random_state
    job.params["separate_max"] = separate_max

    shipped_files = [path + "decision_tree.py", path + "measures.py", path + "k_medoids.py"]
    job.run(name="distributed_weighted_forest_fit", input=params["data_tag"], required_files=shipped_files)

    return {"dwf_fitmodel": job.wait(show=show)}  # results url
def fit(dataset, trees_per_chunk=1, bootstrap=True, max_tree_nodes=50, min_samples_leaf=10, min_samples_split=5,
        class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None,
        save_results=True, show=False):
    """
    Function starts the "forest_distributed_decision_trees_fit" job. It executes multiple map
    functions (map_fit_bootstrap when bootstrap is enabled, map_fit otherwise) and one reduce
    function (reduce_fit) and returns the urls of the job results.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    trees_per_chunk - integer greater than 0; has to be 1 when bootstrap is False
    bootstrap - boolean, selects map_fit_bootstrap (True) or map_fit (False) as the map function
    max_tree_nodes - integer, or None which is passed to the job unchanged
    min_samples_leaf - integer greater than 0
    min_samples_split - integer greater than 0
    class_majority - float greater than 0
    separate_max - passed to the job unchanged
    measure - "info_gain" or "mdl"
    accuracy - integer greater than or equal to 0
    random_state - passed to the job under the key "seed"
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"fddt_fitmodel": urls of fit model results}

    Raises
    ------
    Exception - a numerical parameter cannot be converted, a parameter is out of range,
                bootstrap is not a boolean or measure is not one of the supported values
    """
    # Parameters are validated before the Disco runtime is imported, so that
    # invalid arguments are reported without touching the cluster libraries.
    try:
        trees_per_chunk = int(trees_per_chunk)
        if max_tree_nodes is not None:
            max_tree_nodes = int(max_tree_nodes)
        min_samples_leaf = int(min_samples_leaf)
        min_samples_split = int(min_samples_split)
        class_majority = float(class_majority)
        accuracy = int(accuracy)
    except (ValueError, TypeError):
        # TypeError is raised by int()/float() for values such as None or a list.
        raise Exception("Parameters should be numerical.")

    # bootstrap is checked on its own so the error names the real problem
    if not isinstance(bootstrap, bool):
        raise Exception("Parameter bootstrap should be boolean.")

    if trees_per_chunk > 1 and not bootstrap:
        raise Exception("Parameter trees_per_chunk (or Trees per subset) should be 1 to disable bootstrap.")

    # Every condition is joined with "or": a single out-of-range value is enough
    # to reject the call ("and" binds tighter than "or" and would hide some of them).
    if (
        trees_per_chunk <= 0
        or min_samples_leaf <= 0
        or min_samples_split <= 0
        or class_majority <= 0
        or accuracy < 0
    ):
        raise Exception("Parameters should be greater than 0.")

    if measure not in ["info_gain", "mdl"]:
        raise Exception("measure should be set to info_gain or mdl.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job
    import discomll

    # directory <discomll package>/ensemble/core/ that holds the files shipped with the job
    path = "/".join(discomll.__file__.split("/")[:-1] + ["ensemble", "core", ""])

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes mappers and joins them with one reducer
    map_process = map_fit_bootstrap if bootstrap else map_fit
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=map_init, process=map_process)),
        ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, combine=True)),
    ]

    # NOTE(review): job.params is the same dictionary object as dataset.params, so
    # the keys set below presumably end up in the dataset object too - confirm
    # whether callers rely on that before changing it to a copy.
    job.params = dataset.params
    job.params["trees_per_chunk"] = trees_per_chunk
    job.params["max_tree_nodes"] = max_tree_nodes
    job.params["min_samples_leaf"] = min_samples_leaf
    job.params["min_samples_split"] = min_samples_split
    job.params["class_majority"] = class_majority
    job.params["measure"] = measure
    job.params["bootstrap"] = bootstrap
    job.params["accuracy"] = accuracy
    job.params["separate_max"] = separate_max
    job.params["seed"] = random_state

    # define name of a job, input data urls and the source files required by the job
    job.run(name="forest_distributed_decision_trees_fit",
            input=dataset.params["data_tag"],
            required_files=[path + "decision_tree.py", path + "measures.py"])

    fitmodel_url = job.wait(show=show)
    return {"fddt_fitmodel": fitmodel_url}  # return results url
def predict(dataset, fitmodel_url, m=1, save_results=True, show=False):
    """
    Function starts a job that makes predictions to input data with a given model

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    fitmodel_url - model created in fit phase (dictionary with the key "naivebayes_fitmodel")
    m - m estimate is used with discrete features
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Urls of predictions on ddfs, or an empty list if the fit model contains fewer than two labels

    Raises
    ------
    Exception - m is not numerical or fitmodel_url does not contain "naivebayes_fitmodel"
    """
    # Arguments are validated before the Disco runtime is imported, so that
    # invalid arguments are reported without touching the cluster libraries.
    try:
        m = float(m)
    except (ValueError, TypeError):
        # TypeError is raised by float() for values such as None or a list.
        raise Exception("Parameter m should be numerical.")

    if "naivebayes_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    # fit model is loaded from ddfs
    fit_model = dict(result_iterator(fitmodel_url["naivebayes_fitmodel"]))
    if len(fit_model["y_labels"]) < 2:
        # parenthesised single argument prints the same text under Python 2 and 3
        print("There is only one class in training data.")
        return []

    if dataset.params["X_meta"].count("d") > 0:  # if there are discrete features in the model
        # Logarithms are calculated once here instead of by every mapper.
        # Division warnings (log of 0) are silenced only for this computation;
        # np.errstate restores the previous numpy error settings on exit.
        with np.errstate(divide="ignore"):
            for iv in fit_model["iv"]:
                # per-label entries for this key are removed from the model (0 if missing)
                dist = [fit_model.pop((y,) + iv, 0) for y in fit_model["y_labels"]]
                smoothed = np.true_divide(np.array(dist) + m * fit_model["prior"], np.sum(dist) + m)
                fit_model[iv] = np.nan_to_num(np.log(smoothed)) - fit_model["prior_log"]
        del fit_model["iv"]

    # define a job and set save of results to ddfs
    job = Job(worker=Worker(save_results=save_results))

    # job parallelizes execution of mappers
    job.pipeline = [
        ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_predict))]

    # NOTE(review): job.params is the same dictionary object as dataset.params, so
    # "fit_model" presumably ends up in the dataset object too - confirm whether
    # callers rely on that before changing it to a copy.
    job.params = dataset.params  # job parameters (dataset object)
    job.params["fit_model"] = fit_model

    # define name of a job and input data urls
    job.run(name="naivebayes_predict", input=dataset.params["data_tag"])
    results = job.wait(show=show)
    return results
def fit(dataset, alpha=1e-8, max_iterations=10, save_results=True, show=False):
    """
    Function starts a job for calculation of theta parameters

    One job ("logreg_fit_iter_<n>") is run per iteration. Iterations stop when the absolute
    difference between the last two values of the J cost function drops below alpha, or after
    max_iterations.

    Parameters
    ----------
    dataset - dataset object with input urls and other parameters
    alpha - convergence value
    max_iterations - define maximum number of iterations
    save_results - save results to ddfs
    show - show info about job execution

    Returns
    -------
    Dictionary {"logreg_fitmodel": urls of fit model results of the last executed iteration}

    Raises
    ------
    Exception - dataset has an empty "y_map", a parameter is not numerical
                or max_iterations is lower than 1
    """
    # Arguments are validated before the Disco runtime is imported, so that
    # invalid arguments are reported without touching the cluster libraries.
    if dataset.params["y_map"] == []:
        raise Exception("Logistic regression requires a target label mapping parameter.")

    try:
        alpha = float(alpha)
        max_iterations = int(max_iterations)
    except (ValueError, TypeError):
        # TypeError is raised by int()/float() for values such as None or a list.
        raise Exception("Parameters should be numerical.")

    if max_iterations < 1:
        raise Exception("Parameter max_iterations should be greater than 0.")

    from disco.worker.pipeline.worker import Worker, Stage
    from disco.core import Job, result_iterator
    import numpy as np

    # initialize thetas to 0 and add intercept term
    thetas = np.zeros(len(dataset.params["X_indices"]) + 1)

    # J cost function values for every iteration; starts with 0, so the first
    # iteration is compared against 0
    costs = [0]
    for i in range(max_iterations):
        job = Job(worker=Worker(save_results=save_results))

        # job parallelizes mappers and joins them with one reducer
        job.pipeline = [
            ("split", Stage("map", input_chain=dataset.params["input_chain"], init=simple_init, process=map_fit)),
            ("group_all", Stage("reduce", init=simple_init, process=reduce_fit, combine=True))]

        # NOTE(review): job.params is the same dictionary object as dataset.params,
        # so "thetas" presumably ends up in the dataset object too - confirm
        # whether callers rely on that before changing it to a copy.
        job.params = dataset.params  # job parameters (dataset object)
        job.params["thetas"] = thetas  # every iteration set new thetas

        job.run(name="logreg_fit_iter_%d" % (i + 1), input=dataset.params["data_tag"])
        fitmodel_url = job.wait(show=show)

        for key, value in result_iterator(fitmodel_url):
            if key == "J":
                costs.append(value)  # save value of J cost function
            else:
                thetas = value  # save new thetas

        # check for convergence
        # NOTE(review): assumes every iteration's result contains a "J" entry - confirm in reduce_fit
        if np.abs(costs[-2] - costs[-1]) < alpha:
            if show:
                print("Converged at iteration %d" % (i + 1))
            break

    return {"logreg_fitmodel": fitmodel_url}  # return results url