def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    Runs a single-stage Disco pipeline job named "kmeans_predict" over the
    dataset, handing the fitted centers to the workers through the job
    parameters.

    Parameters:
        dataset: object whose ``params`` mapping supplies the job parameters;
            its "input_chain" and "data_tag" entries are read here.
        fitmodel_url: mapping with the key "kmeans_fitmodel" pointing at the
            fitted centers (the shape returned by ``fit``).
        save_results: forwarded to the Disco ``Worker``.
        show: forwarded to ``job.wait``.

    Returns:
        The value returned by ``job.wait`` for the prediction job.

    Raises:
        Exception: if ``fitmodel_url`` has no "kmeans_fitmodel" entry.
    """
    # Validate first so a wrong model is reported before any Disco machinery
    # is imported or a job is created.
    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    from disco.core import result_iterator
    from disco.job import Job
    from disco.worker.pipeline.worker import Stage, Worker

    job = Job(worker=Worker(save_results=save_results))

    # Copy-then-update instead of ``a.items() + b.items()``: dict views cannot
    # be concatenated on Python 3. Precedence is unchanged - entries of
    # mean_point_center override same-named dataset parameters.
    job.params = dict(dataset.params)
    job.params.update(mean_point_center)
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]

    job.pipeline = [
        (
            "split",
            Stage(
                "kmeans_predict",
                input_chain=dataset.params["input_chain"],
                init=simple_init,
                process=predict_map,
            ),
        )
    ]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")
    return job.wait(show=show)
def predict(dataset, fitmodel_url, save_results=True, show=False):
    """
    Predict the closest clusters for the datapoints in input.

    Runs a single-stage Disco pipeline job named "kmeans_predict" over the
    dataset, handing the fitted centers to the workers through the job
    parameters.

    Parameters:
        dataset: object whose ``params`` mapping supplies the job parameters;
            its "input_chain" and "data_tag" entries are read here.
        fitmodel_url: mapping with the key "kmeans_fitmodel" pointing at the
            fitted centers (the shape returned by ``fit``).
        save_results: forwarded to the Disco ``Worker``.
        show: forwarded to ``job.wait``.

    Returns:
        The value returned by ``job.wait`` for the prediction job.

    Raises:
        Exception: if ``fitmodel_url`` has no "kmeans_fitmodel" entry.
    """
    # Validate first so a wrong model is reported before any Disco machinery
    # is imported or a job is created.
    if "kmeans_fitmodel" not in fitmodel_url:
        raise Exception("Incorrect fit model.")

    from disco.core import result_iterator
    from disco.job import Job
    from disco.worker.pipeline.worker import Stage, Worker

    job = Job(worker=Worker(save_results=save_results))

    # Copy-then-update instead of ``a.items() + b.items()``: dict views cannot
    # be concatenated on Python 3. Precedence is unchanged - entries of
    # mean_point_center override same-named dataset parameters.
    job.params = dict(dataset.params)
    job.params.update(mean_point_center)
    job.params["centers"] = [
        (i, c) for i, c in result_iterator(fitmodel_url["kmeans_fitmodel"])
    ]

    job.pipeline = [
        (
            "split",
            Stage(
                "kmeans_predict",
                input_chain=dataset.params["input_chain"],
                init=simple_init,
                process=predict_map,
            ),
        )
    ]

    job.run(input=dataset.params["data_tag"], name="kmeans_predict")
    return job.wait(show=show)
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Fit k-means cluster centers with a sequence of Disco pipeline jobs.

    One "kmeans_init" job produces the starting centers; after that exactly
    ``max_iterations`` jobs ("kmeans_iter_1", "kmeans_iter_2", ...) are run,
    each receiving the centers produced by the previous job. There is no
    convergence check - all iterations are always executed.

    Parameters:
        dataset: object whose ``params`` mapping supplies the job parameters;
            its "input_chain" and "data_tag" entries are read here.
        n_clusters: number of clusters; must convert to an int >= 2. Passed
            to the workers as the "k" parameter.
        max_iterations: number of refinement jobs; must convert to an
            int >= 1.
        random_state: passed to the initialisation job as the "seed"
            parameter.
        save_results: forwarded to every Disco ``Worker``.
        show: forwarded to every ``job.wait`` call.

    Returns:
        dict: ``{"kmeans_fitmodel": <result of the last iteration's job.wait>}``.

    Raises:
        Exception: if a parameter is not numerical or is out of range.
    """
    # Only the conversions live in the try block. TypeError is caught as well
    # so that values such as None get the same message as non-numeric strings.
    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
    except (TypeError, ValueError):
        raise Exception("Parameters should be numerical.")

    if n_clusters < 2:
        raise Exception("Parameter n_clusters should be greater than 1.")
    if max_iterations < 1:
        raise Exception("Parameter max_iterations should be greater than 0.")

    # Imported after validation so bad arguments are reported immediately.
    from disco.core import result_iterator
    from disco.job import Job
    from disco.worker.pipeline.worker import Stage, Worker

    input_chain = dataset.params["input_chain"]
    data_tag = dataset.params["data_tag"]

    # --- initialisation job: its results are the starting centers ---
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        (
            "split",
            Stage("kmeans_init_map", input_chain=input_chain, init=map_init, process=random_init_map),
        ),
        (
            "group_label",
            Stage("kmeans_init_reduce", process=estimate_reduce, init=simple_init, combine=True),
        ),
    ]
    # Copy-then-update instead of ``a.items() + b.items()``: dict views cannot
    # be concatenated on Python 3. mean_point_center still takes precedence.
    job.params = dict(dataset.params)
    job.params.update(mean_point_center)
    job.params["seed"] = random_state
    job.params["k"] = n_clusters
    job.run(input=data_tag, name="kmeans_init")
    fitmodel_url = job.wait(show=show)

    # --- refinement jobs ---
    for iteration in range(1, max_iterations + 1):
        # Centers are read at the top of the loop from the previous job's
        # results, so the final job's output is not fetched just to be
        # thrown away.
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params)
        job.params.update(mean_point_center)
        job.params["k"] = n_clusters
        job.params["centers"] = centers
        job.pipeline = [
            (
                "split",
                Stage(
                    "kmeans_map_iter_%s" % (iteration,),
                    input_chain=input_chain,
                    process=estimate_map,
                    init=simple_init,
                ),
            ),
            (
                "group_label",
                Stage(
                    "kmeans_reduce_iter_%s" % (iteration,),
                    process=estimate_reduce,
                    init=simple_init,
                    combine=True,
                ),
            ),
        ]
        job.run(input=data_tag, name="kmeans_iter_%d" % (iteration,))
        fitmodel_url = job.wait(show=show)

    return {"kmeans_fitmodel": fitmodel_url}  # return results url
def fit(dataset, n_clusters=5, max_iterations=10, random_state=None, save_results=True, show=False):
    """
    Fit k-means cluster centers with a sequence of Disco pipeline jobs.

    One "kmeans_init" job produces the starting centers; after that exactly
    ``max_iterations`` jobs ("kmeans_iter_1", "kmeans_iter_2", ...) are run,
    each receiving the centers produced by the previous job. There is no
    convergence check - all iterations are always executed.

    Parameters:
        dataset: object whose ``params`` mapping supplies the job parameters;
            its "input_chain" and "data_tag" entries are read here.
        n_clusters: number of clusters; must convert to an int >= 2. Passed
            to the workers as the "k" parameter.
        max_iterations: number of refinement jobs; must convert to an
            int >= 1.
        random_state: passed to the initialisation job as the "seed"
            parameter.
        save_results: forwarded to every Disco ``Worker``.
        show: forwarded to every ``job.wait`` call.

    Returns:
        dict: ``{"kmeans_fitmodel": <result of the last iteration's job.wait>}``.

    Raises:
        Exception: if a parameter is not numerical or is out of range.
    """
    # Only the conversions live in the try block. TypeError is caught as well
    # so that values such as None get the same message as non-numeric strings.
    try:
        n_clusters = int(n_clusters)
        max_iterations = int(max_iterations)
    except (TypeError, ValueError):
        raise Exception("Parameters should be numerical.")

    if n_clusters < 2:
        raise Exception("Parameter n_clusters should be greater than 1.")
    if max_iterations < 1:
        raise Exception("Parameter max_iterations should be greater than 0.")

    # Imported after validation so bad arguments are reported immediately.
    from disco.core import result_iterator
    from disco.job import Job
    from disco.worker.pipeline.worker import Stage, Worker

    input_chain = dataset.params["input_chain"]
    data_tag = dataset.params["data_tag"]

    # --- initialisation job: its results are the starting centers ---
    job = Job(worker=Worker(save_results=save_results))
    job.pipeline = [
        (
            "split",
            Stage("kmeans_init_map", input_chain=input_chain, init=map_init, process=random_init_map),
        ),
        (
            "group_label",
            Stage("kmeans_init_reduce", process=estimate_reduce, init=simple_init, combine=True),
        ),
    ]
    # Copy-then-update instead of ``a.items() + b.items()``: dict views cannot
    # be concatenated on Python 3. mean_point_center still takes precedence.
    job.params = dict(dataset.params)
    job.params.update(mean_point_center)
    job.params["seed"] = random_state
    job.params["k"] = n_clusters
    job.run(input=data_tag, name="kmeans_init")
    fitmodel_url = job.wait(show=show)

    # --- refinement jobs ---
    for iteration in range(1, max_iterations + 1):
        # Centers are read at the top of the loop from the previous job's
        # results, so the final job's output is not fetched just to be
        # thrown away.
        centers = [(i, c) for i, c in result_iterator(fitmodel_url)]

        job = Job(worker=Worker(save_results=save_results))
        job.params = dict(dataset.params)
        job.params.update(mean_point_center)
        job.params["k"] = n_clusters
        job.params["centers"] = centers
        job.pipeline = [
            (
                "split",
                Stage(
                    "kmeans_map_iter_%s" % (iteration,),
                    input_chain=input_chain,
                    process=estimate_map,
                    init=simple_init,
                ),
            ),
            (
                "group_label",
                Stage(
                    "kmeans_reduce_iter_%s" % (iteration,),
                    process=estimate_reduce,
                    init=simple_init,
                    combine=True,
                ),
            ),
        ]
        job.run(input=data_tag, name="kmeans_iter_%d" % (iteration,))
        fitmodel_url = job.wait(show=show)

    return {"kmeans_fitmodel": fitmodel_url}  # return results url