def ex3_discomll(replication=2):
    """Build the ex3 training and test datasets.

    Each dataset lists its chunk tag ``replication`` times and is declared
    with ``X_indices=[0, 1]`` and ``y_index=2``.

    Returns:
        A ``(train_data, test_data)`` tuple of ``dataset.Data`` objects.
    """

    def _chunked(tag):
        # One tag repeated ``replication`` times.
        return dataset.Data(
            data_tag=[tag] * replication,
            data_type="chunk",
            X_indices=[0, 1],
            y_index=2,
        )

    train_data = _chunked("test:ex3")
    test_data = _chunked("test:ex3_test")
    return train_data, test_data
def regression_data_discomll():
    """Build the regression training and test datasets.

    Both datasets are declared with ``id_index=0``, ``X_indices=[0]``,
    ``X_meta=["c"]`` and ``y_index=1``.

    Returns:
        A ``(train, test)`` tuple of ``dataset.Data`` objects.
    """

    def _regression_chunks(first_tag, second_tag):
        # Fresh argument lists on every call, as in the two original calls.
        return dataset.Data(
            data_tag=[first_tag, second_tag],
            data_type="chunk",
            id_index=0,
            X_indices=[0],
            X_meta=["c"],
            y_index=1,
        )

    train = _regression_chunks("test:regression_data1",
                               "test:regression_data2")
    test = _regression_chunks("test:regression_data_test1",
                              "test:regression_data_test2")
    return train, test
def ex4_discomll(replication=2):
    """Build the ex4 dataset with its chunk tag repeated ``replication`` times.

    Returns:
        A single ``dataset.Data`` object (there is no separate test set).
    """
    tags = ["test:ex4"] * replication
    # y_map lists the two target labels exactly as they are written in the data.
    labels = ["0.0000000e+00", "1.0000000e+00"]
    return dataset.Data(
        data_tag=tags,
        data_type="chunk",
        X_indices=xrange(2),
        y_index=2,
        y_map=labels,
    )
def iris_discomll(replication=2):
    """Build the iris training and test datasets.

    Both datasets use the first four columns as ``X_indices`` (all marked
    ``"c"`` in ``X_meta``), column 4 as ``y_index`` and ``","`` as delimiter.
    Each chunk tag is repeated ``replication`` times.

    Returns:
        A ``(train_data, test_data)`` tuple of ``dataset.Data`` objects.
    """
    feature_count = 4
    built = []
    for tag in ("test:iris", "test:iris_test"):
        built.append(
            dataset.Data(
                data_tag=[tag] * replication,
                data_type="chunk",
                X_indices=xrange(0, feature_count),
                X_meta=["c"] * feature_count,
                y_index=4,
                delimiter=",",
            )
        )
    train_data, test_data = built
    return train_data, test_data
def breastcancer_cont_discomll(replication=2):
    """Build the continuous breast-cancer training and test datasets.

    Both datasets declare columns 0-8 as ``X_indices`` (all ``"c"`` in
    ``X_meta``), column 9 as ``y_index`` and the labels ``"benign"`` /
    ``"malign"`` as ``y_map``.

    Returns:
        A ``(train_data, test_data)`` tuple of ``dataset.Data`` objects.
    """

    def _options(tag):
        # Build new lists per dataset so nothing is shared between the two.
        return {
            "data_tag": [tag] * replication,
            "data_type": "chunk",
            "X_indices": xrange(0, 9),
            "X_meta": ["c"] * 9,
            "y_index": 9,
            "delimiter": ",",
            "y_map": ["benign", "malign"],
        }

    train_data = dataset.Data(**_options("test:breast_cancer_cont"))
    test_data = dataset.Data(**_options("test:breast_cancer_cont_test"))
    return train_data, test_data
def _parse_x_indices(spec):
    """Convert a feature-index string into a sequence of integers.

    ``"a-b"`` is read as ``range(a, b)`` (``b`` itself is not included);
    any other value is read as a comma-separated list of integers.

    Raises:
        Exception: if an ``a-b`` bound is not a non-negative integer.
        ValueError: if a comma-separated item is not an integer.
    """
    compact = spec.replace(" ", "")
    bounds = compact.split("-")
    if len(bounds) == 2:
        low, high = bounds
        if not low.isdigit() or not high.isdigit():
            raise Exception(
                "Feature indices should be integers. Example: 1-10")
        return range(int(low), int(high))
    return [int(item) for item in compact.split(",") if item != ""]


def file_url(input_dict):
    """Create a discomll dataset from the URLs and options in ``input_dict``.

    Keys read: ``"url"`` (newline-separated URLs), ``"range"``,
    ``"X_indices"``, ``"data_type"``, ``"atr_meta"`` and, when ``atr_meta`` is
    neither ``"numeric"`` nor ``"discrete"``, ``"custom"``. All remaining
    entries are forwarded unchanged to ``dataset.Data``.

    Args:
        input_dict: widget parameters; it is not modified.

    Returns:
        ``{"dataset": data}`` where ``data`` is the ``dataset.Data`` object.

    Raises:
        Exception: if any URL uses HTTPS or the feature indices are invalid.
    """
    # Work on a copy so the caller's dictionary keeps its original entries.
    params = dict(input_dict)
    use_range = params["range"] == "true"

    # Strip first, then drop blanks, so "\r"-only lines do not become
    # empty URLs.
    stripped = [line.strip() for line in params.pop("url").split("\n")]
    links = [line for line in stripped if line]

    # Check the scheme on the plain strings: in range mode the entries are
    # not wrapped in lists, so indexing them would only look at one character.
    for link in links:
        if link.split("://")[0].lower() == "https":
            raise Exception("Dataset should be accessible over HTTP.")

    # Without range expansion every URL is passed as its own one-item list.
    urls = links if use_range else [[link] for link in links]

    X_indices = _parse_x_indices(params.pop("X_indices"))
    params["data_type"] = "gzip" if params["data_type"] == "true" else ""

    if params["atr_meta"] == "numeric":
        X_meta = ["c"] * len(X_indices)
    elif params["atr_meta"] == "discrete":
        X_meta = ["d"] * len(X_indices)
    else:
        X_meta = params["custom"]

    # Imported only after validation so bad input is reported first.
    from discomll import dataset

    data = dataset.Data(
        data_tag=urls,
        X_indices=X_indices,
        X_meta=X_meta,
        generate_urls=use_range,
        **params)
    print(data.params)
    return {"dataset": data}
def breastcancer_disc_discomll(replication=2):
    """Build the discrete breast-cancer training and test datasets.

    Both datasets declare column 0 as ``id_index``, columns 1-9 as
    ``X_indices`` (all ``"d"`` in ``X_meta``), column 10 as ``y_index`` and
    the labels ``"2"`` / ``"4"`` as ``y_map``.

    Returns:
        A ``(train_data, test_data)`` tuple of ``dataset.Data`` objects.
    """

    def _base_options(tag):
        return dict(
            data_tag=[tag] * replication,
            data_type="chunk",
            X_indices=xrange(1, 10),
            X_meta=["d"] * 9,
            id_index=0,
            y_index=10,
            delimiter=",",
            y_map=["2", "4"],
        )

    # NOTE(review): only the test dataset declares missing_vals=["?"]; the
    # training dataset does not. Kept as in the original - confirm whether
    # the asymmetry is intentional.
    train_data = dataset.Data(**_base_options("test:breast_cancer_disc"))

    test_options = _base_options("test:breast_cancer_disc_test")
    test_options["missing_vals"] = ["?"]
    test_data = dataset.Data(**test_options)

    return train_data, test_data
from disco.core import result_iterator from discomll import dataset from discomll.regression import linear_regression from discomll.utils import model_view # define training dataset train = dataset.Data(data_tag=["test:ex3"], data_type="chunk", X_indices=[0, 1], y_index=2) # define test dataset test = dataset.Data(data_tag=["test:ex3_test"], data_type="chunk", X_indices=[0, 1], y_index=2) # fit model on training dataset fit_model = linear_regression.fit(train) # output model model = model_view.output_model(fit_model) print model # predict test dataset predictions = linear_regression.predict(test, fit_model) # output results for k, v in result_iterator(predictions): print k, v[0]
from discomll import dataset
from discomll.classification import linear_svm


def _sonar_dataset(first_url, last_url):
    """Describe the gzip sonar files identified by ``first_url``/``last_url``.

    Column 0 is ``id_index``, columns 1-60 are ``X_indices`` (all ``"c"``),
    column 61 is ``y_index`` with labels ``"R"`` / ``"M"``.
    """
    # NOTE(review): generate_urls=True presumably expands the two URLs into
    # the full range of files between them - confirm against dataset.Data.
    return dataset.Data(
        data_tag=[first_url, last_url],
        data_type="gzip",
        generate_urls=True,
        X_indices=range(1, 61),
        id_index=0,
        y_index=61,
        X_meta=["c"] * 60,
        y_map=["R", "M"],
        delimiter=",",
    )


train = _sonar_dataset(
    "http://ropot.ijs.si/data/sonar/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/sonar/train/xaaabj.gz",
)
test = _sonar_dataset(
    "http://ropot.ijs.si/data/sonar/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/sonar/test/xaaabj.gz",
)

# Fit a linear SVM on the training files, then predict the test files.
fit_model = linear_svm.fit(train)
predictions = linear_svm.predict(test, fit_model)
from discomll import dataset
from discomll.classification import linear_svm


def _ionosphere_layout():
    """Return fresh keyword arguments describing the ionosphere files.

    Column 0 is ``id_index``, columns 1-34 are ``X_indices`` (all ``"c"``),
    column 35 is ``y_index`` with labels ``"b"`` / ``"g"``.
    """
    return {
        "data_type": "gzip",
        "generate_urls": True,
        "id_index": 0,
        "X_indices": range(1, 35),
        "X_meta": ["c"] * 34,
        "y_index": 35,
        "delimiter": ",",
        "y_map": ["b", "g"],
    }


train_urls = [
    "http://ropot.ijs.si/data/ionosphere/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/ionosphere/train/xaaabj.gz",
]
test_urls = [
    "http://ropot.ijs.si/data/ionosphere/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/ionosphere/test/xaaabj.gz",
]

train = dataset.Data(data_tag=train_urls, **_ionosphere_layout())
test = dataset.Data(data_tag=test_urls, **_ionosphere_layout())

# Fit a linear SVM on the training files, then predict the test files.
fit_model = linear_svm.fit(train)
predictions = linear_svm.predict(test, fit_model)
from discomll import dataset
from discomll.classification import naivebayes


def _lymphography_dataset(first_url, last_url):
    """Describe the gzip lymphography files given by the two URLs.

    Column 0 is ``id_index``, column 1 is ``y_index`` and columns 2-19 are
    ``X_indices``; ``X_meta`` marks each of those 18 columns as ``"d"`` or
    ``"c"``.
    """
    feature_meta = [
        "d", "d", "d", "d", "d", "d", "d", "d",
        "c", "c",
        "d", "d", "d", "d", "d", "d", "d",
        "c",
    ]
    return dataset.Data(
        data_tag=[first_url, last_url],
        data_type="gzip",
        generate_urls=True,
        X_indices=range(2, 20),
        id_index=0,
        y_index=1,
        X_meta=feature_meta,
        delimiter=",",
    )


train = _lymphography_dataset(
    "http://ropot.ijs.si/data/lymphography/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/lymphography/train/xaaabj.gz",
)
test = _lymphography_dataset(
    "http://ropot.ijs.si/data/lymphography/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/lymphography/test/xaaabj.gz",
)

# Fit naive Bayes on the training files and predict the test files.
fit_model = naivebayes.fit(train)
predictions = naivebayes.predict(test, fit_model)
print(predictions)
from discomll import dataset
from discomll.classification import naivebayes


def _segmentation_layout():
    """Return fresh keyword arguments describing the segmentation files.

    Column 0 is ``id_index``, column 1 is ``y_index`` and columns 2-20 are
    ``X_indices`` (all ``"c"`` in ``X_meta``).
    """
    return dict(
        data_type="gzip",
        generate_urls=True,
        X_indices=range(2, 21),
        id_index=0,
        y_index=1,
        X_meta=["c"] * 19,
        delimiter=",",
    )


train = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/segmentation/train/xaaaaa.gz",
        "http://ropot.ijs.si/data/segmentation/train/xaaabj.gz",
    ],
    **_segmentation_layout())

test = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/segmentation/test/xaaaaa.gz",
        "http://ropot.ijs.si/data/segmentation/test/xaaabj.gz",
    ],
    **_segmentation_layout())

# Fit naive Bayes on the training files and predict the test files.
fit_model = naivebayes.fit(train)
predictions = naivebayes.predict(test, fit_model)
print(predictions)
from disco.core import result_iterator
from discomll import dataset
from discomll.ensemble import distributed_weighted_forest_rand
from discomll.utils import accuracy

# A single remote file, passed as a one-item list inside data_tag.
# X_meta is given as a URL here rather than as an in-line list.
source_url = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
train = dataset.Data(
    data_tag=[[source_url]],
    id_index=0,
    X_indices=xrange(1, 10),
    X_meta="http://ropot.ijs.si/data/datasets_meta/breastcancer_meta.csv",
    y_index=10,
    delimiter=",",
)

# Fit the forest with the example's hyper-parameters.
fit_model = distributed_weighted_forest_rand.fit(
    train,
    trees_per_chunk=3,
    max_tree_nodes=50,
    min_samples_leaf=5,
    min_samples_split=10,
    class_majority=1,
    measure="info_gain",
    num_medoids=10,
    accuracy=1,
    separate_max=True,
    random_state=None,
    save_results=True,
)

# Predict the same dataset the model was fitted on.
predictions = distributed_weighted_forest_rand.predict(train, fit_model)
from disco.core import result_iterator
from discomll import dataset
from discomll.classification import naivebayes
from discomll.utils import model_view
from discomll.utils import accuracy


def _breast_cancer_disc(tag):
    """Describe one discrete breast-cancer chunk stored under ``tag``."""
    return dataset.Data(
        data_tag=[tag],
        data_type="chunk",
        X_indices=xrange(1, 10),
        X_meta=["d"] * 9,
        id_index=0,
        y_index=10,
        delimiter=",",
        # Label mapping: "2" is mapped to 1, "4" is mapped to -1.
        y_map=["2", "4"],
        # Symbol that marks a missing value in the data.
        missing_vals=["?"],
    )


# Training dataset.
train = _breast_cancer_disc("test:breast_cancer_disc")

# Test dataset.
test = _breast_cancer_disc("test:breast_cancer_disc_test")
def _expand_url_range(first_url, last_url):
    """List every URL from ``first_url`` to ``last_url`` inclusive.

    The file name is split at its first ``"a"``: everything before it is a
    fixed prefix, everything from it up to the single ``"."`` is a counter
    made of lower-case letters, and the rest is the extension. Counters are
    enumerated in alphabetical order from the first URL's to the last URL's.

    Raises:
        Exception: if the two URLs do not follow that pattern, or the last
            counter cannot be reached from the first.
    """
    import itertools

    first_parts = first_url.split("/")
    last_parts = last_url.split("/")
    first_name = first_parts[-1]

    start_index = first_name.find("a")
    if start_index == -1:
        raise Exception("URLs does not have the same pattern.")

    url_base = "/".join(first_parts[:-1]) + "/" + first_name[:start_index]
    start = first_name[start_index:]
    finish = last_parts[-1][start_index:]
    if start.count(".") != 1 or finish.count(".") != 1:
        raise Exception("URLs does not have the same pattern.")

    start, file_extension = start.split(".")
    finish, _ = finish.split(".")
    file_extension = "." + file_extension

    alphabet = "abcdefghijklmnopqrstuvwxyz"
    # An unreachable finish would otherwise enumerate all 26**n counters.
    reachable = (
        len(start) == len(finish)
        and all(ch in alphabet for ch in start + finish)
        and start <= finish
    )
    if not reachable:
        raise Exception("URLs does not have the same pattern.")

    counters = ("".join(letters)
                for letters in itertools.product(alphabet, repeat=len(start)))
    urls = []
    # Skip ahead to the first URL's counter instead of always starting at
    # "aaa...".
    for counter in itertools.dropwhile(lambda c: c != start, counters):
        urls.append(url_base + counter + file_extension)
        if counter == finish:
            break
    return urls


def _parse_feature_indices(spec):
    """Convert a feature-index string into a sequence of integers.

    ``"a-b"`` is read as ``range(a, b)`` (``b`` itself is not included);
    any other value is read as a comma-separated list of integers.
    """
    compact = spec.replace(" ", "")
    bounds = compact.split("-")
    if len(bounds) == 2:
        low, high = bounds
        if not low.isdigit() or not high.isdigit():
            raise Exception(
                "Feature indices should be integers. Example: 1-10")
        return range(int(low), int(high))
    return [int(item) for item in compact.split(",") if item != ""]


def file_url(input_dict):
    """Create a discomll dataset from the URLs and options in ``input_dict``.

    Keys read: ``"url"`` (newline-separated URLs), ``"range"``,
    ``"X_indices"`` and ``"data_type"``. When ``range`` is ``"true"`` exactly
    two URLs (first and last) must be given and the URLs in between are
    generated. Every entry except ``"X_indices"`` is forwarded to
    ``dataset.Data``.

    Args:
        input_dict: widget parameters; it is not modified.

    Returns:
        ``{"dataset": data}`` where ``data`` is the ``dataset.Data`` object.

    Raises:
        Exception: for HTTPS URLs, a malformed URL range, or invalid feature
            indices.
    """
    # Work on a copy so the caller's dictionary keeps its original entries.
    params = dict(input_dict)

    # Strip first, then drop blanks, so "\r"-only lines (CRLF input) are not
    # counted as URLs.
    stripped = [line.strip() for line in params["url"].split("\n")]
    urls = [line for line in stripped if line]

    for url in urls:
        if url.split("/")[0].lower().startswith("https"):
            raise Exception(
                "URLs should be accessible via HTTP and not HTTPS.")

    if params["range"] == "true":
        if len(urls) != 2:
            raise Exception(
                "A first and last URL should be specified if the Range parameter is checked."
            )
        urls = _expand_url_range(urls[0], urls[1])

    X_indices = _parse_feature_indices(params.pop("X_indices"))
    params["data_type"] = "gzip" if params["data_type"] == "true" else ""

    # Imported only after validation so bad input is reported first.
    from discomll import dataset

    data = dataset.Data(data_tag=urls, X_indices=X_indices, **params)
    return {"dataset": data}
from discomll import dataset
from discomll.clustering import kmeans


def _linear_dataset(first_url, last_url):
    """Describe the gzip "linear" files identified by the two URLs.

    Column 0 is ``id_index`` and columns 1-21 are ``X_indices``; no
    ``y_index`` is declared.
    """
    return dataset.Data(
        data_tag=[first_url, last_url],
        data_type="gzip",
        generate_urls=True,
        X_indices=range(1, 22),
        id_index=0,
        delimiter=",",
    )


train = _linear_dataset(
    "http://ropot.ijs.si/data/linear/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/linear/train/xaaabj.gz",
)
test = _linear_dataset(
    "http://ropot.ijs.si/data/linear/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/linear/test/xaaabj.gz",
)

# Fit k-means with the example's settings, then predict the test files.
fit_model = kmeans.fit(train, n_clusters=5, max_iterations=10, random_state=0)
predictions = kmeans.predict(test, fit_model)
print(predictions)
from disco.core import result_iterator from discomll import dataset from discomll.ensemble import forest_distributed_decision_trees from discomll.utils import model_view train = dataset.Data(data_tag=[["http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"]], X_indices=xrange(0, 4), X_meta="http://ropot.ijs.si/data/datasets_meta/iris_meta.csv", y_index=4, delimiter=",") fit_model = forest_distributed_decision_trees.fit(train, trees_per_chunk=1, bootstrap=False, max_tree_nodes=50, min_samples_leaf=2, min_samples_split=1, class_majority=1, separate_max=True, measure="info_gain", accuracy=1, random_state=None, save_results=True) print model_view.output_model(fit_model) # predict training dataset predictions = forest_distributed_decision_trees.predict(train, fit_model) # output results for k, v in result_iterator(predictions): print k, v[0]
from discomll import dataset
from discomll.classification import logistic_regression
from discomll.utils import model_view

# Training data: chunk "test:ex4", X_indices 0-1, y_index=2.
# y_map lists the two target labels exactly as they are written in the data.
target_labels = ["0.0000000e+00", "1.0000000e+00"]
train = dataset.Data(
    data_tag=["test:ex4"],
    data_type="chunk",
    X_indices=xrange(0, 2),
    y_index=2,
    y_map=target_labels,
)

# Fit logistic regression on the training data and print the model.
fit_model = logistic_regression.fit(train)
model = model_view.output_model(fit_model)
print(model)
from disco.core import result_iterator
from discomll import dataset
from discomll.clustering import kmeans
from discomll.utils import model_view


def _breast_cancer_cont(tag):
    """Describe one continuous breast-cancer chunk stored under ``tag``."""
    return dataset.Data(
        data_tag=[tag],
        data_type="chunk",      # data source: chunk data on ddfs
        X_indices=xrange(0, 9), # attribute indices
        y_index=9,              # class index
        delimiter=",",
    )


# Training and test datasets share the same layout.
train = _breast_cancer_cont("test:breast_cancer_cont")
test = _breast_cancer_cont("test:breast_cancer_cont_test")

# Fit k-means on the training data and print the model.
fit_model = kmeans.fit(train, n_clusters=2, max_iterations=5, random_state=0)
model = model_view.output_model(fit_model)
print(model)

# Predict the test data.
predictions = kmeans.predict(test, fit_model)
from disco.core import result_iterator from discomll import dataset from discomll.regression import locally_weighted_linear_regression training_data = dataset.Data( data_tag=["test:regression_data1", "test:regression_data2"], data_type="chunk", id_index=0, X_indices=[0], y_index=1) fitting_data = dataset.Data( data_tag=["test:regression_data_test1", "test:regression_data_test2"], data_type="chunk", id_index=0, X_indices=[0], y_index=1) # fit fitting data to training data results = locally_weighted_linear_regression.fit_predict(training_data, fitting_data, tau=10) # output results for k, v in result_iterator(results): print k, v
from discomll import dataset
from discomll.regression import locally_weighted_linear_regression

# Training data: two URLs with generate_urls=True.
# Column 0 is id_index, columns 1-13 are X_indices, column 14 is y_index.
train_urls = [
    "http://ropot.ijs.si/data/fraction/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/fraction/train/xaaabj.gz",
]
train = dataset.Data(
    data_tag=train_urls,
    data_type="gzip",
    generate_urls=True,
    X_indices=range(1, 14),
    id_index=0,
    y_index=14,
    delimiter=",",
)

# Test data: one explicit file (a one-item list inside data_tag), so
# generate_urls is not passed here.
test = dataset.Data(
    data_tag=[["http://ropot.ijs.si/data/fraction/test/xaaaaa.gz"]],
    data_type="gzip",
    X_indices=range(1, 14),
    id_index=0,
    y_index=14,
    delimiter=",",
)

# Fit and predict in one call, using the example's settings.
predictions = locally_weighted_linear_regression.fit_predict(
    train,
    test,
    tau=1,
    samples_per_job=0,
    save_results=True,
)
print(predictions)