Exemple #1
0
def ex3_discomll(replication=2):
    """Return the ``(train_data, test_data)`` descriptors for the ex3 data.

    Both descriptors are ``dataset.Data`` objects built with
    ``data_type="chunk"``, ``X_indices=[0, 1]`` and ``y_index=2``.  The tag
    ("test:ex3" for training, "test:ex3_test" for testing) is repeated
    ``replication`` times in ``data_tag``.
    """

    def _describe(tag):
        # One descriptor whose data_tag lists `tag` `replication` times.
        return dataset.Data(
            data_tag=[tag] * replication,
            data_type="chunk",
            X_indices=[0, 1],
            y_index=2)

    train_data = _describe("test:ex3")
    test_data = _describe("test:ex3_test")
    return train_data, test_data
Exemple #2
0
def regression_data_discomll():
    """Return the ``(train, test)`` descriptors for the regression sample data.

    Each descriptor is a ``dataset.Data`` over two chunk tags, built with
    ``id_index=0``, ``X_indices=[0]``, ``X_meta=["c"]`` and ``y_index=1``.
    """

    def _chunked(tags):
        # Fresh option lists are created on every call, so the two
        # descriptors never share a list object.
        return dataset.Data(
            data_tag=tags,
            data_type="chunk",
            id_index=0,
            X_indices=[0],
            X_meta=["c"],
            y_index=1)

    train = _chunked(["test:regression_data1", "test:regression_data2"])
    test = _chunked(
        ["test:regression_data_test1", "test:regression_data_test2"])
    return train, test
Exemple #3
0
def ex4_discomll(replication=2):
    """Return the ``dataset.Data`` descriptor for the ex4 sample data.

    The "test:ex4" tag is repeated ``replication`` times in ``data_tag``.
    The descriptor uses ``X_indices`` 0 and 1, ``y_index=2`` and a ``y_map``
    listing the two label strings "0.0000000e+00" and "1.0000000e+00".
    """
    return dataset.Data(
        data_tag=["test:ex4"] * replication,
        data_type="chunk",
        X_indices=xrange(2),
        y_index=2,
        y_map=["0.0000000e+00", "1.0000000e+00"])
Exemple #4
0
def iris_discomll(replication=2):
    """Return the ``(train_data, test_data)`` descriptors for the iris data.

    Both are comma-delimited chunk descriptors with ``X_indices`` 0..3, one
    "c" entry in ``X_meta`` per index, and ``y_index=4``.  The tag
    ("test:iris" / "test:iris_test") is repeated ``replication`` times.
    """

    def _iris_chunks(tag):
        return dataset.Data(
            data_tag=[tag] * replication,
            data_type="chunk",
            X_indices=xrange(0, 4),
            X_meta=["c"] * 4,
            y_index=4,
            delimiter=",")

    train_data = _iris_chunks("test:iris")
    test_data = _iris_chunks("test:iris_test")
    return train_data, test_data
Exemple #5
0
def breastcancer_cont_discomll(replication=2):
    """Return ``(train_data, test_data)`` for the breast_cancer_cont data.

    Both are comma-delimited chunk descriptors with ``X_indices`` 0..8, nine
    "c" entries in ``X_meta``, ``y_index=9`` and
    ``y_map=["benign", "malign"]``.  The tag is repeated ``replication``
    times in ``data_tag``.
    """

    def _cancer_chunks(tag):
        return dataset.Data(
            data_tag=[tag] * replication,
            data_type="chunk",
            X_indices=xrange(0, 9),
            X_meta=["c"] * 9,
            y_index=9,
            delimiter=",",
            y_map=["benign", "malign"])

    train_data = _cancer_chunks("test:breast_cancer_cont")
    test_data = _cancer_chunks("test:breast_cancer_cont_test")
    return train_data, test_data
Exemple #6
0
def file_url(input_dict):
    """Build a discomll ``dataset.Data`` descriptor from string options.

    Keys read from ``input_dict``:

    * ``"url"`` -- newline-separated URLs.  Surrounding whitespace is
      stripped and blank lines (including whitespace-only ones) are ignored.
    * ``"range"`` -- when equal to ``"true"`` the URLs are passed as a flat
      list together with ``generate_urls=True``; otherwise every URL is
      wrapped in its own one-element list and must not use HTTPS.
    * ``"X_indices"`` -- either ``"a-b"``, expanded with ``range(a, b)`` (so
      ``b`` itself is excluded), or a comma-separated list of integers.
    * ``"data_type"`` -- ``"true"`` is rewritten to ``"gzip"``, anything else
      to ``""``.
    * ``"atr_meta"`` -- ``"numeric"`` / ``"discrete"`` produce one ``"c"`` /
      ``"d"`` entry per index; any other value uses ``input_dict["custom"]``.

    ``input_dict`` is modified in place: ``"url"`` and ``"X_indices"`` are
    removed and ``"data_type"`` is rewritten.  Every remaining entry is
    forwarded to ``dataset.Data`` as a keyword argument.

    Returns:
        ``{"dataset": data}`` where ``data`` is the created descriptor.

    Raises:
        Exception: if a non-range URL uses the HTTPS scheme, or if a bound
            of an ``"a-b"`` index specification is not a run of digits.
    """
    as_range = input_dict["range"] == "true"

    # Strip *before* filtering: a form submitted with CRLF line endings
    # leaves "\r" on blank lines, which would otherwise become empty URLs.
    stripped = [line.strip() for line in input_dict["url"].split("\n")]
    lines = [line for line in stripped if line]

    if as_range:
        urls = lines
    else:
        urls = [[line] for line in lines]
        for line in lines:
            # URI schemes are case-insensitive, so "HTTPS://" is rejected too.
            if line.split("://")[0].lower() == "https":
                raise Exception("Dataset should be accessible over HTTP.")
    del input_dict["url"]

    indices_spec = input_dict["X_indices"].replace(" ", "")
    bounds = indices_spec.split("-")
    if len(bounds) == 2:
        first, last = bounds
        if not (first.isdigit() and last.isdigit()):
            raise Exception(
                "Feature indices should be integers. Example: 1-10")
        X_indices = range(int(first), int(last))
    else:
        X_indices = [int(v) for v in indices_spec.split(",") if v != ""]
    del input_dict["X_indices"]

    input_dict["data_type"] = (
        "gzip" if input_dict["data_type"] == "true" else "")

    if input_dict["atr_meta"] == "numeric":
        X_meta = ["c"] * len(X_indices)
    elif input_dict["atr_meta"] == "discrete":
        X_meta = ["d"] * len(X_indices)
    else:
        X_meta = input_dict["custom"]

    # Imported only once the input has been validated, so malformed input is
    # reported before the library is loaded.
    from discomll import dataset

    data = dataset.Data(
        data_tag=urls,
        X_indices=X_indices,
        X_meta=X_meta,
        generate_urls=as_range,
        **input_dict)

    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(data.params)

    return {"dataset": data}
Exemple #7
0
def breastcancer_disc_discomll(replication=2):
    """Return ``(train_data, test_data)`` for the breast_cancer_disc data.

    Both are comma-delimited chunk descriptors with ``id_index=0``,
    ``X_indices`` 1..9, nine "d" entries in ``X_meta``, ``y_index=10`` and
    ``y_map=["2", "4"]``.  The tag is repeated ``replication`` times in
    ``data_tag``.  Only the test descriptor is given ``missing_vals=["?"]``.
    """

    def _common_options(tag):
        # Options shared by both descriptors; rebuilt on each call so the
        # descriptors do not share list objects.
        return dict(
            data_tag=[tag] * replication,
            data_type="chunk",
            X_indices=xrange(1, 10),
            X_meta=["d"] * 9,
            id_index=0,
            y_index=10,
            delimiter=",",
            y_map=["2", "4"])

    train_data = dataset.Data(**_common_options("test:breast_cancer_disc"))

    # NOTE(review): the training descriptor does not declare "?" as a missing
    # value while the test descriptor does -- confirm this is intentional.
    test_data = dataset.Data(
        missing_vals=["?"],
        **_common_options("test:breast_cancer_disc_test"))

    return train_data, test_data
Exemple #8
0
from disco.core import result_iterator

from discomll import dataset
from discomll.regression import linear_regression
from discomll.utils import model_view

# define training dataset
train = dataset.Data(data_tag=["test:ex3"],
                     data_type="chunk",
                     X_indices=[0, 1],
                     y_index=2)

# define test dataset
test = dataset.Data(data_tag=["test:ex3_test"],
                    data_type="chunk",
                    X_indices=[0, 1],
                    y_index=2)

# fit model on training dataset
fit_model = linear_regression.fit(train)

# output model
model = model_view.output_model(fit_model)
print model

# predict test dataset
predictions = linear_regression.predict(test, fit_model)

# output results
for k, v in result_iterator(predictions):
    print k, v[0]
Exemple #9
0
from discomll import dataset
from discomll.classification import linear_svm

# Sonar training data: gzip files addressed by two URLs with
# generate_urls=True (presumably the first and last file of a range -- confirm
# against the discomll documentation).
train = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/sonar/train/xaaaaa.gz",
        "http://ropot.ijs.si/data/sonar/train/xaaabj.gz",
    ],
    data_type="gzip",
    generate_urls=True,
    id_index=0,
    X_indices=range(1, 61),
    X_meta=["c"] * 60,
    y_index=61,
    y_map=["R", "M"],
    delimiter=",")

# Sonar test data: identical layout, read from the test/ directory.
test = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/sonar/test/xaaaaa.gz",
        "http://ropot.ijs.si/data/sonar/test/xaaabj.gz",
    ],
    data_type="gzip",
    generate_urls=True,
    id_index=0,
    X_indices=range(1, 61),
    X_meta=["c"] * 60,
    y_index=61,
    y_map=["R", "M"],
    delimiter=",")

# Fit a linear SVM on the training data and predict the test data.
fit_model = linear_svm.fit(train)
predictions = linear_svm.predict(test, fit_model)
Exemple #10
0
from discomll import dataset
from discomll.classification import linear_svm

# Ionosphere training data: gzip files addressed by two URLs with
# generate_urls=True (presumably the first and last file of a range -- confirm
# against the discomll documentation).
train = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/ionosphere/train/xaaaaa.gz",
        "http://ropot.ijs.si/data/ionosphere/train/xaaabj.gz",
    ],
    data_type="gzip",
    generate_urls=True,
    id_index=0,
    X_indices=range(1, 35),
    X_meta=["c"] * 34,
    y_index=35,
    y_map=["b", "g"],
    delimiter=",")

# Ionosphere test data: identical layout, read from the test/ directory.
test = dataset.Data(
    data_tag=[
        "http://ropot.ijs.si/data/ionosphere/test/xaaaaa.gz",
        "http://ropot.ijs.si/data/ionosphere/test/xaaabj.gz",
    ],
    data_type="gzip",
    generate_urls=True,
    id_index=0,
    X_indices=range(1, 35),
    X_meta=["c"] * 34,
    y_index=35,
    y_map=["b", "g"],
    delimiter=",")

# Fit a linear SVM on the training data and predict the test data.
fit_model = linear_svm.fit(train)
predictions = linear_svm.predict(test, fit_model)
from discomll import dataset
from discomll.classification import naivebayes

train = dataset.Data(data_tag=["http://ropot.ijs.si/data/lymphography/train/xaaaaa.gz",
                               "http://ropot.ijs.si/data/lymphography/train/xaaabj.gz"],
                     data_type="gzip",
                     generate_urls=True,
                     X_indices=range(2, 20),
                     id_index=0,
                     y_index=1,
                     X_meta=["d", "d", "d", "d", "d", "d", "d", "d", "c", "c", "d", "d", "d", "d", "d", "d", "d", "c"],
                     delimiter=",")

test = dataset.Data(data_tag=["http://ropot.ijs.si/data/lymphography/test/xaaaaa.gz",
                              "http://ropot.ijs.si/data/lymphography/test/xaaabj.gz"],
                    data_type="gzip",
                    generate_urls=True,
                    X_indices=range(2, 20),
                    id_index=0,
                    y_index=1,
                    X_meta=["d", "d", "d", "d", "d", "d", "d", "d", "c", "c", "d", "d", "d", "d", "d", "d", "d", "c"],
                    delimiter=",")

fit_model = naivebayes.fit(train)
predictions = naivebayes.predict(test, fit_model)
print predictions
from discomll import dataset
from discomll.classification import naivebayes

train = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/segmentation/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/segmentation/train/xaaabj.gz"
],
                     data_type="gzip",
                     generate_urls=True,
                     X_indices=range(2, 21),
                     id_index=0,
                     y_index=1,
                     X_meta=["c" for i in range(2, 21)],
                     delimiter=",")

test = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/segmentation/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/segmentation/test/xaaabj.gz"
],
                    data_type="gzip",
                    generate_urls=True,
                    X_indices=range(2, 21),
                    id_index=0,
                    y_index=1,
                    X_meta=["c" for i in range(2, 21)],
                    delimiter=",")

fit_model = naivebayes.fit(train)
predictions = naivebayes.predict(test, fit_model)
print predictions
Exemple #13
0
from disco.core import result_iterator

from discomll import dataset
from discomll.ensemble import distributed_weighted_forest_rand
from discomll.utils import accuracy

# Training data: a single URL wrapped in its own list; X_meta is given as a
# URL rather than as an inline list.
train = dataset.Data(
    data_tag=[[
        "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
    ]],
    id_index=0,
    X_indices=xrange(1, 10),
    X_meta="http://ropot.ijs.si/data/datasets_meta/breastcancer_meta.csv",
    y_index=10,
    delimiter=",")

# Fit the distributed weighted random forest on the training data.
fit_model = distributed_weighted_forest_rand.fit(
    train,
    trees_per_chunk=3,
    max_tree_nodes=50,
    min_samples_leaf=5,
    min_samples_split=10,
    class_majority=1,
    measure="info_gain",
    num_medoids=10,
    accuracy=1,
    separate_max=True,
    random_state=None,
    save_results=True)

# Predict on the same data the model was fitted on.
predictions = distributed_weighted_forest_rand.predict(train, fit_model)
from disco.core import result_iterator

from discomll import dataset
from discomll.classification import naivebayes
from discomll.utils import model_view
from discomll.utils import accuracy



# Training dataset.
#   y_map: mapping parameter -- "2" is mapped to 1, "4" is mapped to -1.
#   missing_vals: symbol that marks a missing value.
train = dataset.Data(
    data_tag=["test:breast_cancer_disc"],
    data_type="chunk",
    id_index=0,
    X_indices=xrange(1, 10),
    X_meta=["d"] * 9,
    y_index=10,
    y_map=["2", "4"],
    missing_vals=["?"],
    delimiter=",")

# Test dataset: same options, read from the "test:breast_cancer_disc_test" tag.
test = dataset.Data(
    data_tag=["test:breast_cancer_disc_test"],
    data_type="chunk",
    id_index=0,
    X_indices=xrange(1, 10),
    X_meta=["d"] * 9,
    y_index=10,
    y_map=["2", "4"],
    missing_vals=["?"],
    delimiter=",")
Exemple #15
0
def file_url(input_dict):
    """Build a discomll ``dataset.Data`` descriptor from string options.

    Keys read from ``input_dict``:

    * ``"url"`` -- newline-separated URLs.  Surrounding whitespace is
      stripped and blank lines (including whitespace-only ones) are ignored.
      No URL may use HTTPS.
    * ``"range"`` -- when equal to ``"true"``, exactly two URLs (first and
      last) are required and the list in between is generated here.
    * ``"X_indices"`` -- either ``"a-b"``, expanded with ``range(a, b)`` (so
      ``b`` itself is excluded), or a comma-separated list of integers.
    * ``"data_type"`` -- ``"true"`` is rewritten to ``"gzip"``, anything else
      to ``""``.

    Range expansion: the file name of the first URL is split at its first
    "a"; everything before it is the fixed prefix, and the part between that
    position and the single "." is the counter.  URLs are generated for every
    lowercase-letter combination of the counter's length, starting at
    "a...a" and stopping once the last URL's counter has been emitted.

    ``input_dict`` is modified in place: ``"X_indices"`` is removed and
    ``"data_type"`` is rewritten.  Every remaining entry (including ``"url"``
    and ``"range"``) is forwarded to ``dataset.Data`` as a keyword argument.

    Returns:
        ``{"dataset": data}`` where ``data`` is the created descriptor.

    Raises:
        Exception: if a URL uses HTTPS, if range mode is not given exactly
            two URLs, if the two file names do not share one pattern, or if
            a bound of an ``"a-b"`` index specification is not a run of
            digits.
        ValueError: if, in range mode, the first file name contains no "a".
    """
    import itertools

    # Strip *before* filtering: a form submitted with CRLF line endings
    # leaves "\r" on blank lines, which would otherwise count as URLs and
    # break the "exactly two URLs" check below.
    stripped = (line.strip() for line in input_dict["url"].split("\n"))
    urls = [line for line in stripped if line]
    for url in urls:
        if url.split("/")[0].lower().startswith("https"):
            raise Exception(
                "URLs should be accessible via HTTP and not HTTPS.")

    if input_dict["range"] == "true":
        if len(urls) != 2:
            raise Exception(
                "A first and last URL should be specified if the Range parameter is checked."
            )

        url_start = urls[0].split("/")
        url_end = urls[1].split("/")

        # Fixed part of every generated URL: directory plus the file-name
        # prefix that precedes the first "a".
        start_index = url_start[-1].index("a")
        url_base = "/".join(url_start[:-1]) + "/" + url_start[-1][:start_index]

        start = url_start[-1][start_index:]
        finish = url_end[-1][start_index:]
        if start.count(".") == 1 and finish.count(".") == 1:
            start, file_extension = start.split(".")
            finish, _ = finish.split(".")
            file_extension = "." + file_extension
        else:
            raise Exception("URLs does not have the same pattern.")

        alphabet = "abcdefghijklmnopqrstuvwxyz"
        # The loop below only stops when it produces `finish`.  If that can
        # never happen, fail now instead of emitting all 26**len(start) URLs.
        if len(finish) != len(start) or any(c not in alphabet for c in finish):
            raise Exception("URLs does not have the same pattern.")

        # NOTE(review): generation always begins at "a...a"; only the length
        # of `start` is used, not its value.
        urls = []
        for letters in itertools.product(alphabet, repeat=len(start)):
            counter = "".join(letters)
            urls.append(url_base + counter + file_extension)
            if counter == finish:
                break

    indices_spec = input_dict["X_indices"].replace(" ", "")
    bounds = indices_spec.split("-")
    if len(bounds) == 2:
        first, last = bounds
        if not (first.isdigit() and last.isdigit()):
            raise Exception(
                "Feature indices should be integers. Example: 1-10")
        X_indices = range(int(first), int(last))
    else:
        X_indices = [int(v) for v in indices_spec.split(",") if v != ""]
    del input_dict["X_indices"]

    input_dict["data_type"] = (
        "gzip" if input_dict["data_type"] == "true" else "")

    # Imported only once the input has been validated, so malformed input is
    # reported before the library is loaded.
    from discomll import dataset

    data = dataset.Data(data_tag=urls, X_indices=X_indices, **input_dict)

    return {"dataset": data}
from discomll import dataset
from discomll.clustering import kmeans

train = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/linear/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/linear/train/xaaabj.gz"
],
                     data_type="gzip",
                     generate_urls=True,
                     X_indices=range(1, 22),
                     id_index=0,
                     delimiter=",")

test = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/linear/test/xaaaaa.gz",
    "http://ropot.ijs.si/data/linear/test/xaaabj.gz"
],
                    data_type="gzip",
                    generate_urls=True,
                    X_indices=range(1, 22),
                    id_index=0,
                    delimiter=",")

fit_model = kmeans.fit(train, n_clusters=5, max_iterations=10, random_state=0)
predictions = kmeans.predict(test, fit_model)
print predictions
Exemple #17
0
from disco.core import result_iterator

from discomll import dataset
from discomll.ensemble import forest_distributed_decision_trees
from discomll.utils import model_view

train = dataset.Data(data_tag=[["http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"]],
                     X_indices=xrange(0, 4),
                     X_meta="http://ropot.ijs.si/data/datasets_meta/iris_meta.csv",
                     y_index=4,
                     delimiter=",")

fit_model = forest_distributed_decision_trees.fit(train, trees_per_chunk=1, bootstrap=False, max_tree_nodes=50,
                                                  min_samples_leaf=2, min_samples_split=1, class_majority=1,
                                                  separate_max=True, measure="info_gain", accuracy=1, random_state=None,
                                                  save_results=True)

print model_view.output_model(fit_model)

# predict training dataset
predictions = forest_distributed_decision_trees.predict(train, fit_model)

# output results
for k, v in result_iterator(predictions):
    print k, v[0]
Exemple #18
0
from discomll import dataset
from discomll.classification import logistic_regression
from discomll.utils import model_view

# define training dataset
train = dataset.Data(data_tag=["test:ex4"],
                     data_type="chunk",
                     X_indices=xrange(0, 2),
                     y_index=2,
                     y_map=["0.0000000e+00", "1.0000000e+00"])

# fit model on training dataset
fit_model = logistic_regression.fit(train)

# output model
model = model_view.output_model(fit_model)
print model
Exemple #19
0
from disco.core import result_iterator

from discomll import dataset
from discomll.clustering import kmeans
from discomll.utils import model_view

# define training dataset
train = dataset.Data(
    data_tag=["test:breast_cancer_cont"],
    data_type="chunk",  # define data source - chunk data on ddfs
    X_indices=xrange(0, 9),  # define attribute indices
    y_index=9,  # define class index
    delimiter=",")

# define test dataset
test = dataset.Data(
    data_tag=["test:breast_cancer_cont_test"],
    data_type="chunk",  # define data source - chunk data on ddfs
    X_indices=xrange(0, 9),  # define attribute indices
    y_index=9,  # define class index
    delimiter=",")

# fit model on training dataset
fit_model = kmeans.fit(train, n_clusters=2, max_iterations=5, random_state=0)

# output model
model = model_view.output_model(fit_model)
print model

# predict test dataset
predictions = kmeans.predict(test, fit_model)
from disco.core import result_iterator

from discomll import dataset
from discomll.regression import locally_weighted_linear_regression

training_data = dataset.Data(
    data_tag=["test:regression_data1", "test:regression_data2"],
    data_type="chunk",
    id_index=0,
    X_indices=[0],
    y_index=1)

fitting_data = dataset.Data(
    data_tag=["test:regression_data_test1", "test:regression_data_test2"],
    data_type="chunk",
    id_index=0,
    X_indices=[0],
    y_index=1)

# fit fitting data to training data
results = locally_weighted_linear_regression.fit_predict(training_data,
                                                         fitting_data,
                                                         tau=10)

# output results
for k, v in result_iterator(results):
    print k, v
Exemple #21
0
from discomll import dataset
from discomll.regression import locally_weighted_linear_regression

train = dataset.Data(data_tag=[
    "http://ropot.ijs.si/data/fraction/train/xaaaaa.gz",
    "http://ropot.ijs.si/data/fraction/train/xaaabj.gz"
],
                     data_type="gzip",
                     generate_urls=True,
                     X_indices=range(1, 14),
                     id_index=0,
                     y_index=14,
                     delimiter=",")

test = dataset.Data(
    data_tag=[["http://ropot.ijs.si/data/fraction/test/xaaaaa.gz"]],
    data_type="gzip",
    X_indices=range(1, 14),
    id_index=0,
    y_index=14,
    delimiter=",")

predictions = locally_weighted_linear_regression.fit_predict(train,
                                                             test,
                                                             tau=1,
                                                             samples_per_job=0,
                                                             save_results=True)
print predictions