Esempio n. 1
0
def test_lift3():
    """Lift a script that fits both a grid-searched SVC and a plain SVC.

    The script is adapted from
    https://www.kaggle.com/vsmolyakov/svm-classifier, modified to drop
    deprecated code that targeted an older, incompatible sklearn version
    and to use random arrays in place of the original data.

    Two pipelines are expected, each with a single ``"clf"`` step: one
    holding ``SVC(C=10, kernel="rbf")`` and one holding the
    ``GridSearchCV`` over the rbf kernel. They are compared by hash, as a
    set, so the order in which they are lifted does not matter.
    """
    src = """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import csv as csv

    from sklearn.svm import SVC
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.preprocessing import StandardScaler


    # subbed out data here...
    X_train = np.random.random((100, 100))
    y_train = np.random.random(100) > 0.5


    svm_parameters = [{'kernel': ['rbf'], 'C': [1,10,100,1000]}]
    clf = GridSearchCV(SVC(), svm_parameters, cv=3, verbose=2)
    clf.fit(X_train, y_train)
    clf.best_params_

    C_opt = 10
    clf = SVC(C=C_opt, kernel='rbf')
    clf.fit(X_train, y_train)
    clf.n_support_

    X_test_data = np.random.random((100, 100))
    y_pred = clf.predict(X_test_data)
    """
    lifted = PipelineLifter(src)
    assert not lifted.failed
    assert len(lifted.pipelines) == 2

    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    # Expected pipeline for the stand-alone SVC fit in the script.
    plain_svc = SVC(C=10, kernel="rbf")
    svc_pipeline = Pipeline([("clf", plain_svc)])

    # Expected pipeline for the grid search fit in the script.
    param_grid = [{'kernel': ['rbf'], 'C': [1, 10, 100, 1000]}]
    grid_search = GridSearchCV(SVC(), param_grid, cv=3, verbose=2)
    grid_pipeline = Pipeline([("clf", grid_search)])

    expected_hashes = {pt.md5(svc_pipeline), pt.md5(grid_pipeline)}
    lifted_hashes = {pt.md5(pipeline) for pipeline in lifted.pipelines}
    assert lifted_hashes == expected_hashes
Esempio n. 2
0
def test_lift2():
    """Lift a scaler + logistic regression applied step by step.

    The script never builds a ``Pipeline`` object itself: it scales ``X``
    with a ``StandardScaler`` and then fits and predicts with a
    ``LogisticRegression`` on the scaled data. The lifter is expected to
    produce exactly one pipeline whose hash matches an explicit
    ``Pipeline`` with steps ``'scaler'`` and ``'logit'``.
    """
    src = """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    logit = LogisticRegression(
        C=1,
        solver='lbfgs',
        max_iter=500,
        random_state=17,
        n_jobs=1,
        multi_class='multinomial')

    import numpy as np
    X = np.random.random((10, 10))
    y = np.random.random(10)
    y = y > 0.5

    scaler = StandardScaler()

    X_scaled = scaler.fit_transform(X)
    logit.fit(X_scaled, y)
    logit.predict(X_scaled)
    """
    lifted = PipelineLifter(src)
    assert not lifted.failed
    assert len(lifted.pipelines) == 1

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    # Same hyper-parameters as the estimator created inside ``src``.
    logit_kwargs = {
        "C": 1,
        "solver": 'lbfgs',
        "max_iter": 500,
        "random_state": 17,
        "n_jobs": 1,
        "multi_class": 'multinomial',
    }
    expected_steps = [
        ('scaler', StandardScaler()),
        ('logit', LogisticRegression(**logit_kwargs)),
    ]
    expected_pipeline = Pipeline(expected_steps)

    (lifted_pipeline,) = lifted.pipelines
    assert pt.md5(expected_pipeline) == pt.md5(lifted_pipeline)
Esempio n. 3
0
def get_repair_hashes(repairer, seed, num_passes=2, num_pipelines=3):
    """Collect hashes of repaired pipelines over several seeded passes.

    Every pass re-seeds through ``utils.set_seed(seed)`` and then walks
    ``data.pipelines`` in order, asking ``repairer`` for one repair per
    pipeline. A repair is recorded only if it is not ``None`` and its hash
    differs from the hash of the pipeline it was derived from. A pass
    stops once ``num_pipelines`` repairs have been recorded or the
    pipelines are exhausted.

    Args:
        repairer: Object exposing
            ``repair(pipeline, X, y, bound_num_repairs=...)``.
        seed: Value passed to ``utils.set_seed`` at the start of each pass.
        num_passes: Number of passes to run.
        num_pipelines: Maximum number of repaired hashes recorded per pass.

    Returns:
        A list with one entry per pass; each entry is the list of
        ``pt.md5`` hashes of the recorded repairs, in the order produced.
    """
    # Build one independent list per pass. ``[[]] * num_passes`` would
    # repeat a reference to a single list, so every pass would append to
    # the same object and all returned entries would be identical.
    passes = [[] for _ in range(num_passes)]
    for i, pass_hashes in enumerate(passes):
        print("Pass: {}".format(i))
        utils.set_seed(seed)
        num_remaining = num_pipelines
        pbar = tqdm.tqdm(total=num_pipelines)
        try:
            for p in data.pipelines:
                if num_remaining <= 0:
                    break
                repaired = repairer.repair(
                    p, data.X, data.y, bound_num_repairs=1)
                if repaired is None:
                    # Nothing to record, so skip hashing the original too.
                    continue
                repaired_md5 = pt.md5(repaired)
                if pt.md5(p) == repaired_md5:
                    # The "repair" is identical to the input pipeline.
                    continue
                pass_hashes.append(repaired_md5)
                num_remaining -= 1
                pbar.update(1)
        finally:
            # Release the progress bar even if a repair raises.
            pbar.close()
    return passes
Esempio n. 4
0
def test_lift1():
    """Lift a script that builds and uses an explicit sklearn ``Pipeline``.

    The script constructs ``Pipeline([('scaler', ...), ('logit', ...)])``,
    fits it and predicts with it. The lifter is expected to produce exactly
    one pipeline whose hash matches the same pipeline built directly here.
    """
    src = """
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    logit = LogisticRegression(
        C=1,
        solver='lbfgs',
        max_iter=500,
        random_state=17,
        n_jobs=1,
        multi_class='multinomial')
    p = Pipeline([('scaler', StandardScaler()), ('logit', logit)])

    import numpy as np
    X = np.random.random((10, 10))
    y = np.random.random(10)
    y = y > 0.5

    p.fit(X, y)
    p.predict(X)
    """
    lifted = PipelineLifter(src)
    assert not lifted.failed
    assert len(lifted.pipelines) == 1

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    # Same hyper-parameters as the estimator created inside ``src``.
    logit_kwargs = {
        "C": 1,
        "solver": 'lbfgs',
        "max_iter": 500,
        "random_state": 17,
        "n_jobs": 1,
        "multi_class": 'multinomial',
    }
    expected_steps = [
        ('scaler', StandardScaler()),
        ('logit', LogisticRegression(**logit_kwargs)),
    ]
    expected_pipeline = Pipeline(expected_steps)

    (lifted_pipeline,) = lifted.pipelines
    assert pt.md5(expected_pipeline) == pt.md5(lifted_pipeline)
    def extract_pipelines(self):
        """Build the distinct pipelines found in ``self.graph``.

        Seeds are located with ``find_pipeline_seeds``, turned into graph
        slices with ``get_graph_slices`` and filtered through
        ``remove_subgraphs``. Each remaining slice is run through
        ``execute_graph_line_by_line``, its components are extracted and
        handed to ``build_pipeline``. Slices for which ``build_pipeline``
        returns ``None`` are skipped, and pipelines whose ``pt.md5`` hash
        was already seen are dropped.

        Returns:
            List of unique, non-``None`` pipelines in slice order.

        Raises:
            AssertionError: If ``self.graph`` is ``None``.
        """
        assert self.graph is not None
        seed_node_ids = find_pipeline_seeds(self.graph)
        slices = get_graph_slices(self.graph, seed_node_ids)
        slices = remove_subgraphs(slices)

        # Shortest-path lengths are taken on a reversed copy of the graph
        # and materialized into a dict so they can be reused per slice.
        path_lengths = dict(
            nx.all_pairs_shortest_path_length(self.graph.reverse(copy=True)))

        pipelines = []
        seen_hashes = set()
        for _slice in slices:
            annotated_slice = execute_graph_line_by_line(_slice)
            components = extract_pipeline_components(annotated_slice,
                                                     path_lengths)

            pipeline = build_pipeline(components)
            if pipeline is None:
                # Check before hashing: there is nothing to hash or keep
                # when no pipeline could be built from this slice.
                continue

            h = pt.md5(pipeline)
            if h in seen_hashes:
                continue
            seen_hashes.add(h)
            pipelines.append(pipeline)
        return pipelines