Exemple #1
0
    def fit(x_train):
        """Prepare a batched brute-force K-NN query operator over ``x_train``.

        Args:
            x_train: training points; converted with ``tensor`` and unpacked
                as an ``(Ntrain, D)`` array.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` runs the query and
            ``elapsed`` is the time measured around the squared-norm
            pre-computation.
        """
        # Setup the K-NN estimator:
        x_train = tensor(x_train)
        Ntrain, D = x_train.shape
        start = timer()
        # The "training" time here should be negligible:
        x_train_norm = (x_train ** 2).sum(-1)
        elapsed = timer() - start

        def f(x_test):
            """Return ``(indices, elapsed)`` for the K-NN query on ``x_test``.

            The test set is processed in slices so that each call to
            ``KNN_torch_fun`` stays within a fixed memory budget.
            """
            x_test = tensor(x_test)

            # Estimate the largest reasonable batch size:
            Ntest = x_test.shape[0]
            av_mem = int(5e8)  # 500 Mb of GPU memory per batch
            # Remember that a vector of D float32 number takes up 4*D bytes.
            # The outer max() keeps the batch size >= 1 even when x_test is
            # empty; the previous min(max(1, ...), Ntest) produced 0 there and
            # the batch-count division raised ZeroDivisionError.
            Ntest_loop = max(1, min(av_mem // (4 * D * Ntrain), Ntest))
            out = int_tensor(Ntest, K)

            start = timer()
            # Actual K-NN query, one slice of test rows at a time:
            for lo in range(0, Ntest, Ntest_loop):
                hi = lo + Ntest_loop
                out[lo:hi, :] = KNN_torch_fun(
                    x_train, x_train_norm, x_test[lo:hi, :], K, metric
                )

            elapsed = timer() - start
            indices = out.cpu().numpy()
            return indices, elapsed

        return f, elapsed
def h20_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit H2O AutoML, then write the fit time and a 0/1 submission file.

    Args:
        X_train: training features (pandas DataFrame); not modified.
        y_train: training target, stored under the ``"target"`` column.
        X_test: test features passed to ``aml.predict``.
        id_test: identifiers written to the ``"id"`` column.
        name_dataset: dataset tag used to build both output file names.

    Side effects:
        Writes ``time_files/<name_dataset>_h2o`` and
        ``submit_files/<name_dataset>_h2o_submission.csv``.
    """
    # Work on a copy so the caller's frame never ends up with a stray
    # "target" column, even when training raises half-way through.
    train_df = X_train.copy()
    train_df['target'] = y_train
    start_time = timer(None)
    train = h2o.H2OFrame.from_python(train_df)
    test = h2o.H2OFrame.from_python(X_test)

    # Identify predictors and response
    x = train.columns
    y = "target"

    # (limited to 1 hour max runtime by default)
    aml = H2OAutoML()
    aml.train(x=x, y=y, training_frame=train)
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = aml.predict(test).as_data_frame()
    # Threshold the first prediction column at 0.5. Iterating preds.values
    # compared whole rows, which only works for single-column frames.
    preds_final = [1 if p > 0.5 else 0 for p in preds.iloc[:, 0]]

    # The context manager closes the file even if write() raises.
    with open("time_files/" + name_dataset + '_' + 'h2o', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds_final})

    submission.to_csv("submit_files/" + name_dataset + '_' + 'h2o' +
                      '_submission.csv',
                      index=False)
Exemple #3
0
    def fit(x_train):
        """Build a FAISS HNSW index over ``x_train`` and return a query operator.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test, efSearch=10)`` searches the
            index and ``elapsed`` is the time measured around ``index.add``.

        Raises:
            NotImplementedError: if ``metric`` is neither ``"euclidean"`` nor
                ``"angular"``.
        """
        from benchmark_utils import timer

        dim = x_train.shape[1]

        # Guard clause: only these two metrics are handled here.
        if metric not in ["euclidean", "angular"]:
            raise NotImplementedError(f"The '{metric}' distance is not supported.")

        index = faiss.IndexHNSWFlat(dim, M)
        index.hnsw.efConstruction = 500

        # Pre-processing: time the insertion of the training points.
        t_build = timer(use_torch=False)
        index.add(x_train)
        build_time = timer(use_torch=False) - t_build

        def f(x_test, efSearch=10):
            """Search the index for the K neighbours of each row of ``x_test``."""
            faiss.ParameterSpace().set_index_parameter(index, "efSearch", efSearch)
            t_query = timer(use_torch=False)
            _, neighbor_ids = index.search(x_test, K)
            query_time = timer(use_torch=False) - t_query
            return neighbor_ids, query_time

        return f, build_time
Exemple #4
0
def h2o_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name,
                 target_name):
    """Fit H2O AutoML, then write the fit time and a submission file.

    Args:
        X_train: training features (pandas DataFrame); copied, not modified.
        y_train: training target, stored under ``target_name``.
        X_test: test features passed to ``aml.predict``.
        id_test: identifiers written to the ``id_name`` column.
        name_dataset: dataset tag used to build both output file names.
        id_name: name of the identifier column in the submission.
        target_name: name of the target column (training and submission).

    Side effects:
        Prints progress messages, writes ``<name_dataset>_h2o`` and
        ``submission_<name_dataset>_h2o.csv``.
    """
    X_train_cp = X_train.copy()
    X_train_cp[target_name] = y_train
    start_time = timer(None)
    # Call head(): printing the bound method dumped the method repr instead
    # of the first rows.
    print(X_train_cp.head())

    train = h2o.H2OFrame.from_python(X_train_cp)
    test = h2o.H2OFrame.from_python(X_test)

    # Identify predictors and response
    x = train.columns
    y = target_name

    # (limited to 1 hour max runtime by default)
    aml = H2OAutoML()
    aml.train(x=x, y=y, training_frame=train)
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    print("FIT maked")
    # Take the first *column* (one prediction per test row). The previous
    # `.values[0]` kept only the first *row*, so the submission frame could
    # not be built for more than one test sample.
    preds = aml.predict(test).as_data_frame().values[:, 0]
    print("Predict maked")
    print(preds)
    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'h2o', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})

    submission.to_csv('submission_' + name_dataset + '_' + 'h2o.csv',
                      index=False)
Exemple #5
0
 def f(x_test, efSearch=10):
     """Search ``index`` for the K neighbours of ``x_test``; return ``(indices, elapsed)``."""
     # Apply the requested search-time parameter before starting the clock.
     params = faiss.ParameterSpace()
     params.set_index_parameter(index, "efSearch", efSearch)

     t0 = timer(use_torch=False)
     _, neighbor_ids = index.search(x_test, K)
     query_time = timer(use_torch=False) - t0
     return neighbor_ids, query_time
Exemple #6
0
    def fit(x_train):
        """Prepare a batched brute-force K-NN query operator (JAX backend).

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` runs the query and
            ``elapsed`` is the time measured around the ``jax_tensor``
            conversion of ``x_train``.
        """
        # Setup the K-NN estimator:
        start = timer(use_torch=False)
        x_train = jax_tensor(x_train)
        elapsed = timer(use_torch=False) - start

        def f(x_test):
            """Return ``(indices, elapsed)`` for the K-NN query on ``x_test``.

            ``indices`` is a ``(Ntest, K)`` integer NumPy array filled one
            slice of test rows at a time.
            """
            x_test = jax_tensor(x_test)

            # Estimate the largest reasonable batch size
            av_mem = int(5e8)  # 500 Mb
            Ntrain, D = x_train.shape
            Ntest = x_test.shape[0]
            # The outer max() keeps the batch size >= 1 even when x_test is
            # empty; the previous min(max(1, ...), Ntest) produced 0 there and
            # the batch-count division raised ZeroDivisionError.
            Ntest_loop = max(1, min(av_mem // (4 * D * Ntrain), Ntest))
            indices = np.zeros((Ntest, K), dtype=int)

            start = timer(use_torch=False)
            # Actual K-NN query:
            for lo in range(0, Ntest, Ntest_loop):
                hi = lo + Ntest_loop
                indices[lo:hi, :] = knn_jax_fun(
                    x_train, x_test[lo:hi, :], K, metric
                )
            elapsed = timer(use_torch=False) - start
            return indices, elapsed

        return f, elapsed
Exemple #7
0
        def f(x_test):
            """Time ``KNN.kneighbors`` on ``x_test``; return ``(indices, elapsed)``.

            The result is moved to the CPU and converted to NumPy only after
            the clock has been stopped.
            """
            queries = tensor(x_test)

            t0 = timer()
            neighbors = KNN.kneighbors(queries)
            query_time = timer() - t0

            return neighbors.cpu().numpy(), query_time
Exemple #8
0
        def f(x_test):
            """Time ``knn_jax_fun`` on ``x_test``; return ``(indices, elapsed)``.

            The conversion of the result to a NumPy array is included in the
            measured time.
            """
            queries = jax_tensor(x_test)

            # Actual K-NN query:
            t0 = timer(use_torch=False)
            neighbors = np.array(knn_jax_fun(x_train, queries, K, metric))
            query_time = timer(use_torch=False) - t0
            return neighbors, query_time
Exemple #9
0
        def f(x_test):
            """Time ``KNN_torch_fun`` on ``x_test``; return ``(indices, elapsed)``.

            The result is moved to the CPU and converted to NumPy only after
            the clock has been stopped.
            """
            queries = tensor(x_test)

            # Actual K-NN query:
            t0 = timer()
            neighbors = KNN_torch_fun(x_train, x_train_norm, queries, K, metric)
            query_time = timer() - t0

            return neighbors.cpu().numpy(), query_time
Exemple #10
0
        def f(x_test):
            """Time ``KNN_fun(x_test, x_train)``; return ``(indices, elapsed)``.

            The result is moved to the CPU and converted to NumPy only after
            the clock has been stopped.
            """
            queries = tensor(x_test)

            # Actual K-NN query:
            t0 = timer()
            neighbors = KNN_fun(queries, x_train)
            query_time = timer() - t0

            return neighbors.cpu().numpy(), query_time
Exemple #11
0
    def fit(x_train):
        """Fit ``KNN_meth`` on ``x_train`` and return a timed query operator.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` calls the fitted
            estimator's ``kneighbors`` and ``elapsed`` is the fitting time.
        """
        # Setup the K-NN estimator:
        t_fit = timer()
        fitted = KNN_meth.fit(x_train)
        KNN_fun = fitted.kneighbors
        fit_time = timer() - t_fit

        def f(x_test):
            """Return ``(indices, elapsed)`` for the neighbours of ``x_test``."""
            t_query = timer()
            _, neighbor_ids = KNN_fun(x_test)
            query_time = timer() - t_query

            return neighbor_ids, query_time

        return f, fit_time
Exemple #12
0
def hyperopt_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a ``HyperoptEstimator`` and write the fit time and a submission file.

    Args:
        X_train, y_train: training data; their ``.values`` are passed to ``fit``.
        X_test: test features; its ``.values`` are passed to ``predict``.
        id_test: identifiers written to the ``"id"`` column.
        name_dataset: dataset tag used to build both output file names.

    Side effects:
        Writes ``<name_dataset>_hyperopt`` and
        ``<name_dataset>_hyperopt_submission.csv``.
    """
    hp = HyperoptEstimator()
    start_time = timer(None)
    hp.fit(X_train.values, y_train.values)
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = hp.predict(X_test.values)

    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'hyperopt', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv(name_dataset + '_' + 'hyperopt' + '_submission.csv',
                      index=False)
Exemple #13
0
    def fit(x_train):
        """Fit ``KNN`` on ``x_train`` and return a timed query operator.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` calls ``KNN.kneighbors``
            and ``elapsed`` is the time measured around ``KNN.fit``.
        """
        train_pts = tensor(x_train)

        t_fit = timer()
        KNN.fit(train_pts, clusters=clusters, a=a)
        fit_time = timer() - t_fit

        def f(x_test):
            """Return ``(indices, elapsed)``; indices are converted to NumPy
            after the clock has been stopped."""
            queries = tensor(x_test)

            t_query = timer()
            neighbors = KNN.kneighbors(queries)
            query_time = timer() - t_query

            return neighbors.cpu().numpy(), query_time

        return f, fit_time
Exemple #14
0
def hyperopt_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name,
                      target_name):
    """Fit a regression ``HyperoptEstimator`` and write time and submission files.

    Args:
        X_train, y_train: training data (pandas objects).
        X_test: test features (pandas DataFrame).
        id_test: identifiers written to the ``id_name`` column.
        name_dataset: dataset tag used to build both output file names.
        id_name: name of the identifier column in the submission.
        target_name: name of the prediction column in the submission.

    Side effects:
        Writes ``<name_dataset>_hyperopt`` and
        ``submission_<name_dataset>_hyperopt.csv``.
    """
    hp = HyperoptEstimator(regressor=hpsklearn.components.any_regressor('reg'))
    start_time = timer(None)
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0;
    # both return the underlying NumPy array.
    hp.fit(X_train.values, y_train.values)
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = hp.predict(X_test.values)

    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'hyperopt', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})

    submission.to_csv('submission_' + name_dataset + '_' + 'hyperopt.csv',
                      index=False)
Exemple #15
0
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a ``TPOTClassifier`` and write the fit time and a submission file.

    Args:
        X_train, y_train: training data passed to ``fit``.
        X_test: test features passed to ``predict``.
        id_test: identifiers written to the ``"id"`` column.
        name_dataset: dataset tag used to build both output file names.

    Side effects:
        Exports the pipeline to ``tpot_pipeline_dont_overfit.py`` and writes
        ``<name_dataset>_tpot`` and ``<name_dataset>_tpot_submission.csv``.
    """
    tp = TPOTClassifier(verbosity=3)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    # The export happens before the clock is read, so it is part of the
    # recorded time.
    tp.export('tpot_pipeline_dont_overfit.py')
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = tp.predict(X_test)

    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv',
                      index=False)
Exemple #16
0
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name,
                  target_name):
    """Fit a ``TPOTRegressor`` and write the fit time and a submission file.

    Args:
        X_train, y_train: training data passed to ``fit``.
        X_test: test features passed to ``predict``.
        id_test: identifiers written to the ``id_name`` column.
        name_dataset: dataset tag used to build both output file names.
        id_name: name of the identifier column in the submission.
        target_name: name of the prediction column in the submission.

    Side effects:
        Exports the pipeline to ``tpot_pipeline_dont_overfit.py`` and writes
        ``<name_dataset>_tpot`` and ``submission_<name_dataset>_tpot.csv``.
    """
    tp = TPOTRegressor(verbosity=2)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    # The export happens before the clock is read, so it is part of the
    # recorded time.
    tp.export('tpot_pipeline_dont_overfit.py')
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = tp.predict(X_test)

    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})

    submission.to_csv('submission_' + name_dataset + '_' + 'tpot.csv',
                      index=False)
Exemple #17
0
def autosk_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name,
                    target_name):
    """Fit an ``AutoSklearnRegressor`` and write time and submission files.

    Args:
        X_train, y_train: training data; copies are passed to ``fit``/``refit``.
        X_test: test features; a copy is passed to ``predict``.
        id_test: identifiers written to the ``id_name`` column.
        name_dataset: dataset tag used to build both output file names.
        id_name: name of the identifier column in the submission.
        target_name: name of the prediction column in the submission.

    Side effects:
        Writes ``<name_dataset>_autosk`` and
        ``submission_<name_dataset>_autosk.csv``.
    """
    ak = autosklearn.regression.AutoSklearnRegressor(ml_memory_limit=51000)
    start_time = timer(None)
    # Fresh copies are handed to every call, so the caller's objects are
    # never passed to the estimator directly.
    ak.fit(X_train.copy(), y_train.copy())
    ak.refit(X_train.copy(), y_train.copy())
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = ak.predict(X_test.copy())

    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'autosk', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})

    submission.to_csv('submission_' + name_dataset + '_' + 'autosk.csv',
                      index=False)
Exemple #18
0
def autosk_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit an ``AutoSklearnClassifier`` and write time and submission files.

    Args:
        X_train, y_train: training data; copies are passed to ``fit``/``refit``.
        X_test: test features; a copy is passed to ``predict``.
        id_test: identifiers written to the ``"id"`` column.
        name_dataset: dataset tag used to build both output file names.

    Side effects:
        Writes ``<name_dataset>_autosk`` and
        ``<name_dataset>_autosk_submission.csv``.
    """
    ak = autosklearn.classification.AutoSklearnClassifier(
        ml_memory_limit=51000)
    start_time = timer(None)
    # Fresh copies are handed to every call, so the caller's objects are
    # never passed to the estimator directly.
    ak.fit(X_train.copy(), y_train.copy())
    ak.refit(X_train.copy(), y_train.copy())
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = ak.predict(X_test.copy())

    # The context manager closes the file even if write() raises.
    with open(name_dataset + '_' + 'autosk', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv(name_dataset + '_' + 'autosk' + '_submission.csv',
                      index=False)
Exemple #19
0
    def fit(x_train):
        """Convert ``x_train`` with ``jax_tensor`` and return a timed query operator.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` runs ``knn_jax_fun`` and
            ``elapsed`` is the time measured around the conversion.
        """
        # Setup the K-NN estimator:
        t_setup = timer(use_torch=False)
        x_train = jax_tensor(x_train)
        setup_time = timer(use_torch=False) - t_setup

        def f(x_test):
            """Return ``(indices, elapsed)``; the NumPy conversion of the
            result is included in the measured time."""
            queries = jax_tensor(x_test)

            # Actual K-NN query:
            t_query = timer(use_torch=False)
            neighbors = np.array(knn_jax_fun(x_train, queries, K, metric))
            query_time = timer(use_torch=False) - t_query
            return neighbors, query_time

        return f, setup_time
Exemple #20
0
    def fit(x_train):
        """Build a FAISS GPU index over ``x_train`` and return a query operator.

        The index type is selected by the enclosing ``algorithm`` value
        (``"flat"`` or ``"ivfflat"``).

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` searches the index and
            ``elapsed`` is the time measured around ``index.add`` (the
            ``ivfflat`` training step runs before the clock is started).

        Raises:
            NotImplementedError: if ``metric`` or ``algorithm`` is not one of
                the supported values.
        """
        D = x_train.shape[1]

        co = faiss.GpuClonerOptions()
        co.useFloat16 = use_float16

        if metric not in ["euclidean", "angular"]:
            raise NotImplementedError(f"The '{metric}' distance is not supported.")

        if algorithm == "flat":
            index = faiss.IndexFlatL2(D)  # May be used as quantizer
            index = faiss.index_cpu_to_gpu(res, deviceId, index, co)

        elif algorithm == "ivfflat":
            quantizer = faiss.IndexFlatL2(D)  # the other index
            faiss_metric = (
                faiss.METRIC_L2
                if metric == "euclidean"
                else faiss.METRIC_INNER_PRODUCT
            )
            index = faiss.IndexIVFFlat(quantizer, D, nlist, faiss_metric)
            index = faiss.index_cpu_to_gpu(res, deviceId, index, co)

            assert not index.is_trained
            index.train(x_train)  # train the index; vectors are added below
            assert index.is_trained

        else:
            # Previously an unknown algorithm fell through and failed later
            # with an unbound `index`; fail early with a clear message.
            raise NotImplementedError(
                f"The '{algorithm}' algorithm is not supported."
            )

        # Pre-processing:
        start = timer(use_torch=False)
        index.add(x_train)
        index.nprobe = nprobe
        elapsed = timer(use_torch=False) - start

        # Return an operator for actual KNN queries:
        def f(x_test):
            """Return ``(indices, elapsed)`` for the K neighbours of ``x_test``."""
            start = timer(use_torch=False)
            distances, indices = index.search(x_test, K)
            elapsed = timer(use_torch=False) - start
            return indices, elapsed

        return f, elapsed
Exemple #21
0
    def fit(x_train):
        """Build a symbolic K-NN reduction for ``metric`` and return a query operator.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` applies the reduction to
            ``(x_test, x_train)`` and ``elapsed`` is the time measured around
            the construction of the symbolic formula.

        Raises:
            NotImplementedError: if ``metric`` is not one of ``"euclidean"``,
                ``"manhattan"``, ``"angular"`` or ``"hyperbolic"``.
        """
        # Setup the K-NN estimator:
        x_train = tensor(x_train)
        t_setup = timer()

        # Symbolic "i" and "j" variables, created without any data array:
        n_features = x_train.shape[1]
        q_i = Vi(0, n_features)
        p_j = Vj(1, n_features)

        # Symbolic distance matrix:
        if metric == "euclidean":
            dist_ij = ((q_i - p_j) ** 2).sum(-1)
        elif metric == "manhattan":
            dist_ij = (q_i - p_j).abs().sum(-1)
        elif metric == "angular":
            dist_ij = -(q_i | p_j)
        elif metric == "hyperbolic":
            dist_ij = ((q_i - p_j) ** 2).sum(-1) / (q_i[0] * p_j[0])
        else:
            raise NotImplementedError(
                f"The '{metric}' distance is not supported.")

        # K-NN query operator:
        KNN_fun = dist_ij.argKmin(K, dim=1)

        # N.B.: The "training" time here should be negligible.
        setup_time = timer() - t_setup

        def f(x_test):
            """Return ``(indices, elapsed)``; indices are converted to NumPy
            after the clock has been stopped."""
            queries = tensor(x_test)

            # Actual K-NN query:
            t_query = timer()
            neighbors = KNN_fun(queries, x_train)
            query_time = timer() - t_query

            return neighbors.cpu().numpy(), query_time

        return f, setup_time
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a ``TPOTClassifier`` and write the fit time and a submission file.

    Args:
        X_train, y_train: training data passed to ``fit``.
        X_test: test features passed to ``predict``.
        id_test: identifiers written to the ``"id"`` column.
        name_dataset: dataset tag used to build all output file names.

    Side effects:
        Exports the pipeline to ``tpot_pipeline_<name_dataset>.py`` and writes
        ``time_files/<name_dataset>_tpot`` and
        ``submit_files/<name_dataset>_tpot_submission.csv``.
    """
    tp = TPOTClassifier(generations=5,
                        population_size=20,
                        random_state=42,
                        verbosity=2)
    start_time = timer(None)
    tp.fit(X_train, y_train)
    # The export happens before the clock is read, so it is part of the
    # recorded time.
    tp.export('tpot_pipeline_' + name_dataset + '.py')
    # NOTE(review): assumes timer(start_time) returns a string, since it is
    # handed straight to write() below -- confirm against the timer helper.
    elapsed = timer(start_time)
    preds = tp.predict(X_test)

    # The context manager closes the file even if write() raises.
    with open("time_files/" + name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(elapsed)

    submission = pd.DataFrame({"id": id_test, "target": preds})

    submission.to_csv("submit_files/" + name_dataset + '_' +
                      'tpot_submission' + '.csv',
                      index=False)
Exemple #23
0
    def fit(x_train):
        """Pre-compute squared norms of ``x_train`` and return a timed query operator.

        Returns:
            A pair ``(f, elapsed)``: ``f(x_test)`` runs ``KNN_torch_fun`` on
            the whole test set at once and ``elapsed`` is the time measured
            around the squared-norm computation.
        """
        # Setup the K-NN estimator:
        x_train = tensor(x_train)

        # The "training" time here should be negligible:
        t_setup = timer()
        x_train_norm = (x_train ** 2).sum(-1)
        setup_time = timer() - t_setup

        def f(x_test):
            """Return ``(indices, elapsed)``; indices are converted to NumPy
            after the clock has been stopped."""
            queries = tensor(x_test)

            # Actual K-NN query:
            t_query = timer()
            neighbors = KNN_torch_fun(x_train, x_train_norm, queries, K, metric)
            query_time = timer() - t_query

            return neighbors.cpu().numpy(), query_time

        return f, setup_time

def hyperopt_fit_pred(X_train, y_train, X_test):
    """Fit the module-level ``hp`` estimator and return its predictions for ``X_test``."""
    hp.fit(X_train, y_train)
    predictions = hp.predict(X_test)
    return predictions


# (label, fit-and-predict callable) pairs; each callable is invoked as
# model(X_train, y_train, X_test) and must return the test predictions.
all_models = [
    ("hyperopt", hyperopt_fit_pred),
    ("autosk", autosk_fit_pred),
    ("tpot", tpot_fit_pred),
]

# One (name, elapsed) tuple per model, in the order the models were run.
submission_time = []
for name, model in all_models:
    print("Training with ", name)
    start_time = timer(None)
    # Copies keep one model's in-place changes from leaking into the next run.
    preds = model(X_train.copy(), y_train.copy(), X_test.copy())
    submission_time.append((name, timer(start_time)))
    submission = pd.DataFrame({"id": X_test["id"], "target": preds})

    submission.to_csv('submission_' + name + '.csv', index=False)

# Build one row per model. Indexing submission_time[0] / [1] picked the first
# two (name, time) tuples rather than the name and time columns.
sub_time = pd.DataFrame(submission_time, columns=["name", "time"])

sub_time.to_csv('submission_time.csv', index=False)
Exemple #25
0
 def f(x_test):
     """Search ``index`` for the K neighbours of ``x_test``; return ``(indices, elapsed)``."""
     t0 = timer(use_torch=False)
     _, neighbor_ids = index.search(x_test, K)
     query_time = timer(use_torch=False) - t0
     return neighbor_ids, query_time
Exemple #26
0
        def f(x_test):
            """Time ``KNN_fun`` on ``x_test``; return ``(indices, elapsed)``.

            The distances returned by ``KNN_fun`` are discarded.
            """
            t0 = timer()
            _, neighbor_ids = KNN_fun(x_test)
            query_time = timer() - t0

            return neighbor_ids, query_time