def fit(x_train):
    """Prepare a batched K-NN search over ``x_train`` and return a query function.

    The only pre-processing is caching the squared norm of every training
    point. Queries are answered in batches so that each call to
    ``KNN_torch_fun`` receives a bounded number of test points.

    ``tensor``, ``int_tensor``, ``timer``, ``KNN_torch_fun``, ``K`` and
    ``metric`` are not defined in this block; they are read from the
    enclosing scope.

    Args:
        x_train: Training points. Converted with ``tensor``; its shape is
            unpacked as ``(Ntrain, D)``, so it must be two-dimensional.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two ``timer()`` readings
        taken around the pre-processing step.
    """
    # Setup the K-NN estimator:
    x_train = tensor(x_train)
    Ntrain, D = x_train.shape
    start = timer()
    # The "training" time here should be negligible:
    x_train_norm = (x_train ** 2).sum(-1)
    elapsed = timer() - start

    def f(x_test):
        """Return ``(indices, elapsed)`` for the K-NN query of ``x_test``."""
        x_test = tensor(x_test)

        # Estimate the largest reasonable batch size:
        Ntest = x_test.shape[0]
        av_mem = int(5e8)  # 500 Mb of GPU memory per batch
        # Remember that a vector of D float32 number takes up 4*D bytes.
        # The outer max() keeps the batch size >= 1 even when x_test is
        # empty, which would otherwise make the division below fail.
        Ntest_loop = max(1, min(av_mem // (4 * D * Ntrain), Ntest))
        Nloop = (Ntest - 1) // Ntest_loop + 1  # ceil(Ntest / Ntest_loop)
        out = int_tensor(Ntest, K)

        start = timer()
        # Actual K-NN query, one batch of test points at a time:
        for k in range(Nloop):
            lo, hi = Ntest_loop * k, Ntest_loop * (k + 1)
            out[lo:hi, :] = KNN_torch_fun(
                x_train, x_train_norm, x_test[lo:hi, :], K, metric
            )
        elapsed = timer() - start

        indices = out.cpu().numpy()
        return indices, elapsed

    return f, elapsed
def h20_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Train H2O AutoML, then write the timing file and a 0/1 submission CSV.

    ``timer``, ``h2o``, ``H2OAutoML`` and ``pd`` are not defined in this
    block; they are read from module scope.

    Args:
        X_train: Training features (a frame supporting ``.copy()`` and
            column assignment). It is not modified.
        y_train: Training target, stored in a ``'target'`` column of a copy
            of ``X_train``.
        X_test: Test features passed to ``h2o.H2OFrame.from_python``.
        id_test: Values for the ``"id"`` column of the submission.
        name_dataset: Used to build both output file names.

    Side effects:
        Writes ``time_files/<name_dataset>_h2o`` and
        ``submit_files/<name_dataset>_h2o_submission.csv``.
    """
    # Work on a copy so the caller's frame never carries the temporary
    # 'target' column, even if training raises part-way through.
    train_df = X_train.copy()
    train_df['target'] = y_train

    start_time = timer(None)
    train = h2o.H2OFrame.from_python(train_df)
    test = h2o.H2OFrame.from_python(X_test)

    # Identify predictors and response
    x = train.columns
    y = "target"

    # (limited to 1 hour max runtime by default)
    aml = H2OAutoML()
    aml.train(x=x, y=y, training_frame=train)
    # NOTE(review): the value returned by timer(start_time) is written to the
    # file as-is, so it is presumably a string - confirm against `timer`.
    fit_time = timer(start_time)

    preds = aml.predict(test).as_data_frame()
    # Signal function: threshold each prediction row at 0.5
    preds_final = [1 if row > 0.5 else 0 for row in preds.values]

    # Context manager guarantees the handle is closed if write() raises.
    with open("time_files/" + name_dataset + '_' + 'h2o', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({"id": id_test, "target": preds_final})
    submission.to_csv(
        "submit_files/" + name_dataset + '_' + 'h2o' + '_submission.csv',
        index=False,
    )
def fit(x_train):
    """Build a ``faiss.IndexHNSWFlat`` over ``x_train`` and return a query closure.

    ``metric``, ``M``, ``K`` and ``faiss`` are read from the enclosing scope.

    Args:
        x_train: Training points; ``x_train.shape[1]`` is used as the index
            dimension.

    Returns:
        ``(f, elapsed)`` where ``f(x_test, efSearch=10)`` returns
        ``(indices, elapsed)`` and ``elapsed`` is the difference between two
        ``timer(use_torch=False)`` readings taken around ``index.add``.

    Raises:
        NotImplementedError: if ``metric`` is neither ``"euclidean"`` nor
            ``"angular"``.
    """
    from benchmark_utils import timer

    dim = x_train.shape[1]

    # Guard clause: only these two metrics are handled here.
    if metric not in ["euclidean", "angular"]:
        raise NotImplementedError(f"The '{metric}' distance is not supported.")

    hnsw_index = faiss.IndexHNSWFlat(dim, M)
    hnsw_index.hnsw.efConstruction = 500

    # Pre-processing: only the insertion of the training points is timed.
    t_begin = timer(use_torch=False)
    hnsw_index.add(x_train)
    build_elapsed = timer(use_torch=False) - t_begin

    # Return an operator for actual KNN queries:
    def f(x_test, efSearch=10):
        """Search ``x_test`` with the given ``efSearch`` and time the search."""
        faiss.ParameterSpace().set_index_parameter(hnsw_index, "efSearch", efSearch)
        t_query = timer(use_torch=False)
        _distances, neighbors = hnsw_index.search(x_test, K)
        query_elapsed = timer(use_torch=False) - t_query
        return neighbors, query_elapsed

    return f, build_elapsed
def h2o_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name, target_name):
    """Train H2O AutoML, then write the timing file and a submission CSV.

    ``timer``, ``h2o``, ``H2OAutoML`` and ``pd`` are not defined in this
    block; they are read from module scope.

    Args:
        X_train: Training features; copied before the target column is added.
        y_train: Training target, stored under ``target_name`` in the copy.
        X_test: Test features passed to ``h2o.H2OFrame.from_python``.
        id_test: Values for the ``id_name`` column of the submission.
        name_dataset: Used to build both output file names.
        id_name: Name of the identifier column in the submission.
        target_name: Name of the response column (training and submission).

    Side effects:
        Prints progress messages, writes ``<name_dataset>_h2o`` and
        ``submission_<name_dataset>_h2o.csv`` in the working directory.
    """
    X_train_cp = X_train.copy()
    X_train_cp[target_name] = y_train

    start_time = timer(None)
    # Call head(): printing the bare attribute shows the bound method instead
    # of the first rows.
    print(X_train_cp.head())
    train = h2o.H2OFrame.from_python(X_train_cp)
    test = h2o.H2OFrame.from_python(X_test)

    # Identify predictors and response
    x = train.columns
    y = target_name

    # (limited to 1 hour max runtime by default)
    aml = H2OAutoML()
    aml.train(x=x, y=y, training_frame=train)
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)
    print("FIT maked")

    # Take the first *column* (one value per test row). Indexing with [0]
    # selects the first row only, whose length does not match id_test.
    # NOTE(review): assumes the first column of the prediction frame holds
    # the predicted value - confirm against the H2O output layout.
    preds = aml.predict(test).as_data_frame().values[:, 0]
    print("Predict maked")
    print(preds)

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'h2o', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})
    submission.to_csv('submission_' + name_dataset + '_' + 'h2o.csv', index=False)
def f(x_test, efSearch=10):
    """Query ``index`` for the ``K`` nearest neighbors of ``x_test``.

    ``index``, ``K``, ``faiss`` and ``timer`` come from the enclosing scope.

    Args:
        x_test: Query points passed to ``index.search``.
        efSearch: Value assigned to the index's ``"efSearch"`` parameter
            before searching.

    Returns:
        ``(indices, elapsed)``; ``elapsed`` is the difference between two
        ``timer(use_torch=False)`` readings taken around the search only.
    """
    param_space = faiss.ParameterSpace()
    param_space.set_index_parameter(index, "efSearch", efSearch)

    t_begin = timer(use_torch=False)
    _distances, neighbors = index.search(x_test, K)
    t_end = timer(use_torch=False)

    return neighbors, t_end - t_begin
def fit(x_train):
    """Convert ``x_train`` with ``jax_tensor`` and return a batched K-NN query function.

    ``jax_tensor``, ``knn_jax_fun``, ``timer``, ``np``, ``K`` and ``metric``
    are not defined in this block; they are read from the enclosing scope.

    Args:
        x_train: Training points. After conversion its shape is unpacked as
            ``(Ntrain, D)``, so it must be two-dimensional.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two
        ``timer(use_torch=False)`` readings taken around the conversion.
    """
    # Setup the K-NN estimator:
    start = timer(use_torch=False)
    x_train = jax_tensor(x_train)
    elapsed = timer(use_torch=False) - start

    def f(x_test):
        """Return ``(indices, elapsed)`` for the K-NN query of ``x_test``."""
        x_test = jax_tensor(x_test)

        # Estimate the largest reasonable batch size
        av_mem = int(5e8)  # 500 Mb
        Ntrain, D = x_train.shape
        Ntest = x_test.shape[0]
        # The outer max() keeps the batch size >= 1 even when x_test is
        # empty, which would otherwise make the division below fail.
        Ntest_loop = max(1, min(av_mem // (4 * D * Ntrain), Ntest))
        Nloop = (Ntest - 1) // Ntest_loop + 1  # ceil(Ntest / Ntest_loop)
        indices = np.zeros((Ntest, K), dtype=int)

        start = timer(use_torch=False)
        # Actual K-NN query, one batch of test points at a time:
        for k in range(Nloop):
            lo, hi = Ntest_loop * k, Ntest_loop * (k + 1)
            indices[lo:hi, :] = knn_jax_fun(x_train, x_test[lo:hi, :], K, metric)
        elapsed = timer(use_torch=False) - start

        return indices, elapsed

    return f, elapsed
def f(x_test):
    """Run ``KNN.kneighbors`` on ``x_test`` and time the call.

    ``tensor``, ``timer`` and ``KNN`` come from the enclosing scope.

    Returns:
        ``(indices, elapsed)``: the result of ``kneighbors`` after
        ``.cpu().numpy()``, and the difference between two ``timer()``
        readings taken around the ``kneighbors`` call only.
    """
    queries = tensor(x_test)

    t_begin = timer()
    neighbors = KNN.kneighbors(queries)
    t_end = timer()

    # The device-to-host transfer is deliberately outside the timed region.
    return neighbors.cpu().numpy(), t_end - t_begin
def f(x_test):
    """Run ``knn_jax_fun`` on ``x_test`` and time the query.

    ``jax_tensor``, ``knn_jax_fun``, ``timer``, ``np``, ``x_train``, ``K``
    and ``metric`` come from the enclosing scope.

    Returns:
        ``(indices, elapsed)``: the query result as a NumPy array, and the
        difference between two ``timer(use_torch=False)`` readings. The
        conversion to NumPy is inside the timed region.
    """
    queries = jax_tensor(x_test)

    # Actual K-NN query:
    t_begin = timer(use_torch=False)
    neighbors = np.array(knn_jax_fun(x_train, queries, K, metric))
    t_end = timer(use_torch=False)

    return neighbors, t_end - t_begin
def f(x_test):
    """Run ``KNN_torch_fun`` on ``x_test`` and time the query.

    ``tensor``, ``timer``, ``KNN_torch_fun``, ``x_train``, ``x_train_norm``,
    ``K`` and ``metric`` come from the enclosing scope.

    Returns:
        ``(indices, elapsed)``: the query result after ``.cpu().numpy()``,
        and the difference between two ``timer()`` readings taken around the
        query only.
    """
    queries = tensor(x_test)

    t_begin = timer()
    # Actual K-NN query:
    neighbors = KNN_torch_fun(x_train, x_train_norm, queries, K, metric)
    t_end = timer()

    # The device-to-host transfer is deliberately outside the timed region.
    return neighbors.cpu().numpy(), t_end - t_begin
def f(x_test):
    """Apply ``KNN_fun`` to ``(x_test, x_train)`` and time the call.

    ``tensor``, ``timer``, ``KNN_fun`` and ``x_train`` come from the
    enclosing scope.

    Returns:
        ``(indices, elapsed)``: the query result after ``.cpu().numpy()``,
        and the difference between two ``timer()`` readings taken around the
        query only.
    """
    queries = tensor(x_test)

    t_begin = timer()
    # Actual K-NN query:
    neighbors = KNN_fun(queries, x_train)
    t_end = timer()

    # The device-to-host transfer is deliberately outside the timed region.
    return neighbors.cpu().numpy(), t_end - t_begin
def fit(x_train):
    """Fit ``KNN_meth`` on ``x_train`` and return a timed query closure.

    ``KNN_meth`` and ``timer`` come from the enclosing scope.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two ``timer()`` readings
        taken around the fit.
    """
    # Setup the K-NN estimator:
    t_begin = timer()
    fitted = KNN_meth.fit(x_train)
    query = fitted.kneighbors
    fit_elapsed = timer() - t_begin

    def f(x_test):
        """Query the fitted estimator; the returned distances are discarded."""
        t_query = timer()
        _distances, neighbors = query(x_test)
        query_elapsed = timer() - t_query
        return neighbors, query_elapsed

    return f, fit_elapsed
def hyperopt_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a ``HyperoptEstimator``, then write the timing file and a submission CSV.

    ``HyperoptEstimator``, ``timer`` and ``pd`` are read from module scope.

    Args:
        X_train: Training features; ``.values`` is passed to ``fit``.
        y_train: Training target; ``.values`` is passed to ``fit``.
        X_test: Test features; ``.values`` is passed to ``predict``.
        id_test: Values for the ``"id"`` column of the submission.
        name_dataset: Used to build both output file names.

    Side effects:
        Writes ``<name_dataset>_hyperopt`` and
        ``<name_dataset>_hyperopt_submission.csv`` in the working directory.
    """
    hp = HyperoptEstimator()

    start_time = timer(None)
    hp.fit(X_train.values, y_train.values)
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = hp.predict(X_test.values)

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'hyperopt', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(name_dataset + '_' + 'hyperopt' + '_submission.csv', index=False)
def fit(x_train):
    """Fit ``KNN`` on ``x_train`` and return a timed query closure.

    ``tensor``, ``timer``, ``KNN``, ``clusters`` and ``a`` come from the
    enclosing scope.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two ``timer()`` readings
        taken around ``KNN.fit``.
    """
    train_points = tensor(x_train)

    t_begin = timer()
    KNN.fit(train_points, clusters=clusters, a=a)
    fit_elapsed = timer() - t_begin

    def f(x_test):
        """Run ``KNN.kneighbors`` on ``x_test``; only that call is timed."""
        queries = tensor(x_test)

        t_query = timer()
        neighbors = KNN.kneighbors(queries)
        query_elapsed = timer() - t_query

        return neighbors.cpu().numpy(), query_elapsed

    return f, fit_elapsed
def hyperopt_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name, target_name):
    """Fit a regression ``HyperoptEstimator``, then write timing and submission files.

    ``HyperoptEstimator``, ``hpsklearn``, ``timer`` and ``pd`` are read from
    module scope.

    Args:
        X_train: Training features; ``.values`` is passed to ``fit``.
        y_train: Training target; ``.values`` is passed to ``fit``.
        X_test: Test features; ``.values`` is passed to ``predict``.
        id_test: Values for the ``id_name`` column of the submission.
        name_dataset: Used to build both output file names.
        id_name: Name of the identifier column in the submission.
        target_name: Name of the prediction column in the submission.

    Side effects:
        Writes ``<name_dataset>_hyperopt`` and
        ``submission_<name_dataset>_hyperopt.csv`` in the working directory.
    """
    hp = HyperoptEstimator(regressor=hpsklearn.components.any_regressor('reg'))

    start_time = timer(None)
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    hp.fit(X_train.values, y_train.values)
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = hp.predict(X_test.values)

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'hyperopt', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})
    submission.to_csv('submission_' + name_dataset + '_' + 'hyperopt.csv', index=False)
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a ``TPOTClassifier``, export its pipeline, and write timing and submission files.

    ``TPOTClassifier``, ``timer`` and ``pd`` are read from module scope.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training target passed to ``fit``.
        X_test: Test features passed to ``predict``.
        id_test: Values for the ``"id"`` column of the submission.
        name_dataset: Used to build the timing and submission file names.

    Side effects:
        Writes ``tpot_pipeline_dont_overfit.py``, ``<name_dataset>_tpot`` and
        ``<name_dataset>_tpot_submission.csv`` in the working directory.
    """
    tp = TPOTClassifier(verbosity=3)

    start_time = timer(None)
    tp.fit(X_train, y_train)
    # The pipeline export happens before the second timer call, so it is
    # included in the recorded time.
    tp.export('tpot_pipeline_dont_overfit.py')
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = tp.predict(X_test)

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(name_dataset + '_' + 'tpot' + '_submission.csv', index=False)
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name, target_name):
    """Fit a ``TPOTRegressor``, export its pipeline, and write timing and submission files.

    ``TPOTRegressor``, ``timer`` and ``pd`` are read from module scope.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training target passed to ``fit``.
        X_test: Test features passed to ``predict``.
        id_test: Values for the ``id_name`` column of the submission.
        name_dataset: Used to build the timing and submission file names.
        id_name: Name of the identifier column in the submission.
        target_name: Name of the prediction column in the submission.

    Side effects:
        Writes ``tpot_pipeline_dont_overfit.py``, ``<name_dataset>_tpot`` and
        ``submission_<name_dataset>_tpot.csv`` in the working directory.
    """
    tp = TPOTRegressor(verbosity=2)

    start_time = timer(None)
    tp.fit(X_train, y_train)
    # The pipeline export happens before the second timer call, so it is
    # included in the recorded time.
    tp.export('tpot_pipeline_dont_overfit.py')
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = tp.predict(X_test)

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})
    submission.to_csv('submission_' + name_dataset + '_' + 'tpot.csv', index=False)
def autosk_fit_pred(X_train, y_train, X_test, id_test, name_dataset, id_name, target_name):
    """Fit an ``AutoSklearnRegressor``, then write timing and submission files.

    ``autosklearn``, ``timer`` and ``pd`` are read from module scope.

    Args:
        X_train: Training features; a fresh copy is passed to each of
            ``fit`` and ``refit``.
        y_train: Training target; copied in the same way.
        X_test: Test features; a copy is passed to ``predict``.
        id_test: Values for the ``id_name`` column of the submission.
        name_dataset: Used to build both output file names.
        id_name: Name of the identifier column in the submission.
        target_name: Name of the prediction column in the submission.

    Side effects:
        Writes ``<name_dataset>_autosk`` and
        ``submission_<name_dataset>_autosk.csv`` in the working directory.
    """
    ak = autosklearn.regression.AutoSklearnRegressor(ml_memory_limit=51000)

    start_time = timer(None)
    ak.fit(X_train.copy(), y_train.copy())
    ak.refit(X_train.copy(), y_train.copy())
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = ak.predict(X_test.copy())

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'autosk', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({id_name: id_test, target_name: preds})
    submission.to_csv('submission_' + name_dataset + '_' + 'autosk.csv', index=False)
def autosk_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit an ``AutoSklearnClassifier``, then write timing and submission files.

    ``autosklearn``, ``timer`` and ``pd`` are read from module scope.

    Args:
        X_train: Training features; a fresh copy is passed to each of
            ``fit`` and ``refit``.
        y_train: Training target; copied in the same way.
        X_test: Test features; a copy is passed to ``predict``.
        id_test: Values for the ``"id"`` column of the submission.
        name_dataset: Used to build both output file names.

    Side effects:
        Writes ``<name_dataset>_autosk`` and
        ``<name_dataset>_autosk_submission.csv`` in the working directory.
    """
    ak = autosklearn.classification.AutoSklearnClassifier(
        ml_memory_limit=51000)

    start_time = timer(None)
    ak.fit(X_train.copy(), y_train.copy())
    ak.refit(X_train.copy(), y_train.copy())
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = ak.predict(X_test.copy())

    # Context manager guarantees the handle is closed if write() raises.
    with open(name_dataset + '_' + 'autosk', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(name_dataset + '_' + 'autosk' + '_submission.csv', index=False)
def fit(x_train):
    """Convert ``x_train`` with ``jax_tensor`` and return a timed K-NN query closure.

    ``jax_tensor``, ``knn_jax_fun``, ``timer``, ``np``, ``K`` and ``metric``
    come from the enclosing scope.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two
        ``timer(use_torch=False)`` readings taken around the conversion.
    """
    # Setup the K-NN estimator: only the conversion of the data is timed.
    t_begin = timer(use_torch=False)
    x_train = jax_tensor(x_train)
    setup_elapsed = timer(use_torch=False) - t_begin

    def f(x_test):
        """Query all of ``x_test`` in a single ``knn_jax_fun`` call."""
        queries = jax_tensor(x_test)

        # Actual K-NN query; the NumPy conversion is inside the timed region.
        t_query = timer(use_torch=False)
        neighbors = np.array(knn_jax_fun(x_train, queries, K, metric))
        query_elapsed = timer(use_torch=False) - t_query

        return neighbors, query_elapsed

    return f, setup_elapsed
def fit(x_train):
    """Build a GPU FAISS index over ``x_train`` and return a timed query closure.

    ``faiss``, ``timer``, ``metric``, ``algorithm``, ``use_float16``,
    ``res``, ``deviceId``, ``nlist``, ``nprobe`` and ``K`` are not defined
    in this block; they are read from the enclosing scope.

    Args:
        x_train: Training points; ``x_train.shape[1]`` is used as the index
            dimension.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two
        ``timer(use_torch=False)`` readings taken around ``index.add`` and
        the ``nprobe`` assignment. IVF training is not included.

    Raises:
        NotImplementedError: if ``metric`` is neither ``"euclidean"`` nor
            ``"angular"``, or if ``algorithm`` is neither ``"flat"`` nor
            ``"ivfflat"``.
    """
    D = x_train.shape[1]

    co = faiss.GpuClonerOptions()
    co.useFloat16 = use_float16

    if metric in ["euclidean", "angular"]:
        if algorithm == "flat":
            index = faiss.IndexFlatL2(D)  # May be used as quantizer
            index = faiss.index_cpu_to_gpu(res, deviceId, index, co)

        elif algorithm == "ivfflat":
            quantizer = faiss.IndexFlatL2(D)  # the other index
            faiss_metric = (
                faiss.METRIC_L2
                if metric == "euclidean"
                else faiss.METRIC_INNER_PRODUCT
            )
            index = faiss.IndexIVFFlat(quantizer, D, nlist, faiss_metric)
            index = faiss.index_cpu_to_gpu(res, deviceId, index, co)

            # The index must be trained before vectors can be added to it.
            assert not index.is_trained
            index.train(x_train)
            assert index.is_trained

        else:
            # Without this branch `index` would be unbound and the failure
            # would only surface later as an UnboundLocalError.
            raise NotImplementedError(
                f"The '{algorithm}' algorithm is not supported."
            )

    else:
        raise NotImplementedError(f"The '{metric}' distance is not supported.")

    # Pre-processing:
    start = timer(use_torch=False)
    index.add(x_train)
    index.nprobe = nprobe
    elapsed = timer(use_torch=False) - start

    # Return an operator for actual KNN queries:
    def f(x_test):
        """Search ``x_test`` for ``K`` neighbors; the distances are discarded."""
        start = timer(use_torch=False)
        distances, indices = index.search(x_test, K)
        elapsed = timer(use_torch=False) - start
        return indices, elapsed

    return f, elapsed
def fit(x_train):
    """Build a symbolic K-NN operator for ``metric`` and return a timed query closure.

    ``tensor``, ``timer``, ``Vi``, ``Vj``, ``K`` and ``metric`` come from
    the enclosing scope.

    Args:
        x_train: Training points; converted with ``tensor``, and
            ``x_train.shape[1]`` is used as the variable dimension.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two ``timer()`` readings
        taken around the construction of the operator.

    Raises:
        NotImplementedError: if ``metric`` is not one of ``"euclidean"``,
            ``"manhattan"``, ``"angular"`` or ``"hyperbolic"``.
    """

    def _symbolic_distance(p, q):
        """Return the symbolic distance between ``p`` and ``q`` for ``metric``."""
        if metric == "euclidean":
            return ((p - q)**2).sum(-1)
        if metric == "manhattan":
            return (p - q).abs().sum(-1)
        if metric == "angular":
            return -(p | q)
        if metric == "hyperbolic":
            return ((p - q)**2).sum(-1) / (p[0] * q[0])
        raise NotImplementedError(
            f"The '{metric}' distance is not supported.")

    # Setup the K-NN estimator:
    x_train = tensor(x_train)
    t_begin = timer()

    # Encoding as KeOps LazyTensors:
    dim = x_train.shape[1]
    X_i = Vi(0, dim)  # Purely symbolic "i" variable, without any data array
    X_j = Vj(1, dim)  # Purely symbolic "j" variable, without any data array

    # Symbolic distance matrix, then the K-NN query operator:
    D_ij = _symbolic_distance(X_i, X_j)
    knn_operator = D_ij.argKmin(K, dim=1)

    # N.B.: The "training" time here should be negligible.
    setup_elapsed = timer() - t_begin

    def f(x_test):
        """Apply the operator to ``(x_test, x_train)``; only that call is timed."""
        queries = tensor(x_test)

        t_query = timer()
        # Actual K-NN query:
        neighbors = knn_operator(queries, x_train)
        query_elapsed = timer() - t_query

        return neighbors.cpu().numpy(), query_elapsed

    return f, setup_elapsed
def tpot_fit_pred(X_train, y_train, X_test, id_test, name_dataset):
    """Fit a ``TPOTClassifier``, export its pipeline, and write timing and submission files.

    ``TPOTClassifier``, ``timer`` and ``pd`` are read from module scope.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training target passed to ``fit``.
        X_test: Test features passed to ``predict``.
        id_test: Values for the ``"id"`` column of the submission.
        name_dataset: Used to build all three output file names.

    Side effects:
        Writes ``tpot_pipeline_<name_dataset>.py`` in the working directory,
        ``time_files/<name_dataset>_tpot`` and
        ``submit_files/<name_dataset>_tpot_submission.csv``. The
        ``time_files`` and ``submit_files`` directories are not created here.
    """
    tp = TPOTClassifier(generations=5, population_size=20, random_state=42, verbosity=2)

    start_time = timer(None)
    tp.fit(X_train, y_train)
    # The pipeline export happens before the second timer call, so it is
    # included in the recorded time.
    tp.export('tpot_pipeline_' + name_dataset + '.py')
    # NOTE(review): written to the timing file as-is, so presumably a string -
    # confirm against `timer`.
    fit_time = timer(start_time)

    preds = tp.predict(X_test)

    # Context manager guarantees the handle is closed if write() raises.
    with open("time_files/" + name_dataset + '_' + 'tpot', "w") as time_out:
        time_out.write(fit_time)

    submission = pd.DataFrame({"id": id_test, "target": preds})
    submission.to_csv(
        "submit_files/" + name_dataset + '_' + 'tpot_submission' + '.csv',
        index=False,
    )
def fit(x_train):
    """Cache the squared norms of ``x_train`` and return a timed K-NN query closure.

    ``tensor``, ``timer``, ``KNN_torch_fun``, ``K`` and ``metric`` come from
    the enclosing scope.

    Returns:
        ``(f, elapsed)`` where ``f(x_test)`` returns ``(indices, elapsed)``
        and ``elapsed`` is the difference between two ``timer()`` readings
        taken around the norm computation.
    """
    # Setup the K-NN estimator:
    train_points = tensor(x_train)

    t_begin = timer()
    # The "training" time here should be negligible:
    squared_norms = (train_points ** 2).sum(-1)
    setup_elapsed = timer() - t_begin

    def f(x_test):
        """Query all of ``x_test`` in a single ``KNN_torch_fun`` call."""
        queries = tensor(x_test)

        t_query = timer()
        # Actual K-NN query:
        neighbors = KNN_torch_fun(train_points, squared_norms, queries, K, metric)
        query_elapsed = timer() - t_query

        # The device-to-host transfer is deliberately outside the timed region.
        return neighbors.cpu().numpy(), query_elapsed

    return f, setup_elapsed
def hyperopt_fit_pred(X_train, y_train, X_test):
    """Fit the module-level ``hp`` estimator and return its predictions for ``X_test``."""
    hp.fit(X_train, y_train)
    return hp.predict(X_test)


# (label, callable) pairs; each callable takes (X_train, y_train, X_test)
# and returns the predictions for X_test.
all_models = [
    ("hyperopt", hyperopt_fit_pred),
    ("autosk", autosk_fit_pred),
    ("tpot", tpot_fit_pred),
]

# One (name, time) tuple per model, in the order the models were run.
submission_time = []
for name, model in all_models:
    print("Training with ", name)
    start_time = timer(None)
    # Each model gets its own copies so that one model cannot alter the
    # inputs seen by the next.
    preds = model(X_train.copy(), y_train.copy(), X_test.copy())
    submission_time.append((name, timer(start_time)))

    submission = pd.DataFrame({"id": X_test["id"], "target": preds})
    submission.to_csv('submission_' + name + '.csv', index=False)

# Build one row per model. Indexing submission_time[0] / [1] would pick the
# first two (name, time) tuples rather than the name and time columns.
sub_time = pd.DataFrame(submission_time, columns=["name", "time"])
sub_time.to_csv('submission_time.csv', index=False)
def f(x_test):
    """Search ``index`` for the ``K`` nearest neighbors of ``x_test`` and time it.

    ``index``, ``K`` and ``timer`` come from the enclosing scope.

    Returns:
        ``(indices, elapsed)``; the distances returned by the search are
        discarded, and ``elapsed`` is the difference between two
        ``timer(use_torch=False)`` readings taken around the search.
    """
    t_begin = timer(use_torch=False)
    _distances, neighbors = index.search(x_test, K)
    t_end = timer(use_torch=False)

    return neighbors, t_end - t_begin
def f(x_test):
    """Call ``KNN_fun`` on ``x_test`` and time it.

    ``KNN_fun`` and ``timer`` come from the enclosing scope.

    Returns:
        ``(indices, elapsed)``; the distances returned by ``KNN_fun`` are
        discarded, and ``elapsed`` is the difference between two ``timer()``
        readings taken around the call.
    """
    t_begin = timer()
    _distances, neighbors = KNN_fun(x_test)
    t_end = timer()

    return neighbors, t_end - t_begin