def test_attribute_selected_classifier_1(self):
    algorithm = "AttributeSelectedClassifier"
    service = "classification"
    params = {'-D': False, '-U': False, '-R': False, '-B': False, '-L': False, '-A': False}
    # Assert directly, like the other tests; the previous if/else around the same
    # call was tautological and could never fail.
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_naivebayesbinomial_1(self):
    algorithm = "NaiveBayesBinomial"
    params = {
        'auc_type': 'auto', 'balance_classes': False, 'compute_metrics': True,
        'eps_prob': 30, 'eps_sdev': 0, 'fold_assignment': 'auto',
        'gainslift_bins': -1, 'ignore_const_cols': True,
        'keep_cross_validation_fold_assignment': False,
        'keep_cross_validation_models': True,
        'keep_cross_validation_predictions': False,
        'laplace': 0, 'max_after_balance_size': 5, 'max_confusion_matrix_size': 0,
        'max_runtime_secs': 0, 'min_prob': 0.001, 'min_sdev': 0.001,
        'score_each_iteration': False, 'seed': -1
    }
    self.assertTrue(hyper_parameter_check(self.library, self.service, algorithm, params))
def test_bagging_1(self):
    algorithm = "Bagging"
    service = "classification"
    params = {
        '-P': 50, '-O': True, '-print': False,
        '-store-out-of-bag-predictions': False,
        '-output-out-of-bag-complexity-statistics': True,
        '-represent-copies-using-weights': True,
        '-S': 2, '-num-slots': 1, '-I': 2, '-D': False, '-R': False
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_gradientboostingmachines_1(self):
    algorithm = "GradientBoostingMachines"
    params = {
        'auc_type': 'auto', 'balance_classes': False, 'build_tree_one_node': False,
        'calibrate_model': False, 'categorical_encoding': 'auto',
        'check_constant_response': True, 'col_sample_rate': 1,
        'col_sample_rate_change_per_level': 1, 'col_sample_rate_per_tree': 1,
        'distribution': 'auto', 'fold_assignment': 'auto', 'gainslift_bins': -1,
        'histogram_type': 'auto', 'huber_alpha': 0.9, 'ignore_const_cols': True,
        'keep_cross_validation_fold_assignment': False,
        'keep_cross_validation_models': True,
        'keep_cross_validation_predictions': False,
        'learn_rate': 0.1, 'learn_rate_annealing': 1, 'max_abs_leafnode_pred': 1.7976,
        'max_after_balance_size': 5.0, 'max_confusion_matrix_size': 20,
        'max_depth': 5, 'max_runtime_secs': 0, 'min_rows': 10,
        'min_split_improvement': 1e-05, 'nbins': 20, 'nbins_cats': 1024,
        'nbins_top_level': 1024, 'ntrees': 50, 'pred_noise_bandwidth': 0,
        'quantile_alpha': 0.5, 'r2_stopping': 1.7976, 'sample_rate': 1,
        'score_each_iteration': False, 'score_tree_interval': 0, 'seed': -1,
        'stopping_metric': 'auto', 'stopping_rounds': 0,
        'stopping_tolerance': 0.001, 'tweedie_power': 1.5
    }
    self.assertTrue(hyper_parameter_check(self.library, self.service, algorithm, params))
def test_linearregression_1(self):
    algorithm = "LinearRegression"
    params = {
        'HGLM': False, 'auc_type': 'auto', 'balance_classes': False,
        'beta_epsilon': 0.0001, 'calc_like': False, 'cold_start': False,
        'compute_p_values': False, 'early_stopping': True, 'family': 'auto',
        'fold_assignment': 'auto', 'gradient_epsilon': -1, 'ignore_const_cols': True,
        'intercept': True, 'keep_cross_validation_fold_assignment': False,
        'keep_cross_validation_models': True,
        'keep_cross_validation_predictions': False,
        'lambda_min_ratio': -1, 'lambda_search': False, 'link': 'family_default',
        'max_active_predictors': -1, 'max_after_balance_size': 5.0,
        'max_confusion_matrix_size': 20, 'max_iterations': -1, 'max_runtime_secs': 0,
        'missing_values_handling': 'mean_imputation', 'nlambdas': -1,
        'non_negative': False, 'obj_reg': -1, 'objective_epsilon': -1, 'prior': -1,
        'remove_collinear_columns': False, 'score_each_iteration': False,
        'score_iteration_interval': -1, 'seed': -1, 'solver': 'auto',
        'standardize': True, 'stopping_metric': 'auto', 'stopping_rounds': 0,
        'stopping_tolerance': 0.001, 'theta': 1e-10, 'tweedie_link_power': 1,
        'tweedie_variance_power': 0
    }
    self.assertTrue(hyper_parameter_check(self.library, self.service, algorithm, params))
def test_randomforest_1(self):
    algorithm = "RandomForest"
    params = {
        'auc_type': 'auto', 'balance_classes': False, 'binomial_double_trees': False,
        'build_tree_one_node': False, 'calibrate_model': False,
        'categorical_encoding': 'auto', 'check_constant_response': True,
        'col_sample_rate_change_per_level': 1, 'col_sample_rate_per_tree': 1,
        'distribution': 'auto', 'fold_assignment': 'auto', 'histogram_type': 'auto',
        'keep_cross_validation_fold_assignment': False,
        'keep_cross_validation_models': True,
        'keep_cross_validation_predictions': False,
        'max_after_balance_size': 1.0, 'max_confusion_matrix_size': 0,
        'max_depth': 0, 'max_runtime_secs': 0, 'min_rows': 1,
        'min_split_improvement': 1e-05, 'mtries': -1, 'nbins': 2,
        'nbins_cats': 1024, 'nbins_top_level': 1024, 'ntrees': 50,
        'r2_stopping': -1.7976, 'sample_rate': 0.632, 'score_each_iteration': False,
        'score_tree_interval': 10, 'stopping_metric': 'auto', 'stopping_rounds': 10,
        'stopping_tolerance': 0.001, 'class_sampling_factors': None,
        'sample_rate_per_class': None
    }
    self.assertTrue(hyper_parameter_check(self.library, self.service, algorithm, params))
def test_kmeans_1(self):
    algorithm = "SimpleKMeans"
    service = "clustering"
    params = {
        '-init': 0, '-C': False, '-max-candidates': 100, '-periodic-pruning': 10000,
        '-min-density': 2, '-t2': -1.0, '-t1': -1.5, '-V': False, '-M': False,
        '-I': 1, '-O': False, '-fast': False, '-num-slots': 1, '-S': 10,
        '-output-debug-info': False, '-do-not-check-capabilities': False
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def cluster(self, service, algorithm, dataset, features, lib='weka',
            number_of_clusters=2, cluster_type="Centroid", model_name=None,
            params=None, dataset_source=None):
    """
    Trains a clustering model.

    :param service: Clustering task to perform. Valid parameter value is CLUSTER.
    :param algorithm: Algorithm used for training the model.
    :param dataset: Dataset file location in DLTK storage.
    :param features: List of column names used for training the clustering model.
    :param lib: Library for clustering the model. Currently supported libraries are
        DLTK, weka, h2o and scikit.
    :param number_of_clusters: Number of clusters the dataset will be grouped into.
    :param cluster_type: Type of clustering, e.g. "Centroid".
    :param model_name: The model will be saved with this name.
    :param params: Additional hyperparameters for the algorithm, passed as a dict.
    :param dataset_source: To specify the data source.
        None: Dataset file from DLTK storage will be used.
        database: Query from the connected database will be used.
    :return: A json object containing model info.
    """
    service, library, algorithm, features, label, train_percentage = validate_parameters(
        service, lib, algorithm, features, "None", cluster=True)
    # If additional parameters were passed, check whether they are valid.
    if params is not None:
        hyper_parameter_flag = hyper_parameter_check(library, service, algorithm, params)
        assert hyper_parameter_flag, "Please check the params, training failed due to incorrect values"
    url = self.base_url + '/machine/cluster/'
    headers = {'ApiKey': self.api_key, 'Content-type': 'application/json'}
    if params is None:
        params = {}
    if model_name is None:
        model_name = algorithm
    if dataset_source == "database":
        body = {
            'library': lib,
            'task': 'CLUSTER',
            'service': service,
            'jobType': 'DATABASE',
            'queryId': dataset,
            'config': {
                'name': model_name,
                'algorithm': algorithm,
                'numOfClusters': int(number_of_clusters),
                'epsilon': 0.1,
                'features': features,
                'params': params,
                'clusterType': cluster_type
            }
        }
    else:
        body = {
            'library': lib,
            'task': 'CLUSTER',
            'service': service,
            'config': {
                'name': model_name,
                'algorithm': algorithm,
                'datasetUrl': dataset,
                'numOfClusters': int(number_of_clusters),
                'epsilon': 0.1,
                'features': features,
                'params': params,
                'clusterType': cluster_type
            }
        }
    body = json.dumps(body)
    response = requests.post(url=url, data=body, headers=headers)
    response = response.json()
    return response
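# A minimal usage sketch for the cluster() call above. The client class name
# (DltkAiClient), the API key, the dataset path and the feature names are
# assumptions for illustration only; substitute values from your own setup.
#
#   client = DltkAiClient(api_key="YOUR_API_KEY")   # hypothetical client instance
#   response = client.cluster(
#       service="CLUSTER",                           # per the docstring above
#       algorithm="SimpleKMeans",
#       dataset="datasets/iris.csv",                 # file already uploaded to DLTK storage
#       features=["sepal_length", "sepal_width", "petal_length", "petal_width"],
#       lib="weka",
#       number_of_clusters=3,
#   )
#   print(response)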
def test_kstar_1(self):
    algorithm = "KStar"
    service = "classification"
    params = {'-B': 50, '-E': True, '-M': 'a'}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_libsvm_1(self):
    algorithm = "LibSVM"
    service = "classification"
    params = {
        '-S': 3, '-K': 1, '-D': 1, '-R': 0, '-C': 0.5, '-N': 1, '-Z': True,
        '-J': True, '-V': True, '-P': 0.5, '-M': 20, '-E': 0.1, '-H': False,
        '-W': 1, '-B': False, '-seed': 1
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_naive_bayes_multinomial_1(self):
    algorithm = "NaiveBayesMultinomial"
    service = "classification"
    params = {
        '-output-debug-info': True, '-do-not-check-capabilities': False,
        '-num-decimal-places': 3, '-batch-size': 50
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_canopy_1(self):
    algorithm = "Canopy"
    service = "clustering"
    params = {
        '-max-candidates': 100, '-periodic-pruning': 10000, '-min-density': 2,
        '-t2': -1.0, '-t1': -1.5, '-M': False, '-S': 1,
        '-output-debug-info': False, '-do-not-check-capabilities': False
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_random_tree_1(self):
    algorithm = "RandomTree"
    service = "classification"
    params = {
        '-K': 0, '-M': 2, '-V': 0.1, '-S': 2, '-depth': 1, '-N': 1, '-U': True,
        '-B': True, '-output-debug-info': False, '-do-not-check-capabilities': True,
        '-num-decimal-places': 1
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_ibk_1(self):
    algorithm = "IBk"
    service = "classification"
    params = {'-I': False, '-F': True, '-K': 2, '-E': True, '-W': True, '-X': False}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_deeplearning_1(self):
    algorithm = "DeepLearning"
    params = {
        'activation': 'rectifier', 'adaptive_rate': True, 'auc_type': 'auto',
        'autoencoder': False, 'average_activation': 0.0, 'balance_classes': False,
        'categorical_encoding': 'auto', 'classification_stop': 0, 'col_major': False,
        'diagnostics': True, 'distribution': 'auto', 'elastic_averaging': False,
        'elastic_averaging_moving_rate': 0.9, 'elastic_averaging_regularization': 0.001,
        'epochs': 10, 'epsilon': 1e-08, 'export_weights_and_biases': False,
        'fast_mode': True, 'fold_assignment': 'auto', 'force_load_balance': True,
        'huber_alpha': 0.9, 'ignore_const_cols': True,
        'initial_weight_distribution': 'uniform_adaptive', 'initial_weight_scale': 0,
        'input_dropout_ratio': 0, 'keep_cross_validation_fold_assignment': False,
        'keep_cross_validation_models': True,
        'keep_cross_validation_predictions': False,
        'l1': 0, 'l2': 0, 'loss': 'automatic', 'max_after_balance_size': 5.0,
        'max_categorical_features': 2147483647, 'max_confusion_matrix_size': 20,
        'max_runtime_secs': 0.0, 'max_w2': 3.4028235e+38, 'mini_batch_size': 1,
        'missing_values_handling': 'mean_imputation', 'momentum_ramp': 1000000,
        'momentum_stable': 0, 'momentum_start': 0,
        'nesterov_accelerated_gradient': True, 'overwrite_with_best_model': True,
        'quantile_alpha': 0.5, 'quiet_mode': False, 'rate': 0.005,
        'rate_annealing': 1e-06, 'rate_decay': 1, 'regression_stop': 1e-06,
        'replicate_training_data': True, 'reproducible': False, 'rho': 0.99,
        'score_duty_cycle': 0.1, 'score_each_iteration': False, 'score_interval': 5,
        'score_training_samples': 10000, 'score_validation_samples': 0,
        'score_validation_sampling': 'uniform', 'seed': -1,
        'shuffle_training_data': False, 'single_node_mode': False, 'sparse': False,
        'sparsity_beta': 0, 'standardize': True, 'stopping_metric': 'auto',
        'stopping_rounds': 5, 'stopping_tolerance': 0,
        'target_ratio_comm_to_comp': 0.05, 'train_samples_per_iteration': -2,
        'tweedie_power': 1.5, 'use_all_factor_levels': True,
        'variable_importances': True
    }
    self.assertTrue(hyper_parameter_check(self.library, self.service, algorithm, params))
def test_make_density_based_clusterer_1(self):
    algorithm = "MakeDensityBasedClusterer"
    service = "clustering"
    params = {'-M': 1e-06, '-S': 10, '-V': False}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_farthest_first_1(self):
    algorithm = "FarthestFirst"
    service = "clustering"
    params = {'-S': 1}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_smo_1(self):
    algorithm = "SMO"
    service = "classification"
    params = {
        '-no-checks': True, '-C': 2, '-N': 1, '-L': 0.1, '-P': 0.001, '-M': False,
        '-V': 1, '-W': 1, '-output-debug-info': False,
        '-do-not-check-capabilities': True, '-num-decimal-places': 2
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_multilayer_perceptron_1(self):
    algorithm = "MultilayerPerceptron"
    service = "classification"
    params = {
        '-L': 0.4, '-M': 0.5, '-N': 200, '-V': 50, '-S': 1, '-E': 21, '-A': False,
        '-B': False, '-H': "a", '-C': False, '-I': False, '-R': False, '-D': False
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_linear_regression_1(self):
    algorithm = "LinearRegression"
    service = "regression"
    params = {
        '-S': 2, '-C': True, '-R': 0.00001, '-minimal': False,
        '-additional-stats': True, '-output-debug-info': False,
        '-do-not-check-capabilities': False
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_random_forest_1(self):
    algorithm = "RandomForest"
    service = "classification"
    params = {
        '-P': 99, '-O': False, '-store-out-of-bag-predictions': False,
        '-output-out-of-bag-complexity-statistics': False, '-print': False,
        '-attribute-importance': False, '-I': 80, '-num-slots': 1, '-K': 0,
        '-M': 2, '-V': 0.1, '-S': 1, '-depth': 1, '-N': 0, '-U': True, '-B': True,
        '-output-debug-info': True, '-do-not-check-capabilities': False,
        '-num-decimal-places': 1
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_logistic_1(self):
    algorithm = "Logistic"
    service = "classification"
    params = {'-S': False, '-M': 2}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_adaboostm1_1(self):
    algorithm = "AdaBoostM1"
    service = "classification"
    params = {'-P': 99, '-Q': False, '-S': 2, '-I': 20, '-D': False}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_additive_regression_1(self):
    algorithm = "AdditiveRegression"
    service = "regression"
    params = {'-S': 0.4, '-I': 5, '-A': False, '-D': False}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_hierarchical_clusterer_1(self):
    algorithm = "HierarchicalClusterer"
    service = "clustering"
    params = {'-L': 'SINGLE', '-P': False, '-D': False, '-B': False}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_em_1(self):
    algorithm = "EM"
    service = "clustering"
    params = {
        '-X': 5, '-K': 10, '-max': -1, '-ll-cv': 1e-06, '-I': 100,
        '-ll-iter': 1e-06, '-V': False, '-M': 1e-06, '-O': False, '-num-slots': 1,
        '-S': 100, '-output-debug-info': False, '-do-not-check-capabilities': False
    }
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def test_decision_table_1(self):
    algorithm = "DecisionTable"
    service = "classification"
    params = {'-X': 2, '-E': 'mae', '-I': True, '-R': True, '-P': False}
    self.assertTrue(hyper_parameter_check(self.library, service, algorithm, params))
def train(self, task, algorithm, dataset, label, features, model_name=None,
          lib="weka", train_percentage=80, folds=5, cross_validation=False,
          params=None, dataset_source=None, evaluation_plots=False):
    """
    Trains a classification or regression model.

    :param task: Training task to perform. Valid parameter values are classification, regression.
    :param algorithm: Algorithm used for training the model.
    :param dataset: Dataset file location in DLTK storage.
    :param label: Target variable.
    :param features: List of features used for training the model.
    :param model_name: The model will be saved with this name.
    :param lib: Library for training the model. Currently supported libraries are scikit, h2o and weka.
    :param train_percentage: Percentage of the data used for training the model.
        The rest of the data is used to test the model.
    :param folds: Number of folds for cross validation.
    :param cross_validation: Evaluates the model using cross validation if set to True.
    :param params: Additional hyperparameters for the algorithm, passed as a dict.
    :param dataset_source: To specify the data source.
        None: Dataset file from DLTK storage will be used.
        database: Query from the connected database will be used.
    :param evaluation_plots: Generates evaluation plots if set to True.
    :return: A json object containing the file path in storage.
    """
    task, library, algorithm, features, label, train_percentage = validate_parameters(
        task, lib, algorithm, features, label, train_percentage)
    # If additional parameters were passed, check whether they are valid.
    if params is not None:
        hyper_parameter_flag = hyper_parameter_check(library, task, algorithm, params)
        assert hyper_parameter_flag, "Please check the params, training failed due to incorrect values"
    url = self.base_url + '/machine/' + task + '/train/'
    headers = {"ApiKey": self.api_key, "Content-type": "application/json"}
    if params is None:
        params = {}
    if model_name is None:
        model_name = algorithm
    if dataset_source == "database":
        body = {
            "library": lib,
            "task": "train",
            "jobType": "DATABASE",
            "queryId": dataset,
            "config": {
                "name": model_name,
                "algorithm": algorithm,
                "label": label,
                "trainPercentage": train_percentage,
                "features": features,
                "params": params,
                "folds": folds,
                "crossValidation": cross_validation,
                "evalPlots": evaluation_plots
            }
        }
    else:
        body = {
            "library": lib,
            "task": "train",
            "config": {
                "name": model_name,
                "algorithm": algorithm,
                "datasetUrl": dataset,
                "label": label,
                "trainPercentage": train_percentage,
                "features": features,
                "params": params,
                "folds": folds,
                "crossValidation": cross_validation,
                "evalPlots": evaluation_plots
            }
        }
    body = json.dumps(body)
    response = requests.post(url=url, data=body, headers=headers)
    response = response.json()
    return response
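# A minimal usage sketch for the train() call above. The client class name
# (DltkAiClient), the API key, the dataset path, the label and the feature
# names are assumptions for illustration only; substitute your own values.
#
#   client = DltkAiClient(api_key="YOUR_API_KEY")   # hypothetical client instance
#   response = client.train(
#       task="classification",
#       algorithm="RandomForest",
#       dataset="datasets/titanic.csv",              # file already uploaded to DLTK storage
#       label="Survived",
#       features=["Pclass", "Sex", "Age", "Fare"],
#       lib="weka",
#       train_percentage=80,
#       params={"-I": 80},                           # optional Weka flags, validated by hyper_parameter_check
#   )
#   print(response)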