def test_scenario03(self):
    """
    Scenario: Successfully building test predictions from source
        Given I created the dataset in setup_scenario02
        And I create BigML topic model resources from source to test "<test>" with options "<options>" and log predictions in "<output>"
        And I check that the dataset has been created
        And I check that the topic model has been created
        And I check that the topic distributions are ready
        Then the local topic distribution file is like "<topic_distribution_file>"

        Examples:
        | test | options | output | topic_distribution_file |
        | ../data/spam.csv | --test-separator="\t" --prediction-header | ./scenario3_td/topic_distributions.csv | ./check_files/topic_distributions_spam.csv |
    """
    print self.test_scenario03.__doc__
    examples = [
        ['data/spam.csv',
         '--test-separator="\t" --prediction-header',
         'scenario3_td/topic_distributions.csv',
         'check_files/topic_distributions_spam.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        topic_pred.i_create_all_td_resources_from_source( \
            self, example[0], example[1], example[2])
        test_pred.i_check_create_dataset(self, suffix=None)
        topic_pred.i_check_create_topic_model(self)
        topic_pred.i_check_create_topic_distributions(self)
        topic_pred.i_check_topic_distributions(self, example[3])
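# Illustration only, not exercised by the suite: the scenario above is the
# test-side counterpart of a direct BigMLer call. A minimal sketch with the
# example values; the `topic-model` subcommand name and the simplified
# train/test form (rather than reusing an existing source) are assumptions:
#
#   bigmler topic-model --train data/spam.csv --test data/spam.csv \
#       --test-separator="\t" --prediction-header \
#       --output scenario3_td/topic_distributions.csv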
def test_scenario2(self):
    """
    Scenario: Successfully building test predictions from source
        Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs>
        And I create BigML resources using source to find anomaly scores for "<test>" and log predictions in "<output>"
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the anomaly scores are ready
        Then the local anomaly scores file is like "<predictions_file>"

        Examples:
        | scenario | kwargs | test | output | predictions_file |
        | scenario_an_1 | {"data": "../data/tiny_kdd.csv", "output": "./scenario_an_1/anomaly_scores.csv", "test": "../data/test_kdd.csv"} | ../data/test_kdd.csv | ./scenario_an_2/anomaly_scores.csv | ./check_files/anomaly_scores_kdd.csv |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['scenario_an_1',
         '{"data": "data/tiny_kdd.csv", "output": "scenario_an_1/anomaly_scores.csv", "test": "data/test_kdd.csv"}',
         'data/test_kdd.csv', 'scenario_an_2/anomaly_scores.csv',
         'check_files/anomaly_scores_kdd.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_have_previous_scenario_or_reproduce_it(
            self, example[0], example[1])
        test_anomaly.i_create_anomaly_resources_from_source(
            self, test=example[2], output=example[3])
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_anomaly.i_check_create_anomaly_scores(self)
        test_anomaly.i_check_anomaly_scores(self, example[4])
def test_scenario2(self):
    """
    Scenario: Successfully building predictions for data streamed to stdin:
        Given I create BigML resources uploading train "<data>" file to test "<test>" read from stdin with name "<name>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file | name |
        | ../data/iris.csv | ../data/test_iris.csv | ./scenario_st_2/predictions.csv | ./check_files/predictions_iris.csv | Source name: áéí |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris.csv', 'data/test_iris.csv',
         'scenario_st_2/predictions.csv', 'check_files/predictions_iris.csv',
         'Source name: áéí']]
    for example in examples:
        print "\nTesting with:\n", example
        stdin.i_create_all_resources_to_test_from_stdin(
            self, data=example[0], test=example[1], name=example[4],
            output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[3])
def test_scenario11(self):
    """
    Scenario: Successfully building feature selection from dataset setting objective:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validation with options "<options>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-ensembles have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the predictions file "<predictions_file>" is like "<estimated_file>"

        Examples:
        | data | output | kfold | options | predictions_file | estimated_file |
        | ../data/iris.csv | ./scenario_a_14/evaluation | 2 | --exclude-features="petal length,sepal length" --predictions-csv --number-of-models 2 | scenario_a_14/test/kfold2_pred/predictions.csv | check_files/analyze_predictions_iris_e.csv |
    """
    print self.test_scenario11.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_14/evaluation', '2',
         ' --exclude-features="petal length,sepal length" --predictions-csv'
         ' --number-of-models 2',
         'scenario_a_14/test/kfold2_pred/predictions.csv',
         'check_files/analyze_predictions_iris_e.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_options(
            self, k_folds=example[2], options=example[3])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_ensembles(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_predictions_file(self, example[4], example[5])
def test_scenario2(self):
    """
    Scenario: Successfully building test predictions from source
        Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs>
        And I create BigML resources using source to find centroids for "<test>" and log predictions in "<output>"
        And I check that the dataset has been created
        And I check that the cluster has been created
        And I check that the centroids are ready
        Then the local centroids file is like "<predictions_file>"

        Examples:
        | scenario | kwargs | test | output | predictions_file |
        | scenario_c_1 | {"data": "../data/diabetes.csv", "output": "./scenario_c_1/centroids.csv", "test": "../data/diabetes.csv"} | ../data/diabetes.csv | ./scenario_c_2/centroids.csv | ./check_files/centroids_diabetes.csv |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['scenario_c_1',
         '{"data": "data/diabetes.csv", "output": "scenario_c_1/centroids.csv", "test": "data/diabetes.csv"}',
         'data/diabetes.csv', 'scenario_c_2/centroids.csv',
         'check_files/centroids_diabetes.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_have_previous_scenario_or_reproduce_it(
            self, example[0], example[1])
        test_cluster.i_create_cluster_resources_from_source(
            self, test=example[2], output=example[3])
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_cluster(self)
        test_cluster.i_check_create_centroids(self)
        test_pred.i_check_predictions(self, example[4])
def test_scenario9(self):
    """
    Scenario: Successfully building random fields analysis from dataset:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML random fields analysis with <kfold>-cross-validation improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-random trees have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best random candidates number is "<random_candidates>", with "<metric>" of <metric_value>

        Examples:
        | data | output | kfold | metric | random_candidates | metric_value |
        | ../data/iris.csv | ./scenario_a_11/evaluation | 2 | precision | 4 | 96.09% |
    """
    print self.test_scenario9.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_11/evaluation', '2', 'precision', '4',
         '96.09%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_random_analysis(
            self, k_fold=example[2], metric=example[3])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_random_forest(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_random_candidates(
            self, example[4], example[3], example[5])
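# Illustration only: a hedged sketch of the command this scenario drives.
# The `analyze` subcommand is BigMLer's; the exact `--random-fields` and
# `--maximize` spellings here are assumptions based on the step wording:
#
#   bigmler analyze --dataset <dataset/id> --random-fields \
#       --k-folds 2 --maximize precision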
def test_scenario1(self):
    """
    Scenario: Successfully building k-fold cross-validation from dataset:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML <kfold>-fold cross-validation
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that the <kfold>-fold cross-validation has been created
        Then the evaluation file is like "<json_evaluation_file>"

        Examples:
        | data | output | kfold | json_evaluation_file |
        | ../data/iris.csv | ./scenario_a_1/evaluation | 2 | ./check_files/evaluation_kfold.json |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_1/evaluation', '2',
         'check_files/evaluation_kfold.json']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation(self, k_folds=example[2])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_kfold_cross_validation(self, example[2])
        evaluation.then_the_evaluation_file_is_like(self, example[3])
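# Illustration only: the plain k-fold cross-validation above roughly maps
# to a BigMLer analyze call; the flag spellings are assumptions taken from
# the step names:
#
#   bigmler analyze --dataset <dataset/id> --cross-validation --k-folds 2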
def test_scenario1(self):
    """
    Scenario: Successfully building test predictions from dataset specifying objective field and model fields
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML resources using dataset, objective field <objective> and model fields <fields> to test "<test>" and log predictions in "<output>"
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | output_dir | test | output | predictions_file | objective | fields |
        | ../data/iris_2fb.csv | ./scénario1 | ../data/test_iris2fb.csv | ./scénario1/predictions.csv | ./check_files/predictions_iris_2fb.csv | spécies | "pétal width" |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris_2fb.csv', u'scénario1', 'data/test_iris2fb.csv',
         u'scénario1/predictions.csv', 'check_files/predictions_iris_2fb.csv',
         u'spécies', u'"pétal width"']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_create_resources_from_dataset_objective_model(
            self, objective=example[5], fields=example[6], test=example[2],
            output=example[3])
        test_pred.i_check_create_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[4])
def setup_scenario1(self):
    """
    Scenario: Successfully building multi-label test predictions from start:
        Given I create BigML multi-label resources tagged as "<tag>" with "<label_separator>" label separator and <number_of_labels> labels uploading train "<data>" file with "<training_separator>" field separator to test "<test>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the models have been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | tag | label_separator | number_of_labels | data | training_separator | test | output | predictions_file |
        | my_multilabel_1 | : | 7 | ../data/multilabel.csv | , | ../data/test_multilabel.csv | ./scenario_ml_1/predictions.csv | ./check_files/predictions_ml.csv |
    """
    print self.setup_scenario1.__doc__
    examples = [
        ['my_multilabel_1', ':', '7', 'data/multilabel.csv', ',',
         'data/test_multilabel.csv', 'scenario_ml_1/predictions.csv',
         'check_files/predictions_ml.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        ml_pred.i_create_all_ml_resources(
            self, tag=example[0], label_separator=example[1],
            number_of_labels=example[2], data=example[3],
            training_separator=example[4], test=example[5],
            output=example[6])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_models(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[7])
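# Illustration only: a hedged sketch of the multi-label command this setup
# drives, using the example values; the flag spellings are assumptions
# taken from the scenario wording:
#
#   bigmler --multi-label --train data/multilabel.csv \
#       --label-separator ":" --training-separator "," \
#       --tag my_multilabel_1 --test data/test_multilabel.csv \
#       --output scenario_ml_1/predictions.csv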
def test_scenario2(self):
    """
    Scenario: Successfully building remote test predictions with proportional missing strategy:
        Given I create BigML resources uploading train "<data>" file to test "<test>" remotely with proportional missing strategy and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch prediction has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file |
        | ../data/iris.csv | ../data/test_iris_nulls.csv | ./scenario_mis_2/predictions.csv | ./check_files/predictions_iris_nulls.csv |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris.csv', 'data/test_iris_nulls.csv',
         'scenario_mis_2/predictions.csv',
         'check_files/predictions_iris_nulls.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_all_resources_remote_proportional(
            self, data=example[0], test=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_create_test_source(self)
        test_pred.i_check_create_test_dataset(self)
        test_pred.i_check_create_batch_prediction(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[3])
def setup_scenario02(self):
    """
    Scenario: Successfully building test predictions from start:
        Given I create BigML logistic regression resources uploading train "<data>" file to test "<test>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file |
        | ../data/iris.csv | ../data/test_iris.csv | ./scenario1_lr/predictions.csv | ./check_files/predictions_iris_lr.csv |
    """
    print self.setup_scenario02.__doc__
    examples = [
        ['data/iris.csv', 'data/test_iris.csv',
         'scenario1_lr/predictions.csv',
         'check_files/predictions_iris_lr.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        lr_pred.i_create_all_lr_resources(
            self, example[0], example[1], example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        lr_pred.i_check_create_lr_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[3])
def setup_scenario02(self):
    """
    Scenario: Successfully building test predictions from start:
        Given I create BigML resources uploading train "<data>" file to test "<test>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file |
        | ../data/grades.csv | ../data/test_grades.csv | ./scenario1_r/predictions.csv | ./check_files/predictions_grades.csv |
        | ../data/iris.csv | ../data/test_iris.csv | ./scenario1/predictions.csv | ./check_files/predictions_iris.csv |
    """
    examples = [
        ['data/grades.csv', 'data/test_grades.csv',
         'scenario1_r/predictions.csv', 'check_files/predictions_grades.csv'],
        ['data/iris.csv', 'data/test_iris.csv', 'scenario1/predictions.csv',
         'check_files/predictions_iris.csv']]
    show_doc(self.setup_scenario02, examples)
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_all_resources(
            self, example[0], example[1], example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[3])
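# Illustration only: this basic scenario corresponds to the simplest BigMLer
# workflow. A minimal sketch with the first example's values (--train,
# --test, --output and --store are standard BigMLer flags, though the exact
# invocation the step builds may differ):
#
#   bigmler --train data/grades.csv --test data/test_grades.csv \
#       --output scenario1_r/predictions.csv --store
#
# The checks above then poll for each intermediate resource (source,
# dataset, model) before comparing the predictions with the check file.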
def test_scenario7(self):
    """
    Scenario: Successfully building feature selection for a category from dataset:
        Given I create BigML dataset uploading train "<data>" file with attributes "<attributes>" in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" for category "<category>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | attributes | output | kfold | metric | category | selection | metric_value |
        | ../data/spam.csv | ../data/spam_attributes.json | ./scenario_a_9/evaluation | 2 | recall | spam | Message | 61.24% |
    """
    print self.test_scenario7.__doc__
    examples = [
        ['data/spam.csv', 'data/spam_attributes.json',
         'scenario_a_9/evaluation', '2', 'recall', 'spam', 'Message',
         '61.24%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset_with_attributes(
            self, data=example[0], attributes=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_metric_category(
            self, k_folds=example[3], metric=example[4], category=example[5])
        test_pred.i_check_create_kfold_datasets(self, example[3])
        test_pred.i_check_create_kfold_models(self, example[3])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[3])
        test_pred.i_check_feature_selection(
            self, example[6], example[4], example[7])
def test_scenario5(self):
    """
    Scenario: Successfully building nodes threshold analysis from dataset file:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML nodes analysis from dataset file from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value>

        Examples:
        | data | output | min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value |
        | ../data/iris.csv | ./scenario_a_4/evaluation | 3 | 14 | 2 | 2 | precision | 9 | 94.71% |
    """
    print self.test_scenario5.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_4/evaluation', '3', '14', '2', '2',
         'precision', '9', '94.71%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_nodes_analysis_from_dataset_file(
            self, min_nodes=example[2], max_nodes=example[3],
            nodes_step=example[4], k_fold=example[5], metric=example[6])
        test_pred.i_check_create_kfold_datasets(self, example[5])
        test_pred.i_check_create_kfold_models(self, example[5])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[5])
        test_pred.i_check_node_threshold(
            self, example[7], example[6], example[8])
def test_scenario7(self):
    """
    Scenario: Successfully building anomalous dataset test predictions from anomaly
        Given I create BigML anomaly detector from data <data> with options <options> and generate a new dataset of anomalies in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        Then I check that the new top anomalies dataset has been created
        And the top anomalies in the anomaly detector are <top_anomalies>
        And the forest size in the anomaly detector is <forest_size>
        And the number of records in the top anomalies dataset is <top_anomalies>

        Examples:
        | data | options | output_dir | top_anomalies | forest_size |
        | ../data/tiny_kdd.csv | --top-n 15 --forest-size 40 | scenario_an_7 | 15 | 40 |
    """
    print self.test_scenario7.__doc__
    examples = [
        ['data/tiny_kdd.csv', '--top-n 15 --forest-size 40',
         'scenario_an_7', '15', '40']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_anomaly_resources_with_options(
            self, example[0], example[1], output_dir=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_pred.i_check_create_dataset(self, suffix='gen ')
        test_anomaly.i_check_top_anomalies(self, example[3])
        test_anomaly.i_check_forest_size(self, example[4])
        test_anomaly.i_check_dataset_lines_number(self, example[3])
def test_scenario2(self):
    """
    Scenario: Successfully building test predictions from source
        Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs>
        And I create BigML multi-label resources using source to test "<test>" and log predictions in "<output>"
        And I check that the dataset has been created
        And I check that the models have been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | scenario | kwargs | test | output | predictions_file |
        | scenario_ml_1 | {"tag": "my_multilabel_1", "data": "../data/multilabel.csv", "label_separator": ":", "number_of_labels": 7, "training_separator": ",", "output": "./scenario_ml_1/predictions.csv", "test": "../data/test_multilabel.csv"} | ../data/test_multilabel.csv | ./scenario_ml_2/predictions.csv | ./check_files/predictions_ml_comma.csv |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['scenario_ml_1',
         '{"tag": "my_multilabel_1", "data": "data/multilabel.csv", "label_separator": ":", "number_of_labels": 7, "training_separator": ",", "output": "scenario_ml_1/predictions.csv", "test": "data/test_multilabel.csv"}',
         'data/test_multilabel.csv', 'scenario_ml_2/predictions.csv',
         'check_files/predictions_ml_comma.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_have_previous_scenario_or_reproduce_it(
            self, example[0], example[1])
        test_pred.i_create_resources_from_source(
            self, multi_label='multi-label', test=example[2],
            output=example[3])
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_models(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[4])
def test_scenario2(self):
    """
    Scenario: Successfully building test predictions from source
        Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs>
        And I create BigML multi-label resources using source with objective "<objective>" and model fields "<model_fields>" to test "<test>" and log predictions in "<output>"
        And I check that the dataset has been created
        And I check that the models have been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | scenario | kwargs | objective | model_fields | test | output | predictions_file |
        | scenario_mlm_1 | {"tag": "my_multilabelm_1", "data": "../data/multilabel_multi.csv", "label_separator": ":", "number_of_labels": 7, "training_separator": ",", "output": "./scenario_mlm_1/predictions.csv", "test": "../data/test_multilabel.csv", "ml_fields": "type,class", "model_fields": "-type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P", "objective": "class"} | class | -type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P | ../data/test_multilabel.csv | ./scenario_mlm_2/predictions.csv | ./check_files/predictions_ml_comma.csv |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['scenario_mlm_1',
         '{"tag": "my_multilabelm_1", "data": "data/multilabel_multi.csv", "label_separator": ":", "number_of_labels": 7, "training_separator": ",", "output": "scenario_mlm_1/predictions.csv", "test": "data/test_multilabel.csv", "ml_fields": "type,class", "model_fields": "-type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P", "objective": "class"}',
         'class',
         '-type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P',
         'data/test_multilabel.csv', 'scenario_mlm_2/predictions.csv',
         'check_files/predictions_ml_comma.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_have_previous_scenario_or_reproduce_it(
            self, example[0], example[1])
        test_pred.i_create_resources_from_source_with_objective(
            self, multi_label='multi-label ', objective=example[2],
            model_fields=example[3], test=example[4], output=example[5])
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_models(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[6])
def setup_scenario1(self):
    """
    Scenario: Successfully building test anomaly scores from scratch:
        Given I create BigML resources uploading train "<data>" file to create anomaly scores for "<test>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the anomaly scores are ready
        Then the local anomaly scores file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file |
        | ../data/tiny_kdd.csv | ../data/test_kdd.csv | ./scenario_an_1/anomaly_scores.csv | ./check_files/anomaly_scores_kdd.csv |
    """
    print self.setup_scenario1.__doc__
    examples = [
        ['data/tiny_kdd.csv', 'data/test_kdd.csv',
         'scenario_an_1/anomaly_scores.csv',
         'check_files/anomaly_scores_kdd.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_all_anomaly_resources(
            self, data=example[0], test=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_anomaly.i_check_create_anomaly_scores(self)
        test_anomaly.i_check_anomaly_scores(self, example[3])
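# Illustration only: a hedged sketch of the anomaly-score command this setup
# drives, using the example values (the subcommand and flag forms are
# assumptions based on the scenario wording):
#
#   bigmler anomaly --train data/tiny_kdd.csv --test data/test_kdd.csv \
#       --output scenario_an_1/anomaly_scores.csv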
def test_scenario6(self):
    """
    Scenario: Successfully building feature selection from dataset excluding features:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations excluding "<features>" with separator "<args_separator>" improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | output | kfold | features | args_separator | metric | selection | metric_value |
        | ../data/iris.csv | ./scenario_a_7/evaluation | 2 | petal length!sepal width | ! | accuracy | petal width | 95.33% |
    """
    print self.test_scenario6.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_7/evaluation', '2',
         'petal length!sepal width', '!', 'accuracy', 'petal width',
         '95.33%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_separator_metric_no_fields(
            self, k_folds=example[2], features=example[3],
            args_separator=example[4], metric=example[5])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_feature_selection(
            self, example[6], example[5], example[7])
def test_scenario1(self):
    """
    Scenario: Successfully building test predictions with missing-splits model:
        Given I create BigML resources uploading train "<data>" file to test "<test>" with a missing-splits model and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file |
        | ../data/iris_missing.csv | ../data/test_iris_missing.csv | ./scenario_mspl_1/predictions.csv | ./check_files/predictions_iris_missing.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris_missing.csv', 'data/test_iris_missing.csv',
         'scenario_mspl_1/predictions.csv',
         'check_files/predictions_iris_missing.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_all_resources_missing_splits(
            self, data=example[0], test=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[3])
def test_scenario3(self):
    """
    Scenario: Successfully building feature selection from dataset setting objective:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | objective | output | kfold | metric | selection | metric_value |
        | ../data/iris_2f.csv | 0 | ./scenario_a_5/evaluation | 2 | r_squared | species | 0.352845 |
        | ../data/iris_2f.csv | 0 | ./scenario_a_8/evaluation | 2 | mean_squared_error | species | 0.475200 |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris_2f.csv', '0', 'scenario_a_5/evaluation', '2',
         'r_squared', 'species', '0.352845'],
        ['data/iris_2f.csv', '0', 'scenario_a_8/evaluation', '2',
         'mean_squared_error', 'species', '0.475200']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_objective(
            self, k_folds=example[3], objective=example[1],
            metric=example[4])
        test_pred.i_check_create_kfold_datasets(self, example[3])
        test_pred.i_check_create_kfold_models(self, example[3])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[3])
        test_pred.i_check_feature_selection(
            self, example[5], example[4], example[6])
def test_scenario3(self):
    """
    Scenario: Successfully building evaluations from start:
        Given I create BigML resources uploading train "<data>" file to create model and log in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I evaluate "<test>" with proportional missing strategy
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the evaluation has been created
        Then the evaluation file is like "<json_evaluation_file>"

        Examples:
        | data | test | output | json_evaluation_file |
        | ../data/iris.csv | ../data/iris_nulls.csv | ./scenario_mis_3/evaluation | ./check_files/evaluation_iris_nulls.json |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris.csv', 'data/iris_nulls.csv', 'scenario_mis_3/evaluation',
         'check_files/evaluation_iris_nulls.json']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_all_resources_to_model(
            self, data=example[0], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        evaluation.i_create_proportional_to_evaluate(self, test=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_evaluation(self)
        evaluation.then_the_evaluation_file_is_like(self, example[3])
def test_scenario4(self):
    """
    Scenario: Successfully building test anomaly score predictions from training set as a dataset:
        Given I create BigML resources uploading train "<data>" file to find anomaly scores for the training set remotely saved to dataset with no CSV output and log resources in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the batch anomaly scores prediction has been created
        Then I check that the batch anomaly scores dataset exists
        And no local CSV file is created

        Examples:
        | data | output_dir |
        | ../data/iris.csv | ./scenario_ab_4 |
    """
    print self.test_scenario4.__doc__
    examples = [
        ['data/iris.csv', 'scenario_ab_4']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_all_anomaly_resources_without_test_split(
            self, data=example[0], output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_batch.i_check_create_batch_anomaly_scores(self)
        test_anomaly.i_check_create_batch_anomaly_score_dataset(self)
        test_anomaly.i_check_no_local_CSV(self)
def test_scenario03(self):
    """
    Scenario: Successfully building test predictions from source
        Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs>
        And I create BigML logistic regression resources using source to test "<test>" and log predictions in "<output>"
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | scenario | kwargs | test | output | predictions_file |
        | scenario1_lr | {"data": "../data/iris.csv", "output": "./scenario1_lr/predictions.csv", "test": "../data/test_iris.csv"} | ../data/test_iris.csv | ./scenario2_lr/predictions.csv | ./check_files/predictions_iris_lr.csv |
    """
    print self.test_scenario03.__doc__
    examples = [
        ['scenario1_lr',
         '{"data": "data/iris.csv", "output": "scenario1_lr/predictions.csv", "test": "data/test_iris.csv"}',
         'data/test_iris.csv', 'scenario2_lr/predictions.csv',
         'check_files/predictions_iris_lr.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_have_previous_scenario_or_reproduce_it(
            self, example[0], example[1])
        lr_pred.i_create_lr_resources_from_source(
            self, None, test=example[2], output=example[3])
        test_pred.i_check_create_dataset(self, suffix=None)
        lr_pred.i_check_create_lr_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[4])
def test_scenario1(self):
    """
    Scenario: Successfully building test centroid predictions from scratch:
        Given I create BigML resources uploading train "<data>" file to find centroids for "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the cluster has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch centroid prediction has been created
        And I check that the centroids are ready
        Then the local centroids file is like "<predictions_file>"

        Examples:
        | data | test | fields_map | output | predictions_file |
        | ../data/grades.csv | ../data/grades_perm.csv | ../data/grades_fields_map_perm.csv | ./scenario_cb_1_r/centroids.csv | ./check_files/centroids_grades.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/grades.csv', 'data/grades_perm.csv',
         'data/grades_fields_map_perm.csv', 'scenario_cb_1_r/centroids.csv',
         'check_files/centroids_grades.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_cluster.i_create_all_cluster_resources_with_mapping(
            self, data=example[0], test=example[1], fields_map=example[2],
            output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_cluster(self)
        test_pred.i_check_create_test_source(self)
        test_pred.i_check_create_test_dataset(self)
        batch_pred.i_check_create_batch_centroid(self)
        test_cluster.i_check_create_centroids(self)
        test_pred.i_check_predictions(self, example[4])
def test_scenario1(self):
    """
    Scenario: Successfully exporting models with params in the available languages:
        Given I create BigML resources uploading train "<data>" file using "<source_attributes>" and log in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I export the model as a function in "<language>" to "<output>"
        Then the export file is like "<check_file>"

        Examples:
        | data | source_attributes | output | language | check_file |
        | ../data/movies.csv | data/movies_source_attrs.json | ./scenario_exp_1/model | python | model_function.py |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/movies.csv', 'data/movies_source_attrs.json',
         'scenario_exp_1_a/model', 'python',
         'check_files/export/model_function.py'],
        ['data/movies.csv', 'data/movies_source_attrs.json',
         'scenario_exp_1_b/model', 'javascript',
         'check_files/export/model_function.js'],
        ['data/movies.csv', 'data/movies_source_attrs.json',
         'scenario_exp_1_c/model', 'r',
         'check_files/export/model_function.R'],
        ['data/iris.csv', '', 'scenario_exp_1_d/model', 'tableau',
         'check_files/export/model_function.tb'],
        ['data/iris.csv', '', 'scenario_exp_1_e/model', 'mysql',
         'check_files/export/model_function.sql'],
        ['data/libros.csv', 'data/libros_source_attrs.json',
         'scenario_exp_1_f/model', 'python',
         'check_files/export/model_function_utf8.py'],
        ['data/libros.csv', 'data/libros_source_attrs.json',
         'scenario_exp_1_g/model', 'r',
         'check_files/export/model_function_utf8.R'],
        ['data/libros.csv', 'data/libros_source_attrs.json',
         'scenario_exp_1_h/model', 'javascript',
         'check_files/export/model_function_utf8.js']]
    for example in examples:
        print "\nTesting with:\n", example
        export.i_create_all_resources_to_model_with_source_attrs( \
            self, data=example[0], source_attributes=example[1],
            output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_model(self)
        export.i_export_model(self, language=example[3], output=example[2])
        export.i_check_if_the_output_is_like_expected_file( \
            self, language=example[3], expected_file=example[4])
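# Illustration only: a hedged sketch of the export step above for one of the
# example rows; the `export` subcommand and `--language` flag spellings are
# assumptions based on the step wording:
#
#   bigmler export --model <model/id> --language python \
#       --output-dir scenario_exp_1_a/model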
def test_scenario1(self):
    """
    Scenario: Successfully building test anomaly score predictions from scratch:
        Given I create BigML resources uploading train "<data>" file to find anomaly scores for "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch anomaly scores prediction has been created
        And I check that the anomaly scores are ready
        Then the local anomaly scores file is like "<predictions_file>"

        Examples:
        | data | test | fields_map | output | predictions_file |
        | ../data/grades.csv | ../data/grades_perm.csv | ../data/grades_fields_map_perm.csv | ./scenario_ab_1_r/anomalies.csv | ./check_files/anomaly_scores_grades.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/grades.csv', 'data/grades_perm.csv',
         'data/grades_fields_map_perm.csv', 'scenario_ab_1_r/anomalies.csv',
         'check_files/anomaly_scores_grades.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_all_anomaly_resources_with_mapping(
            self, data=example[0], test=example[1], fields_map=example[2],
            output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_pred.i_check_create_test_source(self)
        test_pred.i_check_create_test_dataset(self)
        test_batch.i_check_create_batch_anomaly_scores(self)
        test_anomaly.i_check_create_anomaly_scores(self)
        test_anomaly.i_check_anomaly_scores(self, example[4])
def test_scenario2(self):
    """
    Scenario: Successfully building remote test centroid predictions from scratch to dataset:
        Given I create BigML resources uploading train "<data>" file to find centroids for "<test>" remotely to dataset with no CSV and log resources in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the cluster has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch centroid prediction has been created
        Then I check that the batch centroids dataset exists
        And no local CSV file is created

        Examples:
        | data | test | output_dir |
        | ../data/grades.csv | ../data/test_grades.csv | ./scenario_cb_2 |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/grades.csv', 'data/test_grades.csv', 'scenario_cb_2']]
    for example in examples:
        print "\nTesting with:\n", example
        test_cluster.i_create_all_cluster_resources_to_dataset(
            self, data=example[0], test=example[1], output_dir=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_cluster(self)
        test_pred.i_check_create_test_source(self)
        test_pred.i_check_create_test_dataset(self)
        batch_pred.i_check_create_batch_centroid(self)
        batch_pred.i_check_create_batch_centroids_dataset(self)
        test_anomaly.i_check_no_local_CSV(self)
def test_scenario1(self):
    """
    Scenario: Successfully building test centroids from scratch:
        Given I create BigML resources uploading train "<data>" file to create centroids for "<test>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the cluster has been created
        And I check that the centroids are ready
        Then the local centroids file is like "<predictions_file>"

        Examples:
        | data | test | output | predictions_file |
        | ../data/grades.csv | ../data/grades.csv | ./scenario_c_1_r/centroids.csv | ./check_files/centroids_grades.csv |
        | ../data/diabetes.csv | ../data/diabetes.csv | ./scenario_c_1/centroids.csv | ./check_files/centroids_diabetes.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/grades.csv', 'data/grades.csv', 'scenario_c_1_r/centroids.csv',
         'check_files/centroids_grades.csv'],
        ['data/diabetes.csv', 'data/diabetes.csv',
         'scenario_c_1/centroids.csv', 'check_files/centroids_diabetes.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_cluster.i_create_all_cluster_resources(
            self, data=example[0], test=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_cluster(self)
        test_cluster.i_check_create_centroids(self)
        test_pred.i_check_predictions(self, example[3])
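# Illustration only: a hedged sketch of the clustering command this scenario
# drives, using the second example's values (the subcommand form is an
# assumption based on the scenario wording):
#
#   bigmler cluster --train data/diabetes.csv --test data/diabetes.csv \
#       --output scenario_c_1/centroids.csv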
def setup_scenario1(self):
    """
    Scenario: Successfully building multi-label test predictions from start:
        Given I create BigML multi-label resources tagged as "<tag>" with "<label_separator>" label separator and <number_of_labels> labels uploading train "<data>" file with "<training_separator>" field separator and "<ml_fields>" as multi-label fields using model_fields "<model_fields>" and objective "<objective>" to test "<test>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the models have been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | tag | label_separator | number_of_labels | data | training_separator | ml_fields | model_fields | objective | test | output | predictions_file |
        | my_multilabelm_1 | : | 7 | ../data/multilabel_multi.csv | , | type,class | -type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P | class | ../data/test_multilabel.csv | ./scenario_mlm_1/predictions.csv | ./check_files/predictions_ml.csv |
    """
    print self.setup_scenario1.__doc__
    examples = [
        ['my_multilabelm_1', ':', '7', 'data/multilabel_multi.csv', ',',
         'type,class',
         '-type,-type - W,-type - A,-type - C,-type - S,-type - R,-type - T,-type - P',
         'class', 'data/test_multilabel.csv',
         'scenario_mlm_1/predictions.csv', 'check_files/predictions_ml.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        ml_pred.i_create_all_mlm_resources(
            self, tag=example[0], label_separator=example[1],
            number_of_labels=example[2], data=example[3],
            training_separator=example[4], ml_fields=example[5],
            model_fields=example[6], objective=example[7], test=example[8],
            output=example[9])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_models(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[10])
def test_scenario8(self):
    """
    Scenario: Successfully building a new dataset from an existing one and analyzing it
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create a new BigML dataset using the specs in JSON file "<new_fields>" and a model with "<model_fields>"
        And I check that the new dataset has been created
        And I check that the model has been created
        And I create BigML nodes analysis from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value>

        Examples:
        | data | output_dir | new_fields | field | model_fields | min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value |
        | ../data/iris.csv | ./scenario_a_10 | ../data/new_fields2.json | outlier? | outlier?,species | 3 | 14 | 2 | 2 | precision | 5 | 98.21% |
    """
    print self.test_scenario8.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_10', 'data/new_fields2.json',
         u'outlier?', u'outlier?,species', '3', '14', '2', '2', 'precision',
         '5', '98.21%']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        dataset_adv.i_create_dataset_new_fields(
            self, json_file=example[2], model_fields=example[4])
        test_pred.i_check_create_new_dataset(self)
        test_pred.i_check_create_model(self)
        test_pred.i_create_nodes_analysis(
            self, min_nodes=example[5], max_nodes=example[6],
            nodes_step=example[7], k_fold=example[8], metric=example[9])
        test_pred.i_check_create_kfold_datasets(self, example[8])
        test_pred.i_check_create_kfold_models(self, example[8])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[8])
        test_pred.i_check_node_threshold(
            self, example[10], example[9], example[11])
def test_scenario4(self):
    """
    Scenario: Successfully building feature selection from filtered dataset setting objective:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I filter out field "<field>" from dataset and log to "<output_dir>"
        And I check that the new dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | field | objective | output | output_dir | kfold | metric | selection | metric_value |
        | ../data/iris_2fd.csv | sepal length | species | ./scenario_a_6/evaluation | ./scenario_a_6 | 2 | recall | petal width | 100.00% |
    """
    print self.test_scenario4.__doc__
    examples = [
        ['data/iris_2fd.csv', 'sepal length', 'species',
         'scenario_a_6/evaluation', 'scenario_a_6', '2', 'recall',
         'petal width', '100.00%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        dataset.i_filter_field_from_dataset(
            self, field=example[1], output_dir=example[4])
        test_pred.i_check_create_new_dataset(self)
        test_pred.i_create_kfold_cross_validation_objective(
            self, k_folds=example[5], objective=example[2],
            metric=example[6])
        test_pred.i_check_create_kfold_datasets(self, example[5])
        test_pred.i_check_create_kfold_models(self, example[5])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[5])
        test_pred.i_check_feature_selection(
            self, example[7], example[6], example[8])
def test_scenario2(self):
    """
    Scenario: Successfully building feature selection from dataset:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>
        And I generate a report from the output directory
        And a symlink file is generated in the reports directory

        Examples:
        | data | output | kfold | metric | selection | metric_value |
        | ../data/iris_2f.csv | ./scenario_a_2/evaluation | 2 | accuracy | petal width | 100.00% |
        | ../data/iris_2f.csv | ./scenario_a_3/evaluation | 2 | phi | petal width | 1 |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris_2f.csv', 'scenario_a_2/evaluation', '2', 'accuracy',
         'petal width', '100.00%'],
        ['data/iris_2f.csv', 'scenario_a_3/evaluation', '2', 'phi',
         'petal width', '1']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_metric(
            self, k_folds=example[2], metric=example[3])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_feature_selection(
            self, example[4], example[3], example[5])
        test_pred.i_generate_report(self)
        test_pred.is_symlink(self)
def setup_for_fusion(step, train=None, output_dir=None):
    # Builds the resources that fusion scenarios depend on: first a model
    # trained from the "train" file, then a deepnet built on the resulting
    # dataset. Both bigmler calls use --store so the created resource ids
    # are kept in the output directory for later steps.
    train = res_filename(train)
    command = ("bigmler --train \"" + train +
               "\" --store --output-dir " + output_dir)
    shell_execute(command, "%s/predictions" % output_dir)
    test_pred.i_check_create_source(step)
    test_pred.i_check_create_dataset(step)
    test_pred.i_check_create_model(step)
    command = ("bigmler deepnet --dataset \"" +
               world.dataset["resource"] +
               "\" --store --output-dir " + output_dir)
    shell_execute(command, "%s/predictions" % output_dir)
    test_dn.i_check_create_dn_model(step)
def test_scenario8(self):
    """
    Scenario: Successfully building a new dataset from an existing one and analyzing it
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create a new BigML dataset using the specs in JSON file "<new_fields>" and a model with "<model_fields>"
        And I check that the new dataset has been created
        And I check that the model has been created
        And I create BigML nodes analysis from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value>

        Examples:
        | data | output_dir | new_fields | field | model_fields | min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value |
        | ../data/iris.csv | ./scenario_a_10 | ../data/new_fields2.json | outlier? | outlier?,species | 3 | 14 | 2 | 2 | precision | 5 | 98.21% |
    """
    print self.test_scenario8.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_10', 'data/new_fields2.json',
         u'outlier?', u'outlier?,species', '3', '14', '2', '2', 'precision',
         '5', '98.21%']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        dataset_adv.i_create_dataset_new_fields(
            self, json_file=example[2], model_fields=example[4])
        test_pred.i_check_create_new_dataset(self)
        test_pred.i_check_create_model(self)
        test_pred.i_create_nodes_analysis(
            self, min_nodes=example[5], max_nodes=example[6],
            nodes_step=example[7], k_fold=example[8], metric=example[9])
        test_pred.i_check_create_kfold_datasets(self, example[8])
        test_pred.i_check_create_kfold_models(self, example[8])
        test_pred.i_check_create_all_kfold_cross_validations(
            self, example[8])
        test_pred.i_check_node_threshold(
            self, example[10], example[9], example[11])
def test_scenario10(self):
    """
    Scenario: Successfully building feature selection from dataset setting objective:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validation with options "<options>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the predictions file "<predictions_file>" is like "<estimated_file>"

        Examples:
        | data | output | kfold | options | predictions_file | estimated_file |
        | ../data/iris.csv | ./scenario_a_12/evaluation | 2 | --exclude-features="petal length,sepal length" --predictions-csv | scenario_a_12/test/kfold2_pred/predictions.csv | check_files/analyze_predictions_iris.csv |
        | ../data/iris.csv | ./scenario_a_13/evaluation | 2 | --exclude-features="species,sepal length" --predictions-csv --objective 0 | scenario_a_13/test/kfold6_pred/predictions.csv | check_files/analyze_predictions_iris2.csv |
    """
    print self.test_scenario10.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_12/evaluation', '2',
         ' --exclude-features="petal length,sepal length" --predictions-csv',
         'scenario_a_12/test/kfold2_pred/predictions.csv',
         'check_files/analyze_predictions_iris.csv'],
        ['data/iris.csv', 'scenario_a_13/evaluation', '2',
         ' --exclude-features="species,sepal length" --predictions-csv'
         ' --objective 0',
         'scenario_a_13/test/kfold6_pred/predictions.csv',
         'check_files/analyze_predictions_iris2.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_options(
            self, k_folds=example[2], options=example[3])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(
            self, example[2])
        test_pred.i_check_predictions_file(self, example[4], example[5])
def test_scenario4(self):
    """
    Scenario: Successfully building feature selection from filtered dataset setting objective:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I filter out field "<field>" from dataset and log to "<output_dir>"
        And I check that the new dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | field | objective |output | output_dir | kfold | metric | selection | metric_value |
        | ../data/iris_2fd.csv | sepal length | species |./scenario_a_6/evaluation |./scenario_a_6 | 2 | recall | petal width | 100.00% |
    """
    print self.test_scenario4.__doc__
    examples = [
        ['data/iris_2fd.csv', 'sepal length', 'species',
         'scenario_a_6/evaluation', 'scenario_a_6', '2', 'recall',
         'petal width', '100.00%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        dataset.i_filter_field_from_dataset(self, field=example[1],
                                            output_dir=example[4])
        test_pred.i_check_create_new_dataset(self)
        test_pred.i_create_kfold_cross_validation_objective(
            self, k_folds=example[5], objective=example[2],
            metric=example[6])
        test_pred.i_check_create_kfold_datasets(self, example[5])
        test_pred.i_check_create_kfold_models(self, example[5])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[5])
        test_pred.i_check_feature_selection(self, example[7], example[6],
                                            example[8])
def test_scenario1(self):
    """
    Scenario: Successfully building a new sample from a dataset
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        Then I create a new sample from the dataset and get the sample using options "<sample_options>" storing logs in "<output_dir>"
        And I check that the sample has been created
        And the sample file is like "<sample_CSV>"

        Examples:
        |data |output_dir |sample_options | sample_CSV |
        |../data/iris.csv | ./scenario_smp_1 | --occurrence --sample-header --row-index | ./check_files/sample_iris.csv |
        |../data/iris.csv | ./scenario_smp_2 | --precision 0 --rows 10 --row-offset 10 --unique | ./check_files/sample_iris2.csv |
        |../data/iris.csv | ./scenario_smp_3 | --row-order-by="-petal length" --row-fields "petal length,petal width" --mode linear | ./check_files/sample_iris3.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris.csv', 'scenario_smp_1',
         '--occurrence --sample-header --row-index',
         'check_files/sample_iris.csv'],
        ['data/iris.csv', 'scenario_smp_2',
         '--precision 0 --rows 10 --row-offset 10 --unique',
         'check_files/sample_iris2.csv'],
        ['data/iris.csv', 'scenario_smp_3',
         '--row-order-by="-petal length" '
         '--row-fields "petal length,petal width" --mode linear',
         'check_files/sample_iris3.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset.i_create_dataset(self, data=example[0],
                                 output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_sample.i_create_sample(self, options=example[2],
                                    output_dir=example[1])
        test_sample.i_check_create_sample(self)
        test_sample.i_check_sample_file(self, check_sample_file=example[3])
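# Rough CLI equivalent (a sketch; the sample options come straight from the
# Examples table and are passed through verbatim by i_create_sample):
#   bigmler sample --dataset <dataset_id> --precision 0 --rows 10 \
#       --row-offset 10 --unique --output-dir scenario_smp_2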
def test_scenario2(self):
    """
    Scenario: Successfully building feature selection from dataset:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>
        And I generate a report from the output directory
        And a symlink file is generated in the reports directory

        Examples:
        | data | output | kfold | metric | selection | metric_value |
        | ../data/iris_2f.csv | ./scenario_a_2/evaluation | 2 | accuracy | petal width | 100.00% |
        | ../data/iris_2f.csv | ./scenario_a_3/evaluation | 2 | phi | petal width | 1 |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris_2f.csv', 'scenario_a_2/evaluation', '2', 'accuracy',
         'petal width', '100.00%'],
        ['data/iris_2f.csv', 'scenario_a_3/evaluation', '2', 'phi',
         'petal width', '1']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_metric(
            self, k_folds=example[2], metric=example[3])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_feature_selection(self, example[4], example[3],
                                            example[5])
        test_pred.i_generate_report(self)
        test_pred.is_symlink(self)
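# Sketch of the assumed underlying calls: a feature-selection analysis plus
# report generation (flag names are assumptions based on the analyze and
# report subcommands):
#   bigmler analyze --dataset <dataset_id> --features --k-folds 2 \
#       --optimize accuracy --output scenario_a_2/evaluation
#   bigmler report --from-dir scenario_a_2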
def test_scenario3(self):
    """
    Scenario: Successfully building feature selection from dataset setting objective:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations for "<objective>" improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | objective |output | kfold | metric | selection | metric_value |
        | ../data/iris_2f.csv | 0 |./scenario_a_5/evaluation | 2 | r_squared | species | 0.352845 |
        | ../data/iris_2f.csv | 0 |./scenario_a_8/evaluation | 2 | mean_squared_error | species | 0.475200 |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris_2f.csv', '0', 'scenario_a_5/evaluation', '2',
         'r_squared', 'species', '0.352845'],
        ['data/iris_2f.csv', '0', 'scenario_a_8/evaluation', '2',
         'mean_squared_error', 'species', '0.475200']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_objective(
            self, k_folds=example[3], objective=example[1],
            metric=example[4])
        test_pred.i_check_create_kfold_datasets(self, example[3])
        test_pred.i_check_create_kfold_models(self, example[3])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[3])
        test_pred.i_check_feature_selection(self, example[5], example[4],
                                            example[6])
def test_scenario2(self):
    """
    Scenario: Successfully retraining from a model using sampled dataset
        Given I create a BigML balanced model from "<data>" sampling 50% of data and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I retrain the model from "<data>" and store logs in "<output_dir_ret>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        Then I check that the model has doubled its rows
        And I check that the model is balanced

        Examples:
        |data |output_dir | output_dir_ret |
        |../data/iris.csv | ./scenario_rt_2 |./scenario_rt_2b |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris.csv', 'scenario_rt_2', 'scenario_rt_2b'],
        ['https://static.bigml.com/csv/iris.csv', 'scenario_rt_2c',
         'scenario_rt_2d']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_balanced_model_from_sample(
            self, data=example[0], output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_dataset(self, suffix='gen ')
        test_pred.i_check_create_model(self)
        test_pred.i_retrain_model(self, data=example[0],
                                  output_dir=example[2])
        # A new source is only uploaded when retraining from a local file;
        # retraining from a URL reuses the remote data.
        if not example[0].startswith("https"):
            test_pred.i_check_create_source(self)
        execute_steps.i_check_create_execution(self,
                                               number_of_executions=2)
        test_pred.i_check_create_model_in_execution(self)
        test_pred.i_check_model_double(self)
        test_pred.i_check_model_is_balanced(self)
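# The retrain step runs through a WhizzML execution (hence the
# i_check_create_execution call above). A rough CLI equivalent would be
# something like:
#   bigmler retrain --id <model_id> ...
# (the exact data-appending flags are an assumption; consult
# `bigmler retrain --help` for the real signature)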
def test_scenario6(self):
    """
    Scenario: Successfully building feature selection from dataset excluding features:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations excluding "<features>" with separator "<args_separator>" improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | output | kfold | features | args_separator | metric | selection | metric_value |
        | ../data/iris.csv | ./scenario_a_7/evaluation | 2 | petal length!sepal width | ! | accuracy | petal width | 95.33% |
    """
    print self.test_scenario6.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_7/evaluation', '2',
         'petal length!sepal width', '!', 'accuracy', 'petal width',
         '95.33%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_separator_metric_no_fields(
            self, k_folds=example[2], features=example[3],
            args_separator=example[4], metric=example[5])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_feature_selection(self, example[6], example[5],
                                            example[7])
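# Sketch (assumed flags): excluded features listed with a custom separator,
# so field names containing commas stay intact:
#   bigmler analyze --dataset <dataset_id> --features --k-folds 2 \
#       --exclude-features "petal length!sepal width" --args-separator "!" \
#       --optimize accuracy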
def test_scenario7(self):
    """
    Scenario: Successfully building feature selection for a category from dataset:
        Given I create BigML dataset uploading train "<data>" file with attributes "<attributes>" in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" for category "<category>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | attributes | output | kfold | metric | category | selection | metric_value |
        | ../data/spam.csv | ../data/spam_attributes.json |./scenario_a_9/evaluation | 2 | recall | spam | Message | 61.24% |
    """
    print self.test_scenario7.__doc__
    examples = [
        ['data/spam.csv', 'data/spam_attributes.json',
         'scenario_a_9/evaluation', '2', 'recall', 'spam', 'Message',
         '61.24%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset_with_attributes(
            self, data=example[0], attributes=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_metric_category(
            self, k_folds=example[3], metric=example[4],
            category=example[5])
        test_pred.i_check_create_kfold_datasets(self, example[3])
        test_pred.i_check_create_kfold_models(self, example[3])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[3])
        test_pred.i_check_feature_selection(self, example[6], example[4],
                                            example[7])
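# Sketch of the per-category optimization (flag names are an assumption
# based on the analyze subcommand): recall is maximized for the "spam"
# class only, rather than averaged over all classes:
#   bigmler analyze --dataset <dataset_id> --features --k-folds 2 \
#       --optimize recall --optimize-category spam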
def test_scenario5(self):
    """
    Scenario: Successfully building nodes threshold analysis from dataset file:
        Given I create BigML dataset uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML nodes analysis from dataset file from <min_nodes> to <max_nodes> by <nodes_step> with <kfold>-cross-validation improving "<metric>"
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best node threshold is "<node_threshold>", with "<metric>" of <metric_value>

        Examples:
        | data | output | min_nodes | max_nodes | nodes_step | kfold | metric | node_threshold | metric_value |
        | ../data/iris.csv | ./scenario_a_4/evaluation | 3 | 14 | 2 | 2 | precision | 9 | 94.71% |
    """
    print self.test_scenario5.__doc__
    examples = [
        ['data/iris.csv', 'scenario_a_4/evaluation', '3', '14', '2', '2',
         'precision', '9', '94.71%']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset(self, data=example[0], output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_nodes_analysis_from_dataset_file(
            self, min_nodes=example[2], max_nodes=example[3],
            nodes_step=example[4], k_fold=example[5], metric=example[6])
        test_pred.i_check_create_kfold_datasets(self, example[5])
        test_pred.i_check_create_kfold_models(self, example[5])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[5])
        test_pred.i_check_node_threshold(self, example[7], example[6],
                                         example[8])
def test_scenario2(self):
    """
    Scenario: Successfully building association from source
        Given I have previously executed "<scenario>" or reproduce it with arguments <kwargs>
        And I create BigML association using source and log resources in "<output_dir>"
        And I check that the dataset has been created
        And I check that the association has been created

        Examples:
        |scenario | kwargs | output_dir |
        | scenario_ass_1| {"data": "../data/iris.csv", "output_dir": "./scenario_ass_1/"} | ./scenario_ass_2 |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['scenario_ass_1',
         '{"data": "data/iris.csv", "output_dir": "scenario_ass_1"}',
         'scenario_ass_2']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_have_previous_scenario_or_reproduce_it(
            self, example[0], example[1])
        test_association.i_create_association_from_source(
            self, output_dir=example[2])
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_association(self)
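# Sketch (assumed form): building an association from the source created in
# the reused scenario, e.g.
#   bigmler association --source <source_id> --output-dir scenario_ass_2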
def test_scenario3(self):
    """
    Scenario: Successfully building test anomaly score predictions from test split in a dataset:
        Given I create BigML resources uploading train "<data>" file to find anomaly scores with test split "<test_split>" remotely saved to dataset with no CSV output and log resources in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the train dataset has been created
        And I check that the dataset has been created from the test file
        And I check that the batch anomaly scores prediction has been created
        Then I check that the batch anomaly scores dataset exists
        And no local CSV file is created

        Examples:
        | data | test_split | output_dir |
        | ../data/iris.csv | 0.2 | ./scenario_ab_3 |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris.csv', '0.2', 'scenario_ab_3']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_all_anomaly_resources_with_test_split_no_CSV(
            self, data=example[0], test_split=example[1],
            output_dir=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_pred.i_check_create_dataset(self, suffix='train ')
        test_pred.i_check_create_dataset(self, suffix='test ')
        test_batch.i_check_create_batch_anomaly_scores(self)
        test_anomaly.i_check_create_batch_anomaly_score_dataset(self)
        test_anomaly.i_check_no_local_CSV(self)
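# Sketch of the assumed command: a remote anomaly-score batch run on a 20%
# test split, keeping the scores as a dataset instead of a local CSV:
#   bigmler anomaly --train data/iris.csv --test-split 0.2 --remote \
#       --to-dataset --no-csv --output-dir scenario_ab_3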
def test_scenario3(self):
    """
    Scenario: Successfully building remote test centroid predictions from scratch with prediction fields:
        Given I create BigML resources uploading train "<data>" file to find centroids for "<test>" remotely with prediction fields "<prediction_fields>" and log resources in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the cluster has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch centroid prediction has been created
        And I check that the centroids are ready
        Then the local centroids file is like "<predictions_file>"

        Examples:
        | data | test | prediction_fields | output | predictions_file |
        | ../data/grades.csv | ../data/test_grades.csv | Assignment |./scenario_cb_3_r/centroids.csv | ./check_files/centroids_grades_field.csv |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/grades.csv', 'data/test_grades.csv', 'Assignment',
         'scenario_cb_3_r/centroids.csv',
         'check_files/centroids_grades_field.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_cluster.i_create_all_cluster_resources_with_prediction_fields(
            self, data=example[0], test=example[1],
            prediction_fields=example[2], output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_cluster(self)
        test_pred.i_check_create_test_source(self)
        test_pred.i_check_create_test_dataset(self)
        batch_pred.i_check_create_batch_centroid(self)
        test_cluster.i_check_create_centroids(self)
        test_pred.i_check_predictions(self, example[4])
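# Sketch (assumed flags) limiting the fields copied into the batch centroid
# output:
#   bigmler cluster --train data/grades.csv --test data/test_grades.csv \
#       --remote --prediction-fields "Assignment" \
#       --output scenario_cb_3_r/centroids.csv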
def test_scenario2(self):
    """
    Scenario: Successfully building test anomaly score predictions from test split:
        Given I create BigML resources uploading train "<data>" file to find anomaly scores with test split "<test_split>" remotely and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the train dataset has been created
        And I check that the dataset has been created from the test file
        And I check that the batch anomaly scores prediction has been created
        And I check that the anomaly scores are ready
        Then the local anomaly scores file is like "<predictions_file>"

        Examples:
        | data | test_split | output |predictions_file |
        | ../data/iris.csv | 0.2 | ./scenario_ab_2/anomalies.csv | ./check_files/anomaly_scores_iris.csv |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris.csv', '0.2', 'scenario_ab_2/anomalies.csv',
         'check_files/anomaly_scores_iris.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_all_anomaly_resources_with_test_split(
            self, data=example[0], test_split=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_pred.i_check_create_dataset(self, suffix='train ')
        test_pred.i_check_create_dataset(self, suffix='test ')
        test_batch.i_check_create_batch_anomaly_scores(self)
        test_anomaly.i_check_create_anomaly_scores(self)
        test_anomaly.i_check_anomaly_scores(self, example[3])
def test_scenario1(self):
    """
    Scenario: Successfully building test anomaly score predictions from scratch:
        Given I create BigML resources uploading train "<data>" file to find anomaly scores for "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the anomaly detector has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch anomaly scores prediction has been created
        And I check that the anomaly scores are ready
        Then the local anomaly scores file is like "<predictions_file>"

        Examples:
        | data | test | fields_map | output |predictions_file |
        | ../data/grades.csv | ../data/grades_perm.csv | ../data/grades_fields_map_perm.csv | ./scenario_ab_1_r/anomalies.csv | ./check_files/anomaly_scores_grades.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/grades.csv', 'data/grades_perm.csv',
         'data/grades_fields_map_perm.csv', 'scenario_ab_1_r/anomalies.csv',
         'check_files/anomaly_scores_grades.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_anomaly.i_create_all_anomaly_resources_with_mapping(
            self, data=example[0], test=example[1], fields_map=example[2],
            output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_anomaly.i_check_create_anomaly(self)
        test_pred.i_check_create_test_source(self)
        test_pred.i_check_create_test_dataset(self)
        test_batch.i_check_create_batch_anomaly_scores(self)
        test_anomaly.i_check_create_anomaly_scores(self)
        test_anomaly.i_check_anomaly_scores(self, example[4])
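# Sketch of the assumed call: the test file has permuted columns, so a
# fields map aligns them with the training fields:
#   bigmler anomaly --train data/grades.csv --test data/grades_perm.csv \
#       --fields-map data/grades_fields_map_perm.csv --remote \
#       --output scenario_ab_1_r/anomalies.csv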
def setup_scenario02(self):
    """
    Scenario: Successfully building text source from local file:
        Given I create BigML dataset uploading train "<data>" file with attributes "<attributes>" in "<output>"
        And I check that the source has been created
        Then I check that the dataset has been created

        Examples:
        | data | attributes | output |
        | ../data/spam.csv | ../data/spam_attributes.json | scenario2_td/topic_distributions.csv |
    """
    print self.setup_scenario02.__doc__
    examples = [
        ['data/spam.csv', 'data/spam_attributes.json',
         'scenario2_td/topic_distributions.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_dataset_with_attributes(
            self, data=example[0], attributes=example[1], output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
def test_scenario2(self):
    """
    Scenario: Successfully updating a dataset with attributes in a JSON file
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I update the dataset using the specs in JSON file "<new_fields>"
        Then I check that property "<property>" for field id "<field_id>" is "<value>" of type "<type>"

        Examples:
        |data |output_dir |new_fields | property | field_id | value | type |
        |../data/iris.csv | ./scenario_d_2 |../data/attributes.json| preferred | 000001 | false | boolean |
        |../data/iris.csv | ./scenario_d_2_b |../data/attributes_col.json| preferred | 000001 | false | boolean |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris.csv', 'scenario_d_2', 'data/attributes.json',
         'preferred', '000001', 'false', 'boolean'],
        ['data/iris.csv', 'scenario_d_2_b', 'data/attributes_col.json',
         'preferred', '000001', 'false', 'boolean']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        dataset_adv.i_update_dataset_new_properties(self,
                                                    json_file=example[2])
        dataset_adv.i_check_dataset_has_property(
            self, attribute=example[3], field_id=example[4],
            value=example[5], type=example[6])
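# Sketch (assumed flag name): the update step presumably feeds the JSON
# specs through a dataset attributes option, e.g.
#   bigmler --dataset <dataset_id> --dataset-attributes data/attributes.json \
#       --output-dir scenario_d_2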
def test_scenario1(self):
    """
    Scenario: Successfully building test predictions from dataset specifying objective field and model fields
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML resources using dataset, objective field <objective> and model fields <fields> to test "<test>" and log predictions in "<output>"
        And I check that the model has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        |data | output_dir | test | output |predictions_file | objective | fields |
        | ../data/iris_2fb.csv| ./scénario1 | ../data/test_iris2fb.csv | ./scénario1/predictions.csv | ./check_files/predictions_iris_2fb.csv | spécies | "pétal width" |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris_2fb.csv', u'scénario1', 'data/test_iris2fb.csv',
         u'scénario1/predictions.csv',
         'check_files/predictions_iris_2fb.csv', u'spécies',
         u'"pétal width"']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_create_resources_from_dataset_objective_model(
            self, objective=example[5], fields=example[6],
            test=example[2], output=example[3])
        test_pred.i_check_create_model(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[4])
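# Rough CLI equivalent (a sketch; note the non-ASCII objective, field and
# directory names this scenario deliberately exercises):
#   bigmler --dataset <dataset_id> --objective "spécies" \
#       --model-fields "pétal width" --test data/test_iris2fb.csv \
#       --output scénario1/predictions.csv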
def test_scenario1(self):
    """
    Scenario: Successfully building test predictions from scratch:
        Given I create BigML resources uploading train "<data>" file to test "<test>" remotely with mapping file "<fields_map>" and log predictions in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I check that the source has been created from the test file
        And I check that the dataset has been created from the test file
        And I check that the batch prediction has been created
        And I check that the predictions are ready
        Then the local prediction file is like "<predictions_file>"

        Examples:
        | data | test | fields_map | output |predictions_file |
        | ../data/grades.csv | ../data/test_grades.csv | ../data/grades_fields_map.csv | ./scenario_r1_r/predictions.csv | ./check_files/predictions_grades.csv |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/grades.csv', 'data/test_grades.csv',
         'data/grades_fields_map.csv', 'scenario_r1_r/predictions.csv',
         'check_files/predictions_grades.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_all_resources_batch_map(
            self, data=example[0], test=example[1], fields_map=example[2],
            output=example[3])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_batch_pred.i_check_create_test_source(self)
        test_batch_pred.i_check_create_test_dataset(self)
        test_batch_pred.i_check_create_batch_prediction(self)
        test_pred.i_check_create_predictions(self)
        test_pred.i_check_predictions(self, example[4])
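# Sketch of the assumed remote (batch) prediction call with a fields map:
#   bigmler --train data/grades.csv --test data/test_grades.csv \
#       --fields-map data/grades_fields_map.csv --remote \
#       --output scenario_r1_r/predictions.csv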
def test_scenario1(self):
    """
    Scenario: Successfully building feature selection from dataset in dev mode:
        Given I want to use api in DEV mode
        And I create BigML dataset in dev mode uploading train "<data>" file in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I create BigML feature selection <kfold>-fold cross-validations improving "<metric>" in dev mode
        And I check that the <kfold>-datasets have been created
        And I check that the <kfold>-models have been created
        And I check that all the <kfold>-fold cross-validations have been created
        Then the best feature selection is "<selection>", with "<metric>" of <metric_value>

        Examples:
        | data | output | kfold | metric | selection | metric_value |
        | ../data/iris_2f.csv | ./scenario_a_2/evaluation | 2 | accuracy | petal width | 100.00% |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris_2f.csv', 'scenario_a_2/evaluation', '2', 'accuracy',
         'petal width', '100.00%']]
    for example in examples:
        print "\nTesting with:\n", example
        common.i_want_api_dev_mode(self)
        test_pred.i_create_dev_dataset(self, data=example[0],
                                       output=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_create_kfold_cross_validation_in_dev(
            self, k_folds=example[2], metric=example[3])
        test_pred.i_check_create_kfold_datasets(self, example[2])
        test_pred.i_check_create_kfold_models(self, example[2])
        test_pred.i_check_create_all_kfold_cross_validations(self, example[2])
        test_pred.i_check_feature_selection(self, example[4], example[3],
                                            example[5])
def test_scenario3(self):
    """
    Scenario: Successfully building an objective weighted model
        Given I create a BigML objective weighted model from "<data>" using the objective weights in file "<path>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        Then I check that the model uses as objective weights "<weights>"

        Examples:
        |data |path | output_dir | weights |
        |../data/iris.csv | ../data/weights.csv |./scenario_w_3 | [["Iris-setosa",5], ["Iris-versicolor",3]] |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris.csv', 'data/weights.csv', 'scenario_w_3',
         '[["Iris-setosa",5], ["Iris-versicolor",3]]']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_objective_weighted_model(
            self, data=example[0], path=example[1], output_dir=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_objective_weighted_model(self, weights=example[3])
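# Sketch (assumed flag): per-class weights read from a CSV file, e.g.
#   bigmler --train data/iris.csv --objective-weights data/weights.csv \
#       --output-dir scenario_w_3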
def setup_scenario02(self):
    """
    Scenario: Successfully building test forecasts from start:
        Given I create BigML time series resources uploading train "<data>" file to test "<test>" and log forecasts in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the time series has been created
        Then the local forecasts file is like "<forecasts_file>"

        Examples:
        | data | test | output | forecasts_file |
        | ../data/grades.csv | ../data/test_grades.json | ./scenario1_ts/forecasts | ./check_files/forecasts_grades_final.csv |
    """
    print self.setup_scenario02.__doc__
    examples = [
        ['data/grades.csv', 'data/test_grades.json',
         'scenario1_ts/forecasts',
         'check_files/forecasts_grades_final.csv',
         'scenario1_ts/forecasts_000005.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        ts_pred.i_create_all_ts_resources(self, example[0], example[1],
                                          example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        ts_pred.i_check_create_time_series(self)
        ts_pred.i_check_forecasts(self, example[3])
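# Sketch of the assumed time-series call behind the step helpers:
#   bigmler time-series --train data/grades.csv \
#       --test data/test_grades.json --output scenario1_ts/forecasts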
def test_scenario3(self):
    """
    Scenario: Successfully exporting a dataset to a CSV file
        Given I create a BigML dataset from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I export the dataset to the CSV file "<csv_file>"
        Then file "<csv_file>" is like file "<data>"

        Examples:
        |data |output_dir |csv_file |
        |../data/iris.csv | ./scenario_d_3 |dataset.csv |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris.csv', 'scenario_d_3', 'dataset.csv']]
    for example in examples:
        print "\nTesting with:\n", example
        dataset_adv.i_create_dataset(self, data=example[0],
                                     output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        dataset_adv.i_export_the_dataset(self, example[2])
        dataset_adv.i_files_equal(self, example[2], example[0])
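# Sketch (assumed flag name) of the export step:
#   bigmler --dataset <dataset_id> --to-csv dataset.csv \
#       --output-dir scenario_d_3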
def test_scenario1(self):
    """
    Scenario: Successfully building a balanced model
        Given I create a BigML balanced model from "<data>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        Then I check that the model is balanced

        Examples:
        |data |output_dir |
        |../data/iris.csv | ./scenario_w_1 |
    """
    print self.test_scenario1.__doc__
    examples = [
        ['data/iris.csv', 'scenario_w_1']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_balanced_model(self, data=example[0],
                                          output_dir=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_model_is_balanced(self)
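# Sketch (assumed form): --balance asks BigML to weight instances so that
# all objective classes contribute evenly to the model:
#   bigmler --train data/iris.csv --balance --output-dir scenario_w_1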
def test_scenario2(self):
    """
    Scenario: Successfully building a field weighted model
        Given I create a BigML field weighted model from "<data>" using field "<field>" as weight and "<objective>" and store logs in "<output_dir>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        Then I check that the model uses as weight "<field_id>"

        Examples:
        |data |field | output_dir | field_id | objective |
        |../data/iris_w.csv | weight |./scenario_w_2 | 000005 |000004 |
    """
    print self.test_scenario2.__doc__
    examples = [
        ['data/iris_w.csv', 'weight', 'scenario_w_2', '000005', 'species']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_weighted_field_model(
            self, data=example[0], field=example[1],
            output_dir=example[2], objective=example[4])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        test_pred.i_check_weighted_model(self, field=example[3])
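# Sketch (assumed flags): one field supplies per-instance weights while
# another is set as the objective:
#   bigmler --train data/iris_w.csv --weight-field weight \
#       --objective species --output-dir scenario_w_2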
def test_scenario3(self):
    """
    Scenario: Successfully building evaluations from start:
        Given I create BigML resources uploading train "<data>" file to create model and log in "<output>"
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the model has been created
        And I evaluate "<test>" with proportional missing strategy
        And I check that the source has been created
        And I check that the dataset has been created
        And I check that the evaluation has been created
        Then the evaluation file is like "<json_evaluation_file>"

        Examples:
        | data | test | output | json_evaluation_file |
        | ../data/iris.csv | ../data/iris_nulls.csv | ./scenario_mis_3/evaluation | ./check_files/evaluation_iris_nulls.json |
    """
    print self.test_scenario3.__doc__
    examples = [
        ['data/iris.csv', 'data/iris_nulls.csv',
         'scenario_mis_3/evaluation',
         'check_files/evaluation_iris_nulls.json']]
    for example in examples:
        print "\nTesting with:\n", example
        test_pred.i_create_all_resources_to_model(self, data=example[0],
                                                  output=example[2])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self, suffix=None)
        test_pred.i_check_create_model(self)
        evaluation.i_create_proportional_to_evaluate(self, test=example[1])
        test_pred.i_check_create_source(self)
        test_pred.i_check_create_dataset(self)
        test_pred.i_check_create_evaluation(self)
        evaluation.then_the_evaluation_file_is_like(self, example[3])
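# Sketch of the assumed evaluation call: the proportional missing strategy
# distributes instances with null fields across matching tree branches
# instead of following the majority branch:
#   bigmler --dataset <dataset_id> --test data/iris_nulls.csv --evaluate \
#       --missing-strategy proportional --output scenario_mis_3/evaluation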