Example #1
0
def test_structured_data_from_csv_partial_col_type_classifier(tmp_dir):
    """Fit a classifier on the training CSV using the partial column types.

    The test has no assertions; it passes when construction and ``fit``
    complete without raising.
    """
    classifier = ak.StructuredDataClassifier(
        column_types=common.PARTIAL_COLUMN_TYPES_FROM_CSV,
        directory=tmp_dir,
        max_trials=1,
    )
    classifier.fit(
        x=common.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=common.TEST_FILE_PATH,
    )
def test_structured_data_get_col_names_from_df(fit, tmp_path):
    """Check the first column name recorded on the classifier's first input.

    After fitting on the training CSV with ``"survived"`` as the target,
    the first flattened input must report ``"sex"`` as its first column.
    """
    classifier = ak.StructuredDataClassifier(
        seed=test_utils.SEED,
        directory=tmp_path,
    )
    classifier.fit(x=test_utils.TRAIN_CSV_PATH, y="survived")

    first_input = nest.flatten(classifier.inputs)[0]
    assert first_input.column_names[0] == "sex"
Example #3
0
def test_structured_data_from_csv_classifier(tmp_dir):
    """Fit on the training CSV and check the shape of the predictions.

    Predictions for ``common.csv_test('classification')`` must have one
    row per test item and a single column.
    """
    classifier = ak.StructuredDataClassifier(max_trials=1, directory=tmp_dir)
    classifier.fit(
        x=common.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=common.TEST_FILE_PATH,
    )

    x_test = common.csv_test('classification')
    predictions = classifier.predict(x_test)
    assert predictions.shape == (len(x_test), 1)
Example #4
0
def test_raise_error_unknown_str_in_col_type(tmp_path):
    """Constructing a classifier with the column type 'num' must raise.

    The raised ``ValueError`` message has to mention the accepted
    ``"categorical"`` option.
    """
    unknown_types = {'age': 'num', 'parch': 'categorical'}

    with pytest.raises(ValueError) as info:
        ak.StructuredDataClassifier(
            column_types=unknown_types,
            directory=tmp_path,
            seed=utils.SEED,
        )

    message = str(info.value)
    assert 'Column_types should be either "categorical"' in message
Example #5
0
def test_structured_clf_fit_call_auto_model_fit(fit, tmp_path):
    """Check that fitting a StructuredDataClassifier invokes ``fit``.

    ``fit`` is assumed to be a mock standing in for the underlying
    auto-model ``fit`` (injected from outside this block -- confirm
    against the test module's patch decorator or fixture).
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path, seed=utils.SEED)

    auto_model.fit(
        x=utils.generate_structured_data(num_instances=100),
        y=utils.generate_one_hot_labels(num_instances=100, num_classes=3))

    # ``called`` is the real Mock flag. ``is_called`` would be an
    # auto-created child mock, which is always truthy, so the assertion
    # could never fail.
    assert fit.called
Example #6
0
def test_structured_data_from_numpy_classifier(tmp_dir):
    """Fit a classifier on generated data with random integer labels.

    The same data is used for training and validation. There are no
    assertions; the test passes when ``fit`` completes without raising.
    """
    sample_count = 500
    features = common.structured_data(sample_count)
    labels = np.random.randint(0, 3, sample_count)

    classifier = ak.StructuredDataClassifier(max_trials=1, directory=tmp_dir)
    classifier.fit(
        features,
        labels,
        epochs=2,
        validation_data=(features, labels),
    )
def test_structured_clf_predict_csv_call_automodel_predict(
        predict, fit, tmp_path):
    """Check that predicting from a CSV path invokes ``predict``.

    ``predict`` and ``fit`` are assumed to be mocks injected from outside
    this block (confirm against the test module's patch decorators or
    fixtures).
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path,
                                             seed=utils.SEED)

    auto_model.fit(x=utils.TRAIN_CSV_PATH, y="survived")
    auto_model.predict(x=utils.TEST_CSV_PATH)

    # ``called`` is the real Mock flag. ``is_called`` would be an
    # auto-created child mock, which is always truthy, so the assertion
    # could never fail.
    assert predict.called
def test_structured_clf_evaluate_call_automodel_evaluate(
        evaluate, fit, tmp_path):
    """Check that evaluating from a CSV path invokes ``evaluate``.

    ``evaluate`` and ``fit`` are assumed to be mocks injected from outside
    this block (confirm against the test module's patch decorators or
    fixtures).
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path,
                                             seed=utils.SEED)

    auto_model.fit(x=utils.TRAIN_CSV_PATH, y="survived")
    auto_model.evaluate(x=utils.TRAIN_CSV_PATH, y="survived")

    # ``called`` is the real Mock flag. ``is_called`` would be an
    # auto-created child mock, which is always truthy, so the assertion
    # could never fail.
    assert evaluate.called
Example #9
0
def test_structured_data_classifier_transform_new_data(tmp_dir):
    """Fit on the first 100 entries and validate on the remaining 100.

    There are no assertions; the test passes when ``fit`` completes
    without raising.
    """
    total_count, split_at = 200, 100

    features = common.structured_data(total_count)
    train_features, holdout_features = features[:split_at], features[split_at:]

    labels = np.random.randint(0, 3, total_count)
    train_labels, holdout_labels = labels[:split_at], labels[split_at:]

    classifier = ak.StructuredDataClassifier(max_trials=1, directory=tmp_dir)
    classifier.fit(
        train_features,
        train_labels,
        epochs=2,
        validation_data=(holdout_features, holdout_labels),
    )
Example #10
0
def test_structured_data_from_csv_less_col_name_classifier(tmp_dir):
    """Expect a ValueError about the length of ``column_names``.

    The classifier is built with ``common.LESS_COLUMN_NAMES_FROM_CSV`` and
    fitted on the training CSV; either step may raise.
    """
    with pytest.raises(ValueError) as info:
        classifier = ak.StructuredDataClassifier(
            column_names=common.LESS_COLUMN_NAMES_FROM_CSV,
            max_trials=1,
            directory=tmp_dir,
        )
        classifier.fit(
            x=common.TRAIN_FILE_PATH,
            y='survived',
            epochs=2,
            validation_data=common.TEST_FILE_PATH,
        )

    error_text = str(info.value)
    assert 'Expect column_names to have length' in error_text
Example #11
0
def test_structured_data_from_csv_false_col_type_classifier(tmp_dir):
    """Expect a ValueError naming the accepted column type options.

    The classifier is built with ``common.FALSE_COLUMN_TYPES_FROM_CSV`` and
    fitted on the training CSV; either step may raise.
    """
    with pytest.raises(ValueError) as info:
        classifier = ak.StructuredDataClassifier(
            column_types=common.FALSE_COLUMN_TYPES_FROM_CSV,
            max_trials=1,
            directory=tmp_dir,
        )
        classifier.fit(
            x=common.TRAIN_FILE_PATH,
            y='survived',
            epochs=2,
            validation_data=common.TEST_FILE_PATH,
        )

    error_text = str(info.value)
    assert 'Column_types should be either "categorical"' in error_text
Example #12
0
 def build_pipeline(self):
     """
     Makes an AutoKeras pipeline matching ``self.problem_type``.

     Returns:
         ``ak.StructuredDataClassifier`` when ``self.problem_type`` is
         "classification", ``ak.StructuredDataRegressor`` when it is
         "regression"; both are constructed with ``self.automl_settings``
         expanded as keyword arguments.

     Raises:
         ValueError: if ``self.problem_type`` is neither of the two
             supported values.
     """
     if self.problem_type == "classification":
         return ak.StructuredDataClassifier(**self.automl_settings)
     if self.problem_type == "regression":
         return ak.StructuredDataRegressor(**self.automl_settings)
     # Fail with a clear message instead of the UnboundLocalError that a
     # fall-through to an unassigned local would produce.
     raise ValueError(
         "Unsupported problem_type {!r}; expected 'classification' or "
         "'regression'.".format(self.problem_type)
     )
Example #13
0
def test_structured_data_from_csv_col_type_mismatch_classifier(tmp_dir):
    """Expect a ValueError reporting mismatched column names and types.

    The classifier is built with ``common.COLUMN_TYPES_FROM_CSV`` and
    fitted on the training CSV; either step may raise.
    """
    with pytest.raises(ValueError) as info:
        classifier = ak.StructuredDataClassifier(
            column_types=common.COLUMN_TYPES_FROM_CSV,
            max_trials=1,
            directory=tmp_dir,
        )
        classifier.fit(
            x=common.TRAIN_FILE_PATH,
            y='survived',
            epochs=2,
            validation_data=common.TEST_FILE_PATH,
        )

    error_text = str(info.value)
    assert 'Column_names and column_types are mismatched.' in error_text
Example #14
0
def test_structured_data_from_numpy_classifier(tmp_dir):
    """Fit on 400 generated entries and check predictions for the other 100.

    Predictions for the held-out entries must have one row per held-out
    label and a single column.
    """
    total_count, train_count = 500, 400

    features = common.structured_data(total_count)
    train_x, test_x = features[:train_count], features[train_count:]

    labels = np.random.randint(0, 3, total_count)
    train_y, test_y = labels[:train_count], labels[train_count:]

    classifier = ak.StructuredDataClassifier(max_trials=1, directory=tmp_dir)
    classifier.fit(train_x, train_y, epochs=2,
                   validation_data=(train_x, train_y))

    predictions = classifier.predict(test_x)
    assert predictions.shape == (len(test_y), 1)
def test_structured_clf_fit_call_auto_model_fit(fit, tmp_path):
    """Check that fitting on string-typed CSV rows invokes ``fit``.

    ``fit`` is assumed to be a mock standing in for the underlying
    auto-model ``fit`` (injected from outside this block -- confirm
    against the test module's patch decorator or fixture).
    """
    auto_model = ak.StructuredDataClassifier(directory=tmp_path,
                                             seed=utils.SEED)

    auto_model.fit(
        # ``str`` replaces the ``np.unicode`` alias, which NumPy deprecated
        # in 1.20 and removed in 1.24; the resulting dtype is the same.
        x=pd.read_csv(utils.TRAIN_CSV_PATH).to_numpy().astype(str)[:100],
        y=utils.generate_one_hot_labels(num_instances=100, num_classes=3),
    )

    # ``called`` is the real Mock flag. ``is_called`` would be an
    # auto-created child mock, which is always truthy, so the assertion
    # could never fail.
    assert fit.called
Example #16
0
def test_structured_data_from_numpy_col_name_classifier(tmp_dir):
    """Fit on generated data with explicit column names.

    The same data is used for training and validation. There are no
    assertions; the test passes when ``fit`` completes without raising.
    """
    sample_count = 500
    features = common.structured_data(sample_count)
    labels = np.random.randint(0, 3, sample_count)

    classifier = ak.StructuredDataClassifier(
        column_names=common.COLUMN_NAMES_FROM_NUMPY,
        max_trials=1,
        directory=tmp_dir,
    )
    classifier.fit(
        features,
        labels,
        epochs=2,
        validation_data=(features, labels),
    )
Example #17
0
def test_structured_data_clf_convert_csv_to_df_and_np(fit, tmp_path):
    """Check the argument types forwarded to ``fit`` for CSV inputs.

    After fitting with CSV paths, the first recorded call on ``fit`` must
    carry ``x`` as a pandas DataFrame and ``y`` as a NumPy array.
    """
    classifier = ak.StructuredDataClassifier(seed=utils.SEED, directory=tmp_path)

    classifier.fit(
        x=utils.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=(utils.TEST_FILE_PATH, 'survived'),
    )

    # Each recorded call is an (args, kwargs) pair; index 1 is the kwargs.
    forwarded = fit.call_args_list[0][1]
    assert isinstance(forwarded['x'], pandas.DataFrame)
    assert isinstance(forwarded['y'], np.ndarray)
Example #18
0
def test_titaninc_accuracy_over_77(tmp_path):
    """Train on the downloaded Titanic CSV and require accuracy >= 0.77.

    The second value returned by ``evaluate`` on the eval CSV is the one
    compared against the threshold.
    """
    train_url = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
    eval_url = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

    train_csv = tf.keras.utils.get_file("train.csv", train_url)
    eval_csv = tf.keras.utils.get_file("eval.csv", eval_url)

    classifier = ak.StructuredDataClassifier(directory=tmp_path, max_trials=10)
    classifier.fit(train_csv, "survived")

    results = classifier.evaluate(eval_csv, "survived")
    assert results[1] >= 0.77
def test_raise_error_unknown_str_in_col_type(tmp_path):
    """Constructing a classifier with the column type "num" must raise.

    The raised ``ValueError`` message has to mention the accepted
    ``"categorical"`` option.
    """
    unknown_types = {"age": "num", "parch": "categorical"}

    with pytest.raises(ValueError) as info:
        ak.StructuredDataClassifier(
            column_types=unknown_types,
            directory=tmp_path,
            seed=test_utils.SEED,
        )

    message = str(info.value)
    assert 'column_types should be either "categorical"' in message
Example #20
0
def test_structured_data_classifier_from_csv(init, fit):
    """Check that CSV inputs reach ``fit`` as a DataFrame and an ndarray.

    ``init`` must have been called, and the first recorded call on ``fit``
    must carry ``x`` as a pandas DataFrame and ``y`` as a NumPy array.

    NOTE(review): ``tmp_dir`` is not a parameter of this test, so it has
    to resolve to a module-level name -- confirm that this is intended.
    """
    classifier = ak.StructuredDataClassifier(
        directory=tmp_dir,
        max_trials=1,
        seed=common.SEED,
    )

    classifier.fit(
        x=common.TRAIN_FILE_PATH,
        y='survived',
        epochs=2,
        validation_data=common.TEST_FILE_PATH,
    )

    assert init.called
    # Each recorded call is an (args, kwargs) pair; index 1 is the kwargs.
    forwarded = fit.call_args_list[0][1]
    assert isinstance(forwarded['x'], pandas.DataFrame)
    assert isinstance(forwarded['y'], np.ndarray)
Example #21
0
def main():
    """Train a classifier on the Titanic CSV and print accuracy and fit time.

    The train and eval files are fetched from ``TRAIN_DATA_URL`` and
    ``TEST_DATA_URL``; only the ``fit`` call is timed.
    """
    train_csv = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
    eval_csv = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)
    classifier = ak.StructuredDataClassifier(
        max_trials=10,
        directory='tmp_dir',
        overwrite=True,
    )

    fit_started = timeit.default_timer()
    classifier.fit(train_csv, 'survived')
    fit_finished = timeit.default_timer()

    results = classifier.evaluate(eval_csv, 'survived')
    accuracy_percent = round(results[1] * 100, 2)
    elapsed_seconds = round(fit_finished - fit_started, 2)

    print('Accuracy: {accuracy}%'.format(accuracy=accuracy_percent))
    print('Total time: {time} seconds.'.format(time=elapsed_seconds))
0
def test_structured_data_from_numpy_classifier(tmp_dir):
    """Fit on 400 generated entries and check predictions for the other 100.

    Labels are one-hot with three classes, so predictions for the held-out
    entries must have one row per held-out label and three columns.
    """
    total_count, train_count = 500, 400

    features = common.generate_structured_data(total_count)
    train_x, test_x = features[:train_count], features[train_count:]

    labels = common.generate_one_hot_labels(num_instances=total_count,
                                            num_classes=3)
    train_y, test_y = labels[:train_count], labels[train_count:]

    classifier = ak.StructuredDataClassifier(
        seed=common.SEED,
        max_trials=1,
        directory=tmp_dir,
    )
    classifier.fit(train_x, train_y, epochs=2,
                   validation_data=(train_x, train_y))

    predictions = classifier.predict(test_x)
    assert predictions.shape == (len(test_y), 3)
def test_structured_data_col_type_no_name_error(tmp_path):
    """Expect a ValueError when column types are given without column names.

    The classifier receives ``column_types`` only and is fitted on random
    NumPy arrays; either step may raise.
    """
    types_only = {"age": "numerical", "parch": "categorical"}

    with pytest.raises(ValueError) as info:
        classifier = ak.StructuredDataClassifier(
            column_types=types_only,
            directory=tmp_path,
            seed=test_utils.SEED,
        )
        features = np.random.rand(100, 30)
        targets = np.random.rand(100, 1)
        classifier.fit(x=features, y=targets)

    message = str(info.value)
    assert "column_names must be specified" in message
def test_raise_error_unknown_name_in_col_type(tmp_path):
    """Constructing with column types that do not match the names must raise.

    ``column_types`` mentions "parch" while ``column_names`` lists "age"
    and "fare"; the ``ValueError`` message must report the mismatch.
    """
    declared_types = {"age": "numerical", "parch": "categorical"}
    declared_names = ["age", "fare"]

    with pytest.raises(ValueError) as info:
        ak.StructuredDataClassifier(
            column_types=declared_types,
            column_names=declared_names,
            directory=tmp_path,
            seed=utils.SEED,
        )

    message = str(info.value)
    assert "Column_names and column_types are mismatched" in message
Example #25
0
def test_structured_data_from_numpy_col_type_classifier(tmp_dir):
    """Expect the exact 'Column names must be specified.' ValueError.

    The classifier receives ``common.COLUMN_TYPES_FROM_NUMPY`` without
    column names and is fitted on generated data; either step may raise.
    """
    sample_count = 500
    features = common.structured_data(sample_count)
    labels = np.random.randint(0, 3, sample_count)

    with pytest.raises(ValueError) as info:
        classifier = ak.StructuredDataClassifier(
            column_types=common.COLUMN_TYPES_FROM_NUMPY,
            max_trials=1,
            directory=tmp_dir,
        )
        classifier.fit(
            features,
            labels,
            epochs=2,
            validation_data=(features, labels),
        )

    message = str(info.value)
    assert message == 'Column names must be specified.'
Example #26
0
def test_structured_data_classifier(tmp_path):
    """Fit, export and predict with a classifier on string-typed CSV rows.

    Up to 500 rows of the training CSV are loaded as strings; the first
    400 are used for training and validation. Labels are one-hot with
    three classes, so predictions for the held-out rows must have one row
    per held-out label and three columns.
    """
    num_data = 500
    num_train = 400
    # ``str`` replaces the ``np.unicode`` alias, which NumPy deprecated in
    # 1.20 and removed in 1.24; the resulting dtype is the same.
    data = pd.read_csv(utils.TRAIN_CSV_PATH).to_numpy().astype(str)[:num_data]
    x_train, x_test = data[:num_train], data[num_train:]
    y = utils.generate_one_hot_labels(num_instances=num_data, num_classes=3)
    y_train, y_test = y[:num_train], y[num_train:]
    clf = ak.StructuredDataClassifier(
        directory=tmp_path, max_trials=1, seed=utils.SEED
    )
    clf.fit(x_train, y_train, epochs=2, validation_data=(x_train, y_train))
    clf.export_model()
    assert clf.predict(x_test).shape == (len(y_test), 3)
Example #27
0
def test_structured_classifier(init, fit):
    """Check that constructing and fitting call ``init`` and ``fit``.

    The classifier is built with explicit column names and fitted on
    generated data that also serves as validation data.

    NOTE(review): ``tmp_dir`` is not a parameter of this test, so it has
    to resolve to a module-level name -- confirm that this is intended.
    """
    sample_count = 500
    features = common.generate_structured_data(sample_count)
    labels = common.generate_one_hot_labels(num_instances=sample_count,
                                            num_classes=3)

    classifier = ak.StructuredDataClassifier(
        column_names=common.COLUMN_NAMES_FROM_NUMPY,
        directory=tmp_dir,
        max_trials=1,
        seed=common.SEED,
    )
    classifier.fit(
        features,
        labels,
        epochs=2,
        validation_data=(features, labels),
    )

    assert init.called
    assert fit.called
def test_structured_data_input_name_type_mismatch_error(tmp_path):
    """Expect a ValueError when column types do not match column names.

    ``column_types`` uses the keys "_age" and "parch" while
    ``column_names`` lists "age" and "fare"; construction or fitting on
    the training CSV must report the mismatch.
    """
    declared_types = {"_age": "numerical", "parch": "categorical"}
    declared_names = ["age", "fare"]

    with pytest.raises(ValueError) as info:
        classifier = ak.StructuredDataClassifier(
            column_types=declared_types,
            column_names=declared_names,
            directory=tmp_path,
            seed=test_utils.SEED,
        )
        classifier.fit(x=test_utils.TRAIN_CSV_PATH, y="survived")

    message = str(info.value)
    assert "column_names and column_types are mismatched." in message
def test_structured_data_clf_convert_csv_to_df_and_np(fit, tmp_path):
    """Check the argument types forwarded to ``fit`` for CSV inputs.

    After fitting with CSV paths, the first recorded call on ``fit`` must
    carry ``x`` as a pandas DataFrame and ``y`` as a NumPy array.
    """
    classifier = ak.StructuredDataClassifier(seed=test_utils.SEED,
                                             directory=tmp_path)

    validation = (test_utils.TEST_CSV_PATH, "survived")
    classifier.fit(
        x=test_utils.TRAIN_CSV_PATH,
        y="survived",
        epochs=2,
        validation_data=validation,
    )

    # Each recorded call is an (args, kwargs) pair; index 1 is the kwargs.
    forwarded = fit.call_args_list[0][1]
    assert isinstance(forwarded["x"], pd.DataFrame)
    assert isinstance(forwarded["y"], np.ndarray)
Example #30
0
def train_autokeras(X, Y, x, y, modelfile, max_trials=10, epochs=600):
    """Train an AutoKeras structured-data classifier, save it and score it.

    Args:
        X: Training features passed to ``fit``.
        Y: Training labels passed to ``fit``.
        x: Held-out features, used as validation data during fitting and
            as the input for the final predictions.
        y: Held-out labels, used as validation data during fitting and as
            the reference for the accuracy computation.
        modelfile: Destination passed to ``model.save``.
        max_trials: Forwarded to ``ak.StructuredDataClassifier``.
        epochs: Forwarded to ``fit``.

    Returns:
        A ``(acc, model)`` tuple: the value returned by ``getaccuracy`` for
        the predictions on ``x`` against ``y``, and the exported model.
    """
    # Function-scope import kept from the original. The unused sklearn
    # imports were dropped so this function no longer requires sklearn.
    import autokeras as ak

    clf = ak.StructuredDataClassifier(overwrite=True, max_trials=max_trials)
    clf.fit(X, Y, validation_data=(x, y), epochs=epochs)
    model = clf.export_model()
    model.save(modelfile)

    akpred = clf.predict(x)

    acc = getaccuracy(akpred, y)

    return acc, model