Exemple #1
0
def test_unmarked_date_column():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset(
    )

    column_descriptions = {
        'airline_sentiment': 'output',
        'airline': 'categorical',
        'text': 'nlp',
        'tweet_location': 'categorical',
        'user_timezone': 'categorical'
        # tweet_created is our date column. We want to test that "forgetting" to mark it as a date column throws a user warning before throwing the error
        # , 'tweet_created': 'date'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    with warnings.catch_warnings(record=True) as w:

        try:
            ml_predictor.train(df_twitter_train)
        except TypeError as e:
            pass

        print('Here are the caught warnings:')
        print(w)

        assert len(w) == 1
Exemple #2
0
def test_nlp_multilabel_classification():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset(
    )

    column_descriptions = {
        'airline_sentiment': 'output',
        'airline': 'categorical',
        'text': 'nlp',
        'tweet_location': 'categorical',
        'user_timezone': 'categorical',
        'tweet_created': 'date'
    }

    ml_predictor = Predictor(type_of_estimator='classifier',
                             column_descriptions=column_descriptions)
    ml_predictor.train(df_twitter_train)

    test_score = ml_predictor.score(df_twitter_test,
                                    df_twitter_test.airline_sentiment,
                                    verbose=0)
    # Make sure our score is good, but not unreasonably good
    print('test_score')
    print(test_score)
    assert 0.67 < test_score < 0.79
def test_multilabel_classification_predict_on_Predictor_instance():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset()
    # Note that this does not take 'text' into account, intentionally (as that takes a while longer to train)
    ml_predictor = utils.train_basic_multilabel_classifier(df_twitter_train)

    predictions = ml_predictor.predict(df_twitter_test)
    test_score = accuracy_score(predictions, df_twitter_test.airline_sentiment)
    # Make sure our score is good, but not unreasonably good
    print('test_score')
    print(test_score)
    assert 0.72 < test_score < 0.77
def test_multilabel_classification_predict_on_Predictor_instance():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset()
    ml_predictor = utils.train_basic_multilabel_classifier(df_twitter_train)

    predictions = ml_predictor.predict(df_twitter_test)
    test_score = accuracy_score(predictions, df_twitter_test.airline_sentiment)
    # Right now we're getting a score of -.205
    # Make sure our score is good, but not unreasonably good
    print('test_score')
    print(test_score)
    assert 0.67 < test_score < 0.79
Exemple #5
0
def test_multilabel_classification():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset(
    )
    ml_predictor = utils.train_basic_multilabel_classifier(df_twitter_train)

    test_score = ml_predictor.score(df_twitter_test,
                                    df_twitter_test.airline_sentiment,
                                    verbose=0)
    # Right now we're getting a score of -.205
    # Make sure our score is good, but not unreasonably good
    print('test_score')
    print(test_score)
    assert 0.67 < test_score < 0.79
Exemple #6
0
def test_saving_trained_pipeline_multilabel_classification():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset(
    )
    ml_predictor = utils.train_basic_multilabel_classifier(df_twitter_train)

    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    test_score = saved_ml_pipeline.score(df_twitter_test,
                                         df_twitter_test.airline_sentiment)
    # Right now we're getting a score of -.205
    # Make sure our score is good, but not unreasonably good
    print('test_score')
    print(test_score)
    assert 0.67 < test_score < 0.79
Exemple #7
0
    def test_getting_single_predictions_nlp_date_multilabel_classification(
            model_name=None):
        # auto_ml does not support multilabel classification for deep learning at the moment
        if model_name == 'DeepLearningClassifier':
            return

        np.random.seed(0)

        df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset(
        )

        column_descriptions = {
            'airline_sentiment': 'output',
            'airline': 'categorical',
            'text': 'nlp',
            'tweet_location': 'categorical',
            'user_timezone': 'categorical',
            'tweet_created': 'date'
        }

        ml_predictor = Predictor(type_of_estimator='classifier',
                                 column_descriptions=column_descriptions)
        ml_predictor.train(df_twitter_train, model_names=model_name)

        file_name = ml_predictor.save(str(random.random()))

        # if model_name == 'DeepLearningClassifier':
        #     from auto_ml.utils_models import load_keras_model

        #     saved_ml_pipeline = load_keras_model(file_name)
        # else:
        #     with open(file_name, 'rb') as read_file:
        #         saved_ml_pipeline = dill.load(read_file)
        saved_ml_pipeline = load_ml_model(file_name)

        os.remove(file_name)
        try:
            keras_file_name = file_name[:-5] + '_keras_deep_learning_model.h5'
            os.remove(keras_file_name)
        except:
            pass

        df_twitter_test_dictionaries = df_twitter_test.to_dict('records')

        # 1. make sure the accuracy is the same

        predictions = []
        for row in df_twitter_test_dictionaries:
            predictions.append(saved_ml_pipeline.predict(row))

        print('predictions')
        print(predictions)

        first_score = accuracy_score(df_twitter_test.airline_sentiment,
                                     predictions)
        print('first_score')
        print(first_score)
        # Make sure our score is good, but not unreasonably good
        lower_bound = 0.73
        # if model_name == 'LGBMClassifier':
        #     lower_bound = 0.655
        assert lower_bound < first_score < 0.79

        # 2. make sure the speed is reasonable (do it a few extra times)
        data_length = len(df_twitter_test_dictionaries)
        start_time = datetime.datetime.now()
        for idx in range(1000):
            row_num = idx % data_length
            saved_ml_pipeline.predict(df_twitter_test_dictionaries[row_num])
        end_time = datetime.datetime.now()
        duration = end_time - start_time

        print('duration.total_seconds()')
        print(duration.total_seconds())

        # It's very difficult to set a benchmark for speed that will work across all machines.
        # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
        # That's about 1 millisecond per prediction
        # Assuming we might be running on a test box that's pretty weak, multiply by 3
        # Also make sure we're not running unreasonably quickly
        # time_upper_bound = 3
        # if model_name == 'XGBClassifier':
        #     time_upper_bound = 4
        assert 0.2 < duration.total_seconds() < 15

        # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

        predictions = []
        for row in df_twitter_test_dictionaries:
            predictions.append(saved_ml_pipeline.predict(row))

        print('predictions')
        print(predictions)
        print('df_twitter_test_dictionaries')
        print(df_twitter_test_dictionaries)
        second_score = accuracy_score(df_twitter_test.airline_sentiment,
                                      predictions)
        print('second_score')
        print(second_score)
        # Make sure our score is good, but not unreasonably good
        assert lower_bound < second_score < 0.79
Exemple #8
0
def test_getting_single_predictions_multilabel_classification_with_dates():
    np.random.seed(0)

    df_twitter_train, df_twitter_test = utils.get_twitter_sentiment_multilabel_classification_dataset(
    )
    ml_predictor = utils.train_basic_multilabel_classifier(df_twitter_train)
    file_name = ml_predictor.save(str(random.random()))

    with open(file_name, 'rb') as read_file:
        saved_ml_pipeline = dill.load(read_file)
    os.remove(file_name)

    df_twitter_test_dictionaries = df_twitter_test.to_dict('records')

    # 1. make sure the accuracy is the same

    predictions = []
    for row in df_twitter_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    print('predictions')
    print(predictions)

    first_score = accuracy_score(df_twitter_test.airline_sentiment,
                                 predictions)
    print('first_score')
    print(first_score)
    # Make sure our score is good, but not unreasonably good
    assert 0.67 < first_score < 0.79

    # 2. make sure the speed is reasonable (do it a few extra times)
    data_length = len(df_twitter_test_dictionaries)
    start_time = datetime.datetime.now()
    for idx in range(1000):
        row_num = idx % data_length
        saved_ml_pipeline.predict(df_twitter_test_dictionaries[row_num])
    end_time = datetime.datetime.now()
    duration = end_time - start_time

    print('duration.total_seconds()')
    print(duration.total_seconds())

    # It's very difficult to set a benchmark for speed that will work across all machines.
    # On my 2013 bottom of the line 15" MacBook Pro, this runs in about 0.8 seconds for 1000 predictions
    # That's about 1 millisecond per prediction
    # Assuming we might be running on a test box that's pretty weak, multiply by 3
    # Also make sure we're not running unreasonably quickly
    assert 0.2 < duration.total_seconds() < 3

    # 3. make sure we're not modifying the dictionaries (the score is the same after running a few experiments as it is the first time)

    predictions = []
    for row in df_twitter_test_dictionaries:
        predictions.append(saved_ml_pipeline.predict(row))

    print('predictions')
    print(predictions)
    print('df_twitter_test_dictionaries')
    print(df_twitter_test_dictionaries)
    second_score = accuracy_score(df_twitter_test.airline_sentiment,
                                  predictions)
    print('second_score')
    print(second_score)
    # Make sure our score is good, but not unreasonably good
    assert 0.67 < second_score < 0.79