def test_unmarked_categorical_column_throws_warning():
    """Train with 'sex' left undescribed and check one warning is recorded.

    The Titanic column descriptions deliberately omit the 'sex' column.
    Training runs inside ``warnings.catch_warnings(record=True)`` and the
    test asserts that exactly one warning was captured. Prediction on the
    test frame must then complete without raising.
    """
    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    # 'sex' is intentionally missing here: it is the column we are
    # "forgetting" to mark as categorical.
    column_descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions=column_descriptions,
    )

    with warnings.catch_warnings(record=True) as recorded:
        predictor.train(train_df)
        print(
            'we should be throwing a warning for the user to give them useful feedback on the unlabeled categorical column'
        )
        warning_count = len(recorded)
        assert warning_count == 1

    # Reaching the end of the test without an exception is the check here.
    predictor.predict(test_df)
def test_unexpected_datetime_column_handled_without_errors():
    """Predict on one row dict that carries two extra date-typed entries.

    A classifier is trained on the Titanic training frame, then a single
    record taken from a 10% sample of the test frame is extended with a
    ``datetime.date`` value and a ``datetime.datetime`` value under keys
    that are not in the column descriptions. The test passes if
    ``predict`` completes without raising.
    """
    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    predictor = Predictor(
        type_of_estimator='classifier',
        column_descriptions={
            'survived': 'output',
            'sex': 'categorical',
            'embarked': 'categorical',
            'pclass': 'categorical',
        },
    )
    predictor.train(train_df)

    sampled_rows = test_df.sample(frac=0.1)
    row = sampled_rows.to_dict('records')[0]

    # Extra entries: one date, one datetime (key spelling kept as-is).
    row['unexpected_column'] = datetime.date.today()
    row['anoter_unexpected_column'] = datetime.datetime.today()

    # Reaching the end of the test without an exception is the check here.
    predictor.predict(row)
def process_auto_ml(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train an auto_ml ``Predictor`` and return its predictions for ``X_test``.

    Feature columns of both frames are renamed to their positional index
    ("0", "1", ...) and the same old-name -> index mapping is applied to
    the names in ``df_types``, so the column descriptions handed to auto_ml
    refer to the renamed columns. The caller's objects are not modified:
    the function works on copies.

    Args:
        X_train: DataFrame of training features.
        X_test: DataFrame of test features. Assumed to have the same columns
            in the same order as ``X_train`` -- TODO confirm against callers.
        y_train: Target values; attached to the training copy as the
            'target' column.
        df_types: DataFrame with 'NAME' and 'TYPE' columns. Rows whose TYPE
            is 'numerical' are left out of the column descriptions; the row
            named 'target' is marked as 'output'.
        m_type: 'classification' selects a classifier; any other value
            selects a regressor.
        seed: Unused; kept for interface compatibility.
        *args: Unused; kept for interface compatibility.

    Returns:
        ``predict_proba(X_test)`` for classification, ``predict(X_test)``
        otherwise.
    """

    from auto_ml import Predictor

    is_classification = m_type == 'classification'

    # Convert column names to numbers to avoid column name collisions.
    # The positional list (not the mapping dict) is assigned to `.columns`:
    # assigning the dict makes pandas use its keys, i.e. the old names, so
    # the frames would never be renamed while df_types would be.
    positional_names = [str(i) for i in range(len(X_train.columns))]
    names = {c: str(i) for i, c in enumerate(X_train.columns)}

    # Work on copies so the caller's frames keep their column names and do
    # not gain a 'target' column (which would leak into a later call).
    X_train = X_train.copy()
    X_train.columns = positional_names
    X_test = X_test.copy()
    X_test.columns = positional_names

    df_types = df_types.copy()
    df_types.loc[df_types['NAME'] == 'target', 'TYPE'] = 'output'
    described = df_types[df_types['TYPE'] != 'numerical'].set_index('NAME')
    column_descriptions = described.rename(index=names)['TYPE'].to_dict()
    X_train['target'] = y_train

    cmodels = ['AdaBoostClassifier', 'ExtraTreesClassifier',
               'RandomForestClassifier', 'XGBClassifier']
    # NOTE(review): 'Perceptron' and 'LogisticRegression' look like
    # classifier names in a regressor list -- confirm this is intended.
    rmodels = ['BayesianRidge', 'ElasticNet', 'Lasso', 'LassoLars',
               'LinearRegression', 'Perceptron', 'LogisticRegression',
               'AdaBoostRegressor', 'ExtraTreesRegressor',
               'PassiveAggressiveRegressor', 'RandomForestRegressor',
               'SGDRegressor', 'XGBRegressor']

    if is_classification:
        estimator_type, model_names, scoring = 'classifier', cmodels, 'f1_score'
    else:
        estimator_type, model_names, scoring = (
            'regressor', rmodels, 'mean_squared_error')

    automl = Predictor(type_of_estimator=estimator_type,
                       column_descriptions=column_descriptions)
    automl.train(X_train, model_names=model_names, scoring=scoring,
                 cv=5, verbose=False)

    if is_classification:
        return automl.predict_proba(X_test)
    return automl.predict(X_test)
Example #4
0
class AutoLearn:
    """Holds a trained pipeline and exposes prediction and summary helpers.

    Construction reads the module-level names ``column_descriptions`` and
    ``file_name``: it starts from a fresh regressor ``Predictor``, loads the
    object stored at ``file_name`` with dill, and, for every named step
    whose ``model_name`` starts with 'DeepLearning', replaces the held
    pipeline with the Keras model loaded from the matching ``.h5`` file.
    """

    def __init__(self):
        self.trained_ml_pipeline = Predictor(
            column_descriptions=column_descriptions,
            type_of_estimator='regressor')

        # NOTE(review): dill.load can execute arbitrary code contained in
        # the file -- confirm file_name always points at a trusted file.
        with open(file_name, 'rb') as saved_file:
            saved_pipeline = dill.load(saved_file)

        steps = saved_pipeline.named_steps
        for step_name in steps:
            step = steps[step_name]
            model_name = step.get('model_name', 'reallylongnonsensicalstring')
            # Only steps whose model name begins with 'DeepLearning'
            # (12 characters) have a separate Keras file to load.
            if model_name[:12] != 'DeepLearning':
                continue
            # The last five characters of file_name are dropped before the
            # step's model attribute and the fixed suffix are appended.
            keras_file_name = (file_name[:-5] + step.model
                               + '_keras_deep_learning_model.h5')
            print(keras_file_name)
            self.trained_ml_pipeline = load_model(keras_file_name)

    def test_deepl(self, test_data):
        """Return the held pipeline's predictions for ``test_data``."""
        return self.trained_ml_pipeline.predict(test_data)

    def getsummary(self):
        """Return the result of calling ``summary()`` on the held pipeline."""
        pipeline = self.trained_ml_pipeline
        return pipeline.summary()
Example #5
0
    df_train = df_train.dropna()
    df_test_middle = df_test_middle.dropna()

    df_test = df_test_middle.drop(columns='daysOnMarket')
    df_test_label = df_test_middle['daysOnMarket']

    value_list = []
    for i in range(len(df_train.columns)):
        value_list.append('categorical')

    column_description1 = {
        key: value
        for key in df_train.columns for value in value_list
        if df_train[key].dtype == 'object'
    }
    column_description2 = {
        'daysOnMarket': 'output',
        'buildingTypeId': 'categorical',
    }

    print(column_description1)
    column_descriptions = dict(column_description1, **column_description2)

    ml_predictor = Predictor(type_of_estimator='Regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_train, model_names='XGBRegressor')

    # ml_predictor.score(df_test)
    x = ml_predictor.predict(df_test)
    print(mean_absolute_error(df_test_label, x))
Example #6
0
# Train a reward model on `table` and an esteem model on the `sarsa` SQL
# table, each on a random half of its rows.

# Random 50% of the rows for training; the remaining rows form the test set.
# Drawing a second independent sample would let test rows overlap the
# training rows. Assumes the index labels of `table` are unique -- TODO
# confirm where `table` is built.
df_train = table.sample(frac=.5)
df_test = table.drop(df_train.index)

rew_descriptions = {
    'acts1': 'output',
    'cur_action': 'categorical',
    'prev_action': 'categorical'
}

ml_predictor = Predictor(type_of_estimator='regressor',
                         column_descriptions=rew_descriptions)
# NOTE(review): the estimator type is 'regressor' but the requested model is
# 'DeepLearningClassifier' -- confirm which of the two is intended.
ml_predictor.train(df_train,
                   model_names=['DeepLearningClassifier'],
                   ml_for_analytics=True)
# ml_predictor.score(df_test, df_test.acts1)
# Predict on the last row of the full table (result is not stored).
ml_predictor.predict(table[-1:])
ml_predictor.save(file_name='reward.ml', verbose=True)

etable = pd.read_sql_query("SELECT * from sarsa", conn)

# Same split scheme as above: random half for training, the rest for testing.
df_etrain = etable.sample(frac=.5)
df_etest = etable.drop(df_etrain.index)

esteem_descriptions = {
    'esteem': 'output',
    'cluster': 'categorical',
}

ml_predictor2 = Predictor(type_of_estimator='regressor',
                          column_descriptions=esteem_descriptions)
ml_predictor2.train(df_etrain, ml_for_analytics=True)
Example #7
0
        # 'bedrooms': 'categorical',
        # 'year': 'categorical',
        # 'month': 'categorical',
    }

    print(column_description1)
    # Merge the two dictionaries
    column_descriptions = dict(column_description1, **column_description2)

    ml_predictor = Predictor(type_of_estimator='Regressor',
                             column_descriptions=column_descriptions)

    ml_predictor.train(df_train, model_names='XGBRegressor')

    # Predict on the prediction (test) data
    x = ml_predictor.predict(df_test)
    x_dataframe = pd.DataFrame(x, columns=['predictions'])
    merge_data = pd.concat((origin_data, x_dataframe), axis=1)
    merge_data_df = pd.DataFrame(merge_data)
    merge_data_df.to_csv('./merge_data_bak/merge_data_auto_ml.csv',
                         index=False)
    print(x_dataframe.describe())
    print(df_test_label.describe())

    print(mean_absolute_error(df_test_label, x))
    compute_ratio(merge_data_df)
    # compute_ratio2(merge_data_df)

    # Predict on the training data
    train_prediction = ml_predictor.predict(df_train_prediction)
    train_dataframe = pd.DataFrame(train_prediction,
Example #8
0
def _getMetrics(dataset, n_fold):
    """Compare auto_ml against an sklearn MLP on each named dataset.

    For every dataset name the data is loaded and split once (60% train,
    40% test, fixed random_state). Both models are then trained and
    evaluated ``n_fold`` times on that same split and the per-round
    metrics are averaged.

    Args:
        dataset: Sequence of dataset names. Only 'boston' is supported.
            The sequence is not modified.
        n_fold: Number of train/evaluate rounds to average over.

    Returns:
        ``{'mse': {name: {'auto_ml': x, 'mlp': y}}, 'r2': {name: {...}}}``
        with one entry per processed regression dataset.

    Raises:
        SystemExit: If a dataset name is not supported.
    """
    results = {'mse': dict(), 'r2': dict()}
    # Iterate over a snapshot so the caller's list is left untouched; a
    # falsy argument (empty list, None) yields the empty result.
    for dataset_name in list(dataset or ()):
        if dataset_name == 'boston':
            # Dataset-specific fields.
            # Regression problem: 506 instances, 13 attributes
            # (numeric/categorical).
            problem_type = 'regressor'
            output = 'MEDV'
            column_descriptions = {output: 'output', 'CHAS': 'categorical'}
            # NOTE(review): load_boston was removed in scikit-learn 1.2;
            # this branch needs an older scikit-learn.
            raw_data = load_boston()

            df = pd.DataFrame(raw_data.data)
            df.columns = raw_data.feature_names
            df[output] = raw_data['target']
            train_set, test_set = model_selection.train_test_split(
                df, test_size=0.4, random_state=42)
            train_set_x = train_set.drop(output, axis=1)
        else:
            # Same exception type as the bare exit() used before, but with
            # a message and a non-zero exit status.
            raise SystemExit('Unsupported dataset: %r' % (dataset_name,))

        is_regressor = problem_type == 'regressor'

        # Loop-invariant inputs for the MLP and for scoring.
        test_set_x = test_set.drop(output, axis=1)
        y_true = test_set[output]
        if is_regressor:
            train_set_y = np.asarray(train_set[output], dtype=np.float64)
            result = {
                metric: {dataset_name: {'auto_ml': 0, 'mlp': 0}}
                for metric in ('mse', 'r2')
            }
        else:
            train_set_y = np.asarray(train_set[output], dtype="|S6")
            result = {}

        average_factor = 1.0 / n_fold
        for _ in range(n_fold):
            ml_predictor = Predictor(type_of_estimator=problem_type,
                                     column_descriptions=column_descriptions)
            ml_predictor.train(train_set,
                               verbose=False,
                               compare_all_models=True)
            y_auto_ml_predicted = ml_predictor.predict(test_set)

            if is_regressor:
                mlp = neural_network.MLPRegressor(
                    hidden_layer_sizes=(13, 13, 13), max_iter=1000)
            else:
                mlp = neural_network.MLPClassifier(
                    hidden_layer_sizes=(13, 13, 13), max_iter=1000)
            mlp.fit(train_set_x, train_set_y)
            y_mlp_predicted = mlp.predict(test_set_x)

            if is_regressor:
                predictions = (('auto_ml', y_auto_ml_predicted),
                               ('mlp', y_mlp_predicted))
                for model, predicted in predictions:
                    # Each round contributes 1/n_fold of its score, so the
                    # totals end up as averages.
                    result['mse'][dataset_name][model] += (
                        metrics.mean_squared_error(y_true, predicted) *
                        average_factor)
                    result['r2'][dataset_name][model] += (
                        metrics.r2_score(y_true, predicted) *
                        average_factor)

        # Copy this dataset's averaged scores into the overall result.
        # items() works on both Python 2 and 3, unlike iteritems().
        for test, data in result.items():
            results[test][dataset_name] = data[dataset_name]

    return results
Example #9
0
# Reuse the cached predictions when the cache file can be read; otherwise
# train a regressor, predict on df_test and write the cache file.
try:
    last_output = pd.read_csv('cache_of_regression_output.csv')
except (IOError, OSError, ValueError):
    # Only a missing/unreadable cache file (IOError/OSError) or an empty or
    # unparsable one (pandas' parsing errors derive from ValueError)
    # triggers retraining. Anything else, including KeyboardInterrupt, now
    # propagates instead of being swallowed.
    column_descriptions = {'lmp': 'output', 'time_utc': 'ignore'}

    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)
    # ml_predictor.train(df_train, model_names=['DeepLearningRegressor'])
    ml_predictor.train(
        df_train)  # just use gradient-boosted regressor instead of tensorflow

    ml_predictor.score(df_test, df_test.lmp)

    predictions = ml_predictor.predict(df_test)
    df_test['PredictedLMP'] = predictions

    df_test.to_csv('cache_of_regression_output.csv',
                   columns=['time_utc', 'lmp', 'PredictedLMP'])
else:
    # Cache hit: the cached frame replaces df_test.
    df_test = last_output

# trying to follow this here: https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn style defaults and set the default figure size

# sns.set(rc={'figure.figsize':(11, 4)})
# ax = df_test['PredictedLMP'].plot(linewidth=0.5, label='LMP')
# ax.set_ylabel('Actual vs. Predicted LMP')
# plt.show()