def test_unmarked_categorical_column_throws_warning():
    """Training with an unmarked categorical column should emit exactly one warning.

    Prediction afterwards must still succeed despite the omission.
    """
    df_titanic_train, df_titanic_test = utils.get_titanic_binary_classification_dataset()

    # This is the column we are "forgetting" to mark as categorical
    # , 'sex': 'categorical'
    descriptions = {
        'survived': 'output',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    predictor = Predictor(type_of_estimator='classifier',
                          column_descriptions=descriptions)

    with warnings.catch_warnings(record=True) as recorded:
        predictor.train(df_titanic_train)
        print('we should be throwing a warning for the user to give them useful feedback on the unlabeled categorical column')
        assert len(recorded) == 1

    # We want to make sure the above does not throw an error
    predictor.predict(df_titanic_test)
    assert True
def test_unexpected_datetime_column_handled_without_errors():
    """Predicting a row that carries unseen datetime columns must not raise."""
    train_df, test_df = utils.get_titanic_binary_classification_dataset()

    descriptions = {
        'survived': 'output',
        'sex': 'categorical',
        'embarked': 'categorical',
        'pclass': 'categorical',
    }

    predictor = Predictor(type_of_estimator='classifier',
                          column_descriptions=descriptions)
    predictor.train(train_df)

    # Build a single-row dict and inject columns the model never saw.
    row = test_df.sample(frac=0.1).to_dict('records')[0]
    row['unexpected_column'] = datetime.date.today()
    row['anoter_unexpected_column'] = datetime.datetime.today()

    # We want to make sure the above does not throw an error
    predictor.predict(row)
    assert True
def process_auto_ml(X_train, X_test, y_train, df_types, m_type, seed, *args):
    """Train and test data using auto_ml.

    Args:
        X_train, X_test: pandas DataFrames of features (mutated in place:
            columns are renamed and a 'target' column is appended to X_train).
        y_train: training labels.
        df_types: DataFrame with 'NAME'/'TYPE' columns describing each feature.
        m_type: 'classification' or anything else for regression.
        seed, *args: unused here; kept for a uniform runner interface.

    Returns:
        Predicted probabilities for classification, predictions otherwise.
    """
    from auto_ml import Predictor

    # Convert column names to stringified positions to avoid column name
    # collisions (auto_ml bug).
    # BUG FIX: the original assigned the dict itself to `.columns`; pandas
    # builds the new index by iterating the dict, which yields its KEYS (the
    # original names), so no renaming happened.  Map every column through the
    # dict instead.
    names = {c: str(i) for i, c in enumerate(X_train.columns)}
    X_train.columns = [names[c] for c in X_train.columns]
    X_test.columns = [names[c] for c in X_test.columns]

    # Mark the target column, drop numerical ones (auto_ml infers those), and
    # re-key the remaining descriptions by the renamed columns.
    df_types.loc[df_types['NAME'] == 'target', 'TYPE'] = 'output'
    df_types = df_types[df_types['TYPE'] != 'numerical'].set_index('NAME')
    df_types = df_types.rename(index=names)['TYPE'].to_dict()
    X_train['target'] = y_train

    cmodels = ['AdaBoostClassifier', 'ExtraTreesClassifier',
               'RandomForestClassifier', 'XGBClassifier']
    rmodels = ['BayesianRidge', 'ElasticNet', 'Lasso', 'LassoLars',
               'LinearRegression', 'Perceptron', 'LogisticRegression',
               'AdaBoostRegressor', 'ExtraTreesRegressor',
               'PassiveAggressiveRegressor', 'RandomForestRegressor',
               'SGDRegressor', 'XGBRegressor']

    automl = Predictor(
        type_of_estimator='classifier' if m_type == 'classification' else 'regressor',
        column_descriptions=df_types)
    automl.train(
        X_train,
        model_names=cmodels if m_type == 'classification' else rmodels,
        scoring='f1_score' if m_type == 'classification' else 'mean_squared_error',
        cv=5,
        verbose=False)

    return (automl.predict_proba(X_test)
            if m_type == 'classification' else automl.predict(X_test))
class AutoLearn:
    # Loads a previously saved auto_ml pipeline from `file_name` and, when a
    # deep-learning step is present, swaps in the corresponding Keras model.

    def __init__(self):
        # self.trained_ml_pipeline = Sequential()
        # NOTE(review): `column_descriptions` and `file_name` are not defined
        # in this scope -- presumably module-level globals; confirm.
        self.trained_ml_pipeline = Predictor(
            type_of_estimator='regressor',
            column_descriptions=column_descriptions)
        with open(file_name, 'rb') as read_file:
            datastruct = dill.load(read_file)
        # NOTE(review): the dill-loaded pipeline is never assigned to
        # self.trained_ml_pipeline unless a DeepLearning step is found below;
        # verify that is the intended behavior for non-deep-learning saves.
        for step in datastruct.named_steps:
            pipeline_step = datastruct.named_steps[step]
            # Keras models cannot be dill-pickled, so auto_ml saves them to a
            # sidecar .h5 file; detect the step by its model_name prefix.
            if pipeline_step.get(
                    'model_name',
                    'reallylongnonsensicalstring')[:12] == 'DeepLearning':
                # file_name[:-5] strips the saved pipeline's extension
                # (assumes a 5-character suffix -- TODO confirm).
                keras_file_name = file_name[:-5] + pipeline_step.model + '_keras_deep_learning_model.h5'
                print(keras_file_name)
                self.trained_ml_pipeline = load_model(keras_file_name)

    def test_deepl(self, test_data):
        # Run the loaded pipeline/model on `test_data` and return predictions.
        Predictions = self.trained_ml_pipeline.predict(test_data)
        return Predictions

    def getsummary(self):
        # Delegates to the underlying object's summary() (meaningful for the
        # Keras model loaded above).
        return self.trained_ml_pipeline.summary()
# Train an XGBoost regressor for daysOnMarket and report test MAE.
df_train = df_train.dropna()
df_test_middle = df_test_middle.dropna()
df_test = df_test_middle.drop(columns='daysOnMarket')
df_test_label = df_test_middle['daysOnMarket']

# Mark every object-dtype column as categorical.
# BUG FIX: the original built a throwaway list of len(df_train.columns)
# identical 'categorical' strings and iterated it inside a nested dict
# comprehension -- O(n^2) work producing exactly this mapping.
column_description1 = {
    col: 'categorical'
    for col in df_train.columns
    if df_train[col].dtype == 'object'
}
column_description2 = {
    'daysOnMarket': 'output',
    'buildingTypeId': 'categorical',
}
print(column_description1)
# Merge the two dicts; column_description2 wins on key clashes.
column_descriptions = dict(column_description1, **column_description2)

# NOTE(review): auto_ml documents the lowercase 'regressor' spelling --
# confirm 'Regressor' is accepted by the installed version.
ml_predictor = Predictor(type_of_estimator='Regressor',
                         column_descriptions=column_descriptions)
ml_predictor.train(df_train, model_names='XGBRegressor')
# ml_predictor.score(df_test)
x = ml_predictor.predict(df_test)
print(mean_absolute_error(df_test_label, x))
# Train two auto_ml regressors from database tables: a reward model on
# `table` and an esteem model on the `sarsa` table.
# NOTE(review): train and test are both drawn with sample(frac=.5) and may
# overlap -- confirm that is intended.
df_train = table.sample(frac=.5)
df_test = table.sample(frac=.5)

rew_descriptions = {
    'acts1': 'output',
    'cur_action': 'categorical',
    'prev_action': 'categorical'
}
ml_predictor = Predictor(type_of_estimator='regressor',
                         column_descriptions=rew_descriptions)
# NOTE(review): a classifier model name is passed to a regressor predictor --
# verify 'DeepLearningClassifier' vs 'DeepLearningRegressor'.
ml_predictor.train(df_train,
                   model_names=['DeepLearningClassifier'],
                   ml_for_analytics=True)
# ml_predictor.score(df_test, df_test.acts1)
# Smoke-check prediction on the most recent row, then persist the pipeline.
ml_predictor.predict(table[-1:])
ml_predictor.save(file_name='reward.ml', verbose=True)

# Second model: esteem, read from the sarsa table over the open connection.
etable = pd.read_sql_query("SELECT * from sarsa", conn)
df_etrain = etable.sample(frac=.5)
df_etest = etable.sample(frac=.5)

esteem_descriptions = {
    'esteem': 'output',
    'cluster': 'categorical',
}
ml_predictor2 = Predictor(type_of_estimator='regressor',
                          column_descriptions=esteem_descriptions)
ml_predictor2.train(df_etrain, ml_for_analytics=True)
# 'bedrooms': 'categorical', # 'year': 'categorical', # 'month': 'categorical', } print(column_description1) # 合并两个字典 column_descriptions = dict(column_description1, **column_description2) ml_predictor = Predictor(type_of_estimator='Regressor', column_descriptions=column_descriptions) ml_predictor.train(df_train, model_names='XGBRegressor') # 预测预测数据 x = ml_predictor.predict(df_test) x_dataframe = pd.DataFrame(x, columns=['predictions']) merge_data = pd.concat((origin_data, x_dataframe), axis=1) merge_data_df = pd.DataFrame(merge_data) merge_data_df.to_csv('./merge_data_bak/merge_data_auto_ml.csv', index=False) print(x_dataframe.describe()) print(df_test_label.describe()) print(mean_absolute_error(df_test_label, x)) compute_ratio(merge_data_df) # compute_ratio2(merge_data_df) # 预测训练数据 train_prediction = ml_predictor.predict(df_train_prediction) train_dataframe = pd.DataFrame(train_prediction,
def _getMetrics(dataset, n_fold):
    """Benchmark auto_ml against a scikit-learn MLP on each named dataset.

    Args:
        dataset: mutable list of dataset names; consumed front-to-back
            (entries are deleted as they are processed).
        n_fold: number of repeated runs; each run contributes 1/n_fold to the
            reported average.

    Returns:
        Nested dict: {'mse': {name: {'auto_ml': .., 'mlp': ..}},
                      'r2':  {name: {'auto_ml': .., 'mlp': ..}}}.
    """
    results = {'mse': dict(), 'r2': dict()}
    # Get metrics for all datasets
    while (dataset):
        dataset_name = dataset[0]
        if dataset[0] == 'boston':
            # SPECIFIC FIELDS============================================
            # regression problem
            # 506 instances, 13 attributes (numeric/categorical)
            problem_type = 'regressor'
            output = 'MEDV'
            column_descriptions = {output: 'output', 'CHAS': 'categorical'}
            raw_data = load_boston()
            # ===========================================================
            df = pd.DataFrame(raw_data.data)
            df.columns = raw_data.feature_names
            df[output] = raw_data['target']
            train_set, test_set = model_selection.train_test_split(
                df, test_size=0.4, random_state=42)
            train_set_x = train_set.drop(output, axis=1)
        else:
            # Unknown dataset name: abort the whole run.
            exit()
        if problem_type == 'regressor':
            result = {
                'mse': {dataset_name: {'auto_ml': 0, 'mlp': 0}},
                'r2': {dataset_name: {'auto_ml': 0, 'mlp': 0}},
            }
        else:
            result = {}
        average_factor = 1.0 / n_fold
        for i in range(n_fold):
            # auto_ml run
            ml_predictor = Predictor(type_of_estimator=problem_type,
                                     column_descriptions=column_descriptions)
            ml_predictor.train(train_set, verbose=False,
                               compare_all_models=True)
            y_auto_ml_predicted = ml_predictor.predict(test_set)
            # Baseline MLP run
            if problem_type == 'regressor':
                train_set_y = np.asarray(train_set[output], dtype=np.float64)
                mlp = neural_network.MLPRegressor(
                    hidden_layer_sizes=(13, 13, 13), max_iter=1000)
                mlp.fit(train_set_x, train_set_y)
            else:
                train_set_y = np.asarray(train_set[output], dtype="|S6")
                mlp = neural_network.MLPClassifier(
                    hidden_layer_sizes=(13, 13, 13), max_iter=1000)
                mlp.fit(train_set_x, train_set_y)
            y_mlp_predicted = mlp.predict(test_set.drop(output, axis=1))
            # Accumulate the per-run contribution to the averages.
            if problem_type == 'regressor':
                result['mse'][dataset_name]['auto_ml'] += (
                    metrics.mean_squared_error(
                        test_set['MEDV'], y_auto_ml_predicted) * average_factor)
                result['r2'][dataset_name]['auto_ml'] += (
                    metrics.r2_score(
                        test_set['MEDV'], y_auto_ml_predicted) * average_factor)
                result['mse'][dataset_name]['mlp'] += (
                    metrics.mean_squared_error(
                        test_set['MEDV'], y_mlp_predicted) * average_factor)
                result['r2'][dataset_name]['mlp'] += (
                    metrics.r2_score(
                        test_set['MEDV'], y_mlp_predicted) * average_factor)
        # Take average
        # BUG FIX: dict.iteritems() was removed in Python 3; items() works on
        # both Python 2 and 3.
        for test, data in result.items():
            results[test][dataset_name] = data[dataset_name]
        del dataset[0]
    return results
try:
    # Reuse cached predictions if a previous run already wrote them out.
    last_output = pd.read_csv('cache_of_regression_output.csv')
    df_test = last_output
except Exception:
    # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt
    # and SystemExit; catch Exception so those still propagate.  The intent
    # is a cache miss (typically FileNotFoundError).
    # NOTE(review): this branch reads df_train and df_test -- both must be
    # defined earlier in the file; confirm.
    column_descriptions = {'lmp': 'output', 'time_utc': 'ignore'}
    ml_predictor = Predictor(type_of_estimator='regressor',
                             column_descriptions=column_descriptions)
    # ml_predictor.train(df_train, model_names=['DeepLearningRegressor'])
    ml_predictor.train(
        df_train)  # just use gradient-boosted regressor instead of tensorflow
    ml_predictor.score(df_test, df_test.lmp)
    predictions = ml_predictor.predict(df_test)
    df_test['PredictedLMP'] = predictions
    # Write the cache so the next run takes the fast path above.
    df_test.to_csv('cache_of_regression_output.csv',
                   columns=['time_utc', 'lmp', 'PredictedLMP'])

# trying to follow this here: https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn style defaults and set the default figure size
# sns.set(rc={'figure.figsize':(11, 4)})
# ax = df_test['PredictedLMP'].plot(linewidth=0.5, label='LMP')
# ax.set_ylabel('Actual vs. Predicted LMP')
# plt.show()