def cvtestNone(xtrain,labels,tid):
    #test DecisionTreeRegressor with depth = None
    # Cross-validates an unlimited-depth tree (min_samples_leaf=2) on column
    # `tid` of `labels` using 10 shuffle splits that each hold out 10% of the
    # rows, then prints the mean score, the per-split scores and the estimator
    # parameters. Nothing is returned.
    # NOTE(review): written for Python 2 (print statements) and for a legacy
    # `cross_validation` module (presumably an old scikit-learn) - confirm the
    # target environment before reuse.
    # `rng` is a module-level object not visible here; it is handed to both the
    # estimator and the splitter.
    cvresult=np.zeros(7);
    for i in range(1):
        clf_1 = DecisionTreeRegressor(max_depth=None,random_state=rng,min_samples_leaf=2)
        cv = cross_validation.ShuffleSplit(xtrain.shape[0], n_iter=10,test_size=0.1, random_state=rng)
        scores = cross_validation.cross_val_score(clf_1, xtrain, labels[:,tid],cv=cv,scoring='mean_squared_error')
        # i is 0 on the only iteration, so this writes index -1 (the last of
        # the 7 slots); cvresult is never read or returned afterwards.
        cvresult[i-1]=np.sum(scores)/10
        print tid,'depth=None',np.sum(scores)/10,scores
        print clf_1.get_params()
 def model_dec_tree(args, y):
     """Fit a default DecisionTreeRegressor on (args, y).

     Returns a 3-tuple: the estimator's ``score`` on the very data it was
     fitted on, its parameter dict, and ``None`` (placeholder for a dump).
     """
     tree = DecisionTreeRegressor().fit(args, y)
     return tree.score(args, y), tree.get_params(), None
Exemple #3
0
 def model_dec_tree(args, y):
     """Fit a default DecisionTreeRegressor and report how it scores.

     :param args: feature matrix passed to ``fit`` and ``score``
     :param y: target values
     :return: ``(res, params, dump)`` where ``res`` is ``dect.score`` evaluated
         on the training data itself, ``params`` is ``dect.get_params()`` and
         ``dump`` is always ``None``
     """
     # The former `alpha` / `l1_ratio` locals were never used and are removed.
     dect = DecisionTreeRegressor()
     dect.fit(args, y)
     # Scored on the same data used for fitting, so this is not a
     # generalisation estimate.
     res = dect.score(args, y)
     params = dect.get_params()
     dump = None  #pickle.dumps(dect.tree_)
     return res, params, dump
Exemple #4
0
def search_bestparam_DecisionTreeRegressor(X_train, y_train, df_search_best_param):
    """Run ``search_bestparam`` for a DecisionTreeRegressor over a fixed grid.

    Prints a banner and the estimator's supported parameters, then delegates
    to ``search_bestparam`` with the grid below and the given arguments.
    """
    print("Search best params for DecisionTreeRegressor ...")
    regressor = DecisionTreeRegressor()
    print("Supported params", regressor.get_params())
    grid = dict(
        criterion=["mse", "mae"],
        min_samples_split=[10, 20, 40],
        max_depth=[2, 6, 8],
        min_samples_leaf=[20, 40, 100],
        max_leaf_nodes=[5, 20, 100],
    )
    search_bestparam(regressor, grid, X_train, y_train, df_search_best_param)
Exemple #5
0
def DecisionTreeModel(X, y):
    '''
    Plot the learning curve of a depth-5 decision tree regressor, then fit the
    regressor on all of (X, y) and return it.

    Previously the function ended in ``pass`` and never fitted ``regressor``,
    so it returned None despite documenting a returned model.

    :param X: features; ``X.shape[0]`` is used as the number of samples
    :param y: outputs
    :return: the fitted DecisionTreeRegressor
    '''
    regressor = DecisionTreeRegressor(max_depth=5, random_state=0)

    # Nine training sizes from 1 up to just below the 80% used for training.
    train_sizes = np.linspace(1, X.shape[0] * 0.8 - 1, 9).astype(int)
    print("train_sizes: ", train_sizes)
    print("train_sizes fracs: ", train_sizes / float(X.shape[0]))

    cv = ShuffleSplit(n_splits=10, train_size=0.8, random_state=0)
    # NOTE(review): `curves` is defined outside this block; from the call
    # signature it is presumably sklearn's `learning_curve` - confirm.
    sizes, train_scores, test_scores = curves(regressor,
                                              X,
                                              y,
                                              cv=cv,
                                              train_sizes=train_sizes,
                                              scoring='r2')

    plt.figure()
    # Train curve first (blue band), then test curve (red band): mean line
    # with a one-standard-deviation band across the CV splits.
    for scores, band_color in ((train_scores, 'b'), (test_scores, 'r')):
        scores_mean = np.mean(scores, axis=1)
        scores_std = np.std(scores, axis=1)
        plt.plot(sizes, scores_mean, '-o')
        plt.fill_between(sizes,
                         scores_mean - scores_std,
                         scores_mean + scores_std,
                         alpha=0.15,
                         color=band_color)
    plt.xlabel('training size')
    plt.ylabel('r2 score')
    plt.show()

    # Fit the estimator itself so the caller receives a usable model.
    regressor.fit(X, y)
    print(regressor.get_params())

    return regressor
def dtr(days_train):
    """Train a DecisionTreeRegressor, score it on the validation split and log the run.

    The data comes from ``divideTrainValid(days_train)``. The validation MSE is
    printed, written to a dated result file, and appended together with the
    estimator parameters and timings to the ``dtr_result`` log file.

    Relies on the module-level names ``NUM_TYPES``, ``DATA_PATH_RESULT`` and
    ``title`` (not defined in this function).

    :param days_train: forwarded unchanged to ``divideTrainValid``
    :return: None
    """
    begin = datetime.datetime.now()
    # Switch for the grid-search path, which is currently commented out below.
    grid = False
    # train_x = read_coo_mtx(training_X_file)
    # train_y = np.loadtxt(open(training_Y_file), dtype=int)
    # test_x = read_coo_mtx(test_X_file)
    # test_y = np.loadtxt(open(test_Y_file), dtype=int)

    train_x, train_y, valid_x, valid_y = divideTrainValid(days_train)
    print( "Loading data completed.")
    print( "Read time: " + str(datetime.datetime.now() - begin))

    classifier = DecisionTreeRegressor(min_samples_leaf=NUM_TYPES)
    # if grid:
    #     param_grid = {'C': [1, 5, 10]}
    #     grid = GridSearchCV(estimator=classifier, scoring='roc_auc', param_grid=param_grid)
    #     grid.fit(train_x, train_y)
    #     print("Training completed.")
    #     print(grid.cv_results_)
    #     print(grid.best_estimator_)

    if not grid:
        classifier.fit(train_x, train_y)
        # cross_val_score(classifier, training_x, training_y, cv=10)
        # print("Cross validation completed.")
        # joblib.dump(classifier, "new_basic_lr" + ".pkl", compress=3)      # the 3 enables compression; this is the usual setting
        # classifier = joblib.load("new_basic_lr.pkl")

        y_pred = classifier.predict(valid_x)

        mse = metrics.mean_squared_error(valid_y, y_pred)
        print("mse: " + str(mse))

        end = datetime.datetime.now()
        day = datetime.date.today()
        # np.savetxt only accepts 1-D or 2-D input and the MSE is a scalar, so
        # wrap it in a 1-element array. Passing the path (rather than an open
        # handle) lets NumPy open and close the file itself.
        # NOTE(review): the file name says "pred" but the value written has
        # always been the MSE - confirm whether y_pred was intended.
        np.savetxt(DATA_PATH_RESULT+title+"_lr_pred_"+str(day), np.atleast_1d(mse), fmt='%.5f')

        rcd = str(end) + '\n'
        rcd += "lr: "+title+" 130" + '\n'
        rcd += str(classifier.get_params()) + '\n'
        rcd += "mse: " + str(mse) + '\n'
        rcd += "time: " + str(end - begin) + '\n' + '\n' + '\n'
        print( rcd)
        # Context manager guarantees the log is flushed and closed even if
        # the write fails.
        with open(DATA_PATH_RESULT+"dtr_result", "a") as log_file:
            log_file.write(rcd)
Exemple #7
0
def decision_tree(df, significant_cols, target, cat_cols, num_cols):
    """Grid-search a DecisionTreeRegressor and report CV and hold-out metrics.

    The rows of ``df[significant_cols]`` / ``df[target]`` are split 90/10.
    Categorical columns are one-hot encoded and numeric columns standardised,
    with both transformers fitted on the training part only. The best
    estimator found by a 3-fold grid search is then evaluated with 3-fold
    cross-validation on the training part and once on the held-out part.

    Returns the tuple ``(r2, rmse, r2_variance, rmse_variance, r2_validation,
    rmse_validation, params)``: CV means, CV sample variances, hold-out
    scores, and the best estimator's parameter dict.
    """
    features = df[significant_cols]
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.1, random_state=0)

    encoder = OneHotEncoder(drop='first', sparse=False)
    scaler = StandardScaler()
    # Fit the transformers on the training rows, then reuse them on the test rows.
    train_data = np.c_[encoder.fit_transform(X_train[cat_cols]),
                       scaler.fit_transform(X_train[num_cols])]
    test_data = np.c_[encoder.transform(X_test[cat_cols]),
                      scaler.transform(X_test[num_cols])]

    search_space = {
        'criterion': ['mse', 'friedman_mse', 'mae'],
        'max_features': [None, 'log2', 'sqrt'],
        'ccp_alpha': np.arange(0, 1.1, 0.1)
    }
    search = GridSearchCV(
        DecisionTreeRegressor(random_state=0, splitter='best'),
        search_space,
        scoring='r2',
        cv=3)
    search.fit(train_data, y_train)
    best_tree = search.best_estimator_

    def _cv_scores(metric):
        # 3-fold CV of the selected tree on the training part for one metric.
        return cross_val_score(best_tree,
                               train_data,
                               y_train,
                               scoring=metric,
                               cv=3,
                               n_jobs=-1)

    r2_cv_scores = _cv_scores('r2')
    rmse_cv_scores = _cv_scores('neg_root_mean_squared_error')
    best_params = best_tree.get_params()

    best_tree.fit(train_data, y_train)
    holdout_pred = best_tree.predict(test_data)

    return (
        np.mean(r2_cv_scores),
        np.abs(np.mean(rmse_cv_scores)),  # scorer is negated RMSE
        np.var(r2_cv_scores, ddof=1),
        np.abs(np.var(rmse_cv_scores, ddof=1)),
        r2_score(y_test, holdout_pred),
        np.sqrt(mean_squared_error(y_test, holdout_pred)),
        best_params,
    )
def test_model_finder_set_model_regression(model_finder_regression, seed):
    """set_model() must store the given regressor, its params and its scores.

    It must also leave the model unfitted, so predicting afterwards raises
    NotFittedError."""
    regressor = DecisionTreeRegressor(criterion="mae",
                                      max_depth=10,
                                      random_state=seed)
    expected_scores = {
        "mean_squared_error": 1323.0223273506956,
        "r2_score": -1.7265322117249737
    }

    finder = model_finder_regression
    finder.scoring_functions = [mean_squared_error, r2_score]
    finder.set_model(regressor)

    assert finder._chosen_model.clf.regressor == regressor
    assert finder._chosen_model_params == regressor.get_params()
    assert finder._chosen_model_scores == expected_scores
    assert type(finder._chosen_model) == WrappedModelRegression

    with pytest.raises(NotFittedError):
        finder.predict([1])
Exemple #9
0
# Decision tree regressor (max depth 2), fitted on the prepared data.
# `fires_prepared` / `fires_labels` are defined outside this fragment.
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(fires_prepared, fires_labels)

# Random forest regressor with default settings, fitted on the same data.
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(fires_prepared, fires_labels)

#2-1
# List the tunable parameter names of each estimator before building the
# search grids. `sgd_reg` and `svm_reg` are defined outside this fragment.
from sklearn.model_selection import GridSearchCV
print("sgd_reg.get_params().keys(): ", sgd_reg.get_params().keys())
print("svm_reg.get_params().keys(): ", svm_reg.get_params().keys())
print("tree_reg.get_params().keys(): ", tree_reg.get_params().keys())
print("forest_reg.get_params().keys(): ", forest_reg.get_params().keys())

# Two alternative alpha/epsilon sub-grids, presumably meant for `sgd_reg`
# (the consumer of this list is not visible here).
params_sgd = [
    {
        'alpha': [0.1, 0.5],
        'epsilon': [0.1, 1]
    },
    {
        'alpha': [0.5, 0.6],
        'epsilon': [0.1, 0.7]
    },
]

params_svm = {
    'kernel': ["linear", "poly", "rbf"],
Exemple #10
0
    #Cross validation is used to estimate the generalization performance
 #Why tune hyperparameters?
    #default hyperparameters are not optimal for all problems, for the best performance
    '''1)Grid Search
        2)Random Search
        3)Bayesian Search
        4)Genetic Search'''
#Grid Search Cross Validation
    #-manually set a grid of discrete hyperparameter values
    #-set a metric for scoring model performance
    #-for each set of hyperparameters, evaluate the model's CV (cross-validation) score; the set with the best CV score is the optimal one
    #-the bigger the grid, the longer it takes to find the solution
################################################
# Grid-search example for a decision tree classifier.
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
print(dt.get_params())
from sklearn.model_selection import GridSearchCV
params_dt = {'max_depth':[2,3,4] ,'min_samples_leaf':[0.12, 0.14, 0.16, 0.18]} # Grid of candidate hyperparameter values
# Two example configurations; the second assignment replaces the first, so
# only the roc_auc / 5-fold search is actually kept.
grid_dt = GridSearchCV(estimator=dt, param_grid=params_dt, scoring='accuracy',cv=10,n_jobs=1)
grid_dt = GridSearchCV(estimator=dt,param_grid=params_dt,scoring='roc_auc',cv=5, n_jobs=-1)

# NOTE(review): grid_dt is never fitted in this fragment, so reading
# best_estimator_ below will fail unless grid_dt.fit(...) runs first; the
# training data names are not visible here - confirm and add the fit call.
from sklearn.metrics import roc_auc_score
best_model = grid_dt.best_estimator_# Extract the best estimator
y_pred_proba = best_model.predict_proba(X_test)[:,1]# Second column of predict_proba, used as the positive-class score
test_roc_auc = roc_auc_score(y_test, y_pred_proba)# ROC AUC on the test set
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))# Report it with 3 decimals
################################################
#Tuning RF's Hyperparameter
    #Hyperparameter tuning is expensive and sometimes yields only a slight improvement
    #Weigh the impact of hyperparameter tuning before investing in it
    #
Exemple #11
0
# Sequential hold-out split: the first `train_percent` records go to the
# training lists, every later record goes to the test lists. Features are
# fields 10, 12 and 2 of a record; the target is field 20.
count = 0
for i in range(len(predictiveAttributeNotDegree)):
    _record = predictiveAttributeNotDegree[i]
    _features = [_record[10], _record[12], _record[2]]
    _label = [_record[20]]
    if count >= train_percent:
        test_set.append(_features)
        test_result.append(_label)
        continue
    count = count + 1
    train_set.append(_features)
    train_result.append(_label)

regressor = DecisionTreeRegressor(min_samples_leaf=10, random_state=0)
# 10-fold CV on the training data minus its first sample, then a fit on all of it.
print(cross_val_score(regressor, train_set[1:], train_result[1:], cv=10))
regressor.fit(train_set, train_result)
print(regressor.score(test_set, test_result))
# Record field names, in order (original abbreviations):
#              matr cf    2  3 tot cds tipoCds coorte annicarriera annodiploma votodip codschool tipoMat annolaur votolaur erasmus tesi mot_sta sta fc
# Single hand-written sample and its expected target, used as a smoke check.
newStudent = [[85, 11, 30]]
real_value = [1]
predicted = regressor.predict(newStudent)
print("Predicted: ", predicted)
print("MSE: ", mean_squared_error(real_value, predicted))
print("Params: ", regressor.get_params())
print("Feature Importance: ", regressor.feature_importances_)
        test_result.append([predictiveAttributeNotDegree[i][2]])

# Same regressor set-up as before: 10-fold CV on the training data minus its
# first sample, then a fit on the full training lists and a score on the
# test lists.
regressor = DecisionTreeRegressor(random_state=0, min_samples_leaf=10)
print(cross_val_score(regressor, train_set[1:], train_result[1:], cv=10))
regressor.fit(train_set, train_result)
print(regressor.score(test_set, test_result))
# Predict the test samples one at a time, using only their first two fields.
# NOTE(review): presumably train_set holds two features per sample in this
# section - confirm against the code that builds it.
prediction = []
for item in test_set:
    items = [[item[0], item[1]]]
    prediction.append(regressor.predict(items))
# Flatten the per-sample prediction arrays into one 1-D float array.
pred = np.zeros(len(prediction))
predi = np.array(prediction)
for i in range(len(prediction)):
    pred[i] = predi[i][0]
print(("MSE: {}".format(mean_squared_error(pred, test_result))))
print("Params: ", regressor.get_params())
print("Feature Importance: ", regressor.feature_importances_)

# Banner (Italian): "here begins the section with all the attributes".
print(
    "\n\n\n----------QUI INIZIA LA SEZIONE CON TUTTI GLI ATTRIBUTI---------- \n\n\n"
)
# Reload both record sets from JSON as pandas Series.
# NOTE(review): absolute, machine-specific paths - these only resolve on the
# original author's computer.
predictiveAttributeDegree = pd.read_json(
    "C:/Users/sebas/PycharmProjects/MachineLearning-Local/DSML/DecisionTree/predictiveDegree.txt",
    orient='records',
    dtype=True,
    typ="series")
predictiveAttributeNotDegree = pd.read_json(
    "C:/Users/sebas/PycharmProjects/MachineLearning-Local/DSML/DecisionTree/predictiveNotDegree.txt",
    orient='records',
    dtype=True,
    typ="series")
    y = shots[['RESULT']]

    # Split training examples from labels
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=22)
    logger.info("\nX_train:\n{}\n({})\n".format(X_train[:3], len(X_train)))
    logger.info("\nX_test:\n{}\n({})\n".format(X_test[:3], len(X_test)))
    logger.info("\ny_train:\n{}\n({})\n".format(y_train[:3], len(y_train)))
    logger.info("\ny_test:\n{}\n({})\n".format(y_test[:3], len(y_test)))

    # Fit decision tree regressor model
    dt = DecisionTreeRegressor(max_depth=4)
    dt.fit(X_train, y_train)
    dt_params = dt.get_params(deep=True)
    logger.info("\nDT Regressor Model Parameters\n\{}\n".format(dt_params))
    joblib.dump(dt, 'dt.pkl')
    logger.info("Persisted the DT Regressor model to dt.pkl...")
    y_pred_dt = dt.predict(X_test)

    # Fit Linear Regression model
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    lr_params = lr.coef_
    logger.info("\nLinear Regressor Model Parameters\n{}\n{}\n".format(
        params, lr_params))
    joblib.dump(lr, 'lr.pkl')
    logger.info("Persisted the Linear Regression model to lr.pkl...")
    y_pred_lr = lr.predict(X_test)
# y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)

# Load the tab-separated train/test feature and target tables and keep their
# raw NumPy values.
train_x_pd = pd.read_table('train_x.txt', delimiter='\t')
train_x = train_x_pd.values

train_y_pd = pd.read_table('train_y.txt', delimiter='\t')
train_y = train_y_pd.values

test_x_pd = pd.read_table('test_x.txt', delimiter='\t')
test_x = test_x_pd.values

test_y_pd = pd.read_table('test_y.txt', delimiter='\t')
test_y = test_y_pd.values

# Depth-limited tree: print the 10-fold cross_validate result and the
# estimator parameters.
regr_1 = DecisionTreeRegressor(max_depth=20)
print(cross_validate(regr_1, train_x, train_y, cv=10))
print(regr_1.get_params())

# Fit on the full training set and predict the test set.
regr_1.fit(train_x, train_y)

y_1 = regr_1.predict(test_x)

#print(y_1.shape, test_y.shape)
# Correlation matrix between the flattened predictions and flattened targets.
res = np.corrcoef(y_1.flatten(), test_y.flatten())
print(res)

#loss_values = regr_1.estimator.loss_curve_
#plt.plot(loss_values)
#plt.savefig('demo.pdf')
Exemple #15
0
import matplotlib.pyplot as plt

# Import the final cleaned dataframe (first CSV column is the index).
dataframe=pd.read_csv('dataframe210620.csv',index_col=0)

from sklearn.tree import DecisionTreeRegressor
from pprint import pprint
from sklearn.model_selection import train_test_split

# Features: everything except the identifier, the target and the year.
# `axis` must be passed by keyword: pandas 2.x rejects it as a positional
# argument to DataFrame.drop.
X_rf = dataframe.drop(['Patient ID','A1C (%)','Year'],axis=1)
# Target column.
y_rf = dataframe['A1C (%)']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, random_state = 0)

# Baseline tree with default hyperparameters, fitted on the training split.
clf_rf = DecisionTreeRegressor(random_state = 0).fit(X_train_rf, y_train_rf)
print('Parameters currently in use:\n')
pprint(clf_rf.get_params())

# Persist the training split for later use.
X_train_rf.to_csv('X_train.csv')
y_train_rf.to_csv('y_train.csv')

from sklearn.model_selection import RandomizedSearchCV


# Number of features to consider at every split - for regressor, none is good
max_features = None

# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
Exemple #16
0
#den_r2q = np.power(den_r2, 2)

#r2_manual = 1 - (num_r2q.sum()/den_r2q.sum())
#r2_uv = 1 - (v/u)
#print("R2 força bruta: ", round(r2_manual, 2))
#print("R2 força bruta UV: ", round(r2_uv, 8))

#%% ACCURACY br
# Print the mean of each collected metric, rounded to 4 decimals, followed by
# the parameters of the `dt_br` estimator. All metric containers are defined
# outside this fragment.
#print('Accuracy RF: ', round(np.average(scores_br_rf), 2), "%.")
print('Accuracy DT: ', round(np.mean(accuracy_dt), 4))
print('Accuracy R2: ', round(np.mean(accuracy_r2), 4))
print('Accuracy CV: ', round(np.mean(accuracy_cv), 4))
print('Accuracy MSE: ', round(np.mean(accuracy_mse), 4))
print('Accuracy MAE: ', round(np.mean(accuracy_mae), 4))
print('Accuracy MAPE: ', round(np.mean(accuracy_mape.mean()), 4))
print("Parameters: ", dt_br.get_params())

# Alias of the "br" importance values (same object, not a copy).
importance_fields_br_dt_t = importance_fields_br_dt

#%% VIMP
# List of (feature, importance) tuples for the decision tree, with each
# importance rounded to 8 decimals.
feature_importances_br_dt = \
[(feature, round(importance, 8)) \
 for feature, importance in zip(features_br_list_oh, importance_fields_br_dt)]

# Print out the feature and importances
# [print('Variable DT: {:20} Importance DT: {}'.format(*pair)) for pair in feature_importances_br_dt];

#%%# STORING THE VIMP VALUES
# NOTE(review): this reads `importance_fields_al_dt_t` ("al"), while only the
# "br" alias is assigned in this fragment - confirm it is defined elsewhere.
I01_AL_DT = importance_fields_al_dt_t[0:5]; I02_AL_DT = importance_fields_al_dt_t[5:11]; 
def _decision_tree_regression_train(
        table,
        feature_cols,
        label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='mse',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        presort=False,
        sample_weight=None,
        check_input=True,
        X_idx_sorted=None):
    """Train a DecisionTreeRegressor on ``table`` and build its model/report dict.

    Parameters
    ----------
    table : indexable by column name(s)
        Data holding both the feature columns and the label column.
    feature_cols : list of str
        Names of the feature columns.
    label_col : str
        Name of the target column.
    criterion ... presort
        Forwarded to ``DecisionTreeRegressor`` under the same names.
    sample_weight, check_input, X_idx_sorted
        Forwarded to ``DecisionTreeRegressor.fit`` under the same names.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` carries the fitted regressor, its
        fitted attributes and parameters, and the rendered report under
        ``'_repr_brtc_'``.

    Notes
    -----
    ``validate`` is called with the parameter checks before training; how it
    reports a failed check is defined outside this block.
    NOTE(review): ``min_impurity_split``, ``presort`` and ``X_idx_sorted``
    only exist in older scikit-learn releases - confirm the pinned version.
    """

    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))

    validate(*param_validation_check)

    # Pass everything by keyword: positional passing silently depends on the
    # estimator's parameter order and is rejected by newer scikit-learn.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols],
                  table[label_col],
                  sample_weight=sample_weight,
                  check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Rendering the tree is best-effort: any import or Graphviz failure falls
    # back to an explanatory message. `except Exception` (instead of a bare
    # except) keeps KeyboardInterrupt / SystemExit propagating.
    try:
        from sklearn.externals.six import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor,
                        out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report

    # Ascending order of importance, so the most important feature is drawn
    # at the top of the horizontal bar chart.
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             feature_importance[indices],
             color='b',
             align='center')
    # Label each bar with its importance value.
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v,
                 i,
                 " {:.2f}".format(v),
                 color='b',
                 va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
class model: #Main model class
    """
    Wrapper around a regressor covering data conversion, cross validation,
    training, testing, grid search and persistence.

    Attributes (all defined by __init__, None until populated)
    ----------
    model : the wrapped regressor
    scaler : scaler applied to the descriptors before fitting/predicting
    pca : optional PCA transform applied after scaling
    descrips : descriptor (column) names of the last DataFrame converted
    information : dict of run information filled by train_model / test_model
    """
    def __init__(self,modeltype,PCA=None,modelparams = None):
        """
        Creates the wrapped regressor and optionally loads a PCA/scaler pair
        Parameters
        ----------
        modeltype : string
            One of 'Linear', 'SVM', 'LinearSVM', 'SGD', 'MLP', 'KNN', 'Tree'
            or 'load' ('load' creates no regressor; use load_model afterwards)
        PCA : string
            File passed to load(); it must yield [pca, scaler] (defaults to None)
        modelparams : dict
            Parameters forwarded to the regressor's set_params (defaults to None)
        """
        self.information = {}
        # Define every attribute other methods read, so that save_model,
        # getDescriptors, etc. never fail with AttributeError just because a
        # particular construction path did not set them.
        self.model = None
        self.scaler = None
        self.descrips = None
        self.pca = None
        if PCA:
            fitters = load(PCA)
            self.pca = fitters[0]
            self.scaler = fitters[1]
        if modeltype == 'Linear': #Linear Regression
            self.model = LinearRegression(n_jobs=-1)
        elif modeltype == 'SVM': #Support Vector Machine
            self.model= svm.SVR(cache_size=750,C=200)
        elif modeltype == 'LinearSVM': #Linear SVM
            self.model = svm.LinearSVR()
        elif modeltype == 'SGD': #Stochastic Gradient Descent
            self.model = SGDRegressor()
        elif modeltype == 'MLP': #Multi-layer Perceptron
            self.model = MLPRegressor(learning_rate='adaptive',max_iter=1000)
        elif modeltype == 'KNN': #K Nearest Neighbour
            self.model = KNeighborsRegressor(n_neighbors=2,n_jobs=-1)
        elif modeltype == 'Tree': #Decision Tree
            self.model = DecisionTreeRegressor()
        elif modeltype == 'load': #Load a pre-existing model
            pass
        else: #Not supported
            print('Model type not recognised')
        if modelparams:
            self.model.set_params(**modelparams)

    def convert_arrays(self,datadf): #Convert pd dataframes to numpy ndarrays
        """
        Converts pd.Dataframe to np.ndarray
        Parameters
        ----------
        datadf : pd.Dataframe or sequence
            Dataframe of MP's and descriptors and SMILES, or an already
            converted [X, Y] pair (returned as-is)
        Returns
        -------
        X : np.ndarray
            Descriptor values array
        Y : np.ndarray
            MP values
        """
        if isinstance(datadf, pd.DataFrame):
            Y = datadf['MP'].to_numpy()
            X = datadf.drop(['SMILES','MP'],axis=1)
            # Remember the descriptor names of the frame that was converted.
            self.descrips = X.keys()
            X = X.to_numpy()
            #X,Y = shuffle(X,Y,random_state=None)
        else:
            X = datadf[0]
            Y = datadf[1]
        return X,Y

    def split_data(self,data,split):
        """
        Splits data into train and test data
        Parameters
        ----------
        data : np.array or pd.Dataframe
            Dataset to split (anything convert_arrays accepts)
        split : float between 0 and 1
            Proportion of dataset to create train data
        Returns
        -------
        Training Data : list
            Training data as a list of [X,Y] where X and Y are numpy arrays of the descriptor and MP values respectively
        Test Data : list
            Test data as a list of [X,Y] where X and Y are numpy arrays of the descriptor and MP values respectively
        """
        X,Y = self.convert_arrays(data)
        # train_test_split returns X_train, X_test, Y_train, Y_test.
        datas = train_test_split(X,Y,train_size=split)
        return [datas[0],datas[2]],[datas[1],datas[3]]


    def printinfo(self,X):
        """
        Prints info about the current model
        Parameters
        ----------
        X : numpy array
            Dataset descriptors array
        Returns
        -------
        Prints various information about the model and dataset
        Dictionary holding all this information
        """
        modelname = type(self.model).__name__
        print("Model: "+modelname)
        parameters = self.model.get_params()
        print("Model Parameters: "+str(parameters))
        if self.pca:
            print("PCA: True")
            PCA = True
        else:
            print("PCA: False")
            PCA = False
        samples = np.size(X,0)
        features = np.size(X,1)
        print("Dataset # of samples: "+str(samples))
        print("Dataset # of features: "+str(features))
        return {'Model Type':modelname,'Model Parameters':parameters,'Samples':samples,'Features':features,'PCA':PCA}

    def crossValidate(self,training_data,folds=5): #Cross Validate model
        """
        Performs cross validation using the current model
        Parameters
        ----------
        training_data : np.array or pd.Dataframe
            Dataset to perform cross validation on
        folds : integer
            Number of folds
        """
        X,Y = self.convert_arrays(training_data)
        if self.pca:
            X_scaled = self.scaler.transform(X)
            X = self.pca.transform(X_scaled)
        # The scaler is part of the pipeline so it is refitted inside each fold.
        modelPipeline = make_pipeline(preprocessing.StandardScaler(),self.model)
        kf = KFold(n_splits=folds,shuffle=True)
        cvScore = cross_val_score(modelPipeline,X,Y,scoring='neg_root_mean_squared_error',cv=kf,n_jobs=-1,verbose=1)
        print("CROSS VALIDATION")
        self.printinfo(X)
        # Scores come from the 'neg_root_mean_squared_error' scorer, i.e. they
        # are negated RMSE values.
        print("Cross validated score (RMSE): "+str(cvScore))
        print("Mean = "+str(statistics.mean(cvScore))+"\n")


    def train_model(self,training_data): #Train model on inputted dataset
        """
        Trains model on inputted dataset
        Parameters
        ----------
        training_data : np.array or pd.Dataframe
            Data to train the model on
        """
        X,Y = self.convert_arrays(training_data)
        if self.pca:
            X_scaled = self.scaler.transform(X)
            X = self.pca.transform(X_scaled)

        else:
            # No preloaded scaler: fit one on the training descriptors.
            self.scaler = preprocessing.StandardScaler().fit(X)
            X = self.scaler.transform(X)
        self.model.fit(X,Y)
        print("TRAINING")
        self.information['Training'] = self.printinfo(X)
        print("R^2 Score = "+str(self.model.score(X, Y)))
        predicted = self.model.predict(X)
        RMSE = mean_squared_error(Y, predicted,squared=False)
        print("RMSE = "+str(RMSE)+"\n")
        self.information['Training']['RMSE'] = RMSE

    def save_model(self,filepath): #Input filepath with filename included
        """
        Saves the model to a .joblib file
        Parameters
        ----------
        filepath : string
            The filepath to save the model to
        """
        #full_model = [self.model,self.scaler,self.descrips,self.pca]
        full_model = {'model':self.model,'scaler':self.scaler,'descriptors':self.descrips,'PCA':self.pca,'information':self.information}
        dump(full_model, filepath+'.joblib') #File extension is added automatically


    def load_model(self,file): #Load saved model
        """
        Loads a model from a .joblib file
        Parameters
        ----------
        file : string
            Filepath to load model from
        """
        # NOTE(review): `load` deserialises the file (presumably via joblib);
        # only load files from trusted sources.
        models = load(file)
        self.model = models['model']
        self.scaler = models['scaler']
        self.descrips = models['descriptors']
        self.pca = models['PCA']
        self.information = models['information']

    def test_model(self,test_data): #Test model on test_data and return RMSE
        """
        Tests model on inputted dataset and returns the predicted values
        Parameters
        ----------
        test_data : np.array or pd.Dataframe
            Dataset to test the model on
        Returns
        -------
        Y : np.array
            Actual MP values
        predicted : np.array
            Predicted MP values
        """
        X,Y = self.convert_arrays(test_data)
        if self.pca:
            X_scaled = self.scaler.transform(X)
            X = self.pca.transform(X_scaled)
        else:
            X = self.scaler.transform(X)
        predicted = self.model.predict(X)
        print("TESTING")
        self.information['Testing'] = self.printinfo(X)
        print("R^2 = "+str(r2_score(Y,predicted)))
        RMSE = mean_squared_error(Y, predicted,squared=False)
        print("RMSE = "+str(RMSE)+"\n")
        self.information['Testing']['RMSE'] = RMSE
        return Y,predicted


    def gridsearch(self,test_data,params,save=None,graph=False): #Perform a gridsearch on test_data using params
        """
        Performs a cross validated gridsearch on a dataset with selected parameters
        Parameters
        ----------
        test_data : np.array or pd.Dataframe
            Dataset to use for the gridsearch
        params : dict
            Dictionary of parameter values to test; keys use the pipeline
            form '<step>__<parameter>'
        save : string
            Filepath to save results to (defaults to None if not inputted)
        graph : boolean
            If true, creates graph of results (only works when one parameter is being varied)
        Returns
        -------
        Creates graph if graph = True
        Saves .csv of results if a save filepath is given
        """
        modelPipeline = make_pipeline(preprocessing.StandardScaler(),self.model)
        #print(modelPipeline.get_params().keys())
        gridcv = GridSearchCV(modelPipeline,param_grid=params,n_jobs=-1,scoring='neg_root_mean_squared_error',verbose=1)

        X,Y = self.convert_arrays(test_data)
        if self.pca:
            X_scaled = self.scaler.transform(X)
            X = self.pca.transform(X_scaled)
        gridcv.fit(X,Y)
        print("GRIDSEARCH")
        self.printinfo(X)
        print("Best Parameter : "+str(gridcv.cv_results_['params'][gridcv.best_index_]))
        # mean_test_score uses the negated-RMSE scorer, so this value is <= 0.
        print("RMSE: "+str(gridcv.cv_results_['mean_test_score'][gridcv.best_index_]))
        if graph:
            # Only meaningful for a single varied parameter: after the loop,
            # `param`, `variable` and `x_axis` refer to the last key of params.
            for param in params.keys():
                variable = param.split('__')[1]
                x_axis = (gridcv.cv_results_["param_"+param]).filled().astype(np.float64)
            y_axis = gridcv.cv_results_["mean_test_score"]
            std = gridcv.cv_results_["std_test_score"]
            sns.lineplot(x="param_"+param,y="mean_test_score",data=gridcv.cv_results_,color = 'red')
            plt.title("Gridsearch on "+type(self.model).__name__)
            plt.xlabel(variable)
            plt.ylabel("Negative RMSE /°C")

            # One-standard-deviation band around the mean test score.
            plt.fill_between(x= x_axis,y1 = y_axis-std,y2 = y_axis+std,alpha=0.2,color= 'red')
            plt.show()
        if save: #Input filepath to save to if wanted
            pd.DataFrame.from_dict(gridcv.cv_results_, orient="index").to_csv(save+'.csv')

    def predictSingle(self,mol): #Return the predicted MP of single mol
        """
        Predicts the MP of a single molecule
        Parameters
        ----------
        mol : array
            Descriptor values for molecule
        Returns
        -------
        prediction : array
            Contains predicted MP of inputted molecule
        """
        if self.pca:
            X_scaled = self.scaler.transform(mol)
            X = self.pca.transform(X_scaled)
        else:
            X = self.scaler.transform(mol)
        prediction = self.model.predict(X)
        return prediction

    def getDescriptors(self): #Returns descriptors used in model
        """
        Returns the descriptors of the model (None if no DataFrame has been
        converted and no model has been loaded yet)
        """
        return self.descrips