def cvtestNone(xtrain, labels, tid):
    """Cross-validate a DecisionTreeRegressor with ``max_depth=None``.

    Builds a tree with unlimited depth and ``min_samples_leaf=2``, evaluates
    it with a 10-iteration shuffle split (10% of the rows held out each
    time) on column ``tid`` of ``labels``, and prints the averaged score,
    the individual scores and the estimator parameters.

    :param xtrain: feature matrix; ``xtrain.shape[0]`` is the sample count
    :param labels: 2-D target array, indexed as ``labels[:, tid]``
    :param tid: index of the target column to evaluate
    :return: None, the results are only printed
    """
    n_splits = 10
    clf_1 = DecisionTreeRegressor(max_depth=None, random_state=rng,
                                  min_samples_leaf=2)
    cv = cross_validation.ShuffleSplit(xtrain.shape[0], n_iter=n_splits,
                                       test_size=0.1, random_state=rng)
    scores = cross_validation.cross_val_score(
        clf_1, xtrain, labels[:, tid], cv=cv, scoring='mean_squared_error')
    mean_score = np.sum(scores) / n_splits
    # Single-argument print() produces the same text under Python 2 and 3.
    print('{} depth=None {} {}'.format(tid, mean_score, scores))
    print(clf_1.get_params())
def model_dec_tree(args, y):
    """Fit a default DecisionTreeRegressor and score it on its own training data.

    :param args: feature matrix handed to ``fit`` and ``score``
    :param y: target values
    :return: ``(score, params, dump)`` where ``score`` comes from
        ``score(args, y)``, ``params`` from ``get_params()`` and ``dump``
        is always ``None``
    """
    tree = DecisionTreeRegressor()
    tree.fit(args, y)
    training_score = tree.score(args, y)
    return training_score, tree.get_params(), None
def model_dec_tree(args, y):
    """Fit a default DecisionTreeRegressor and score it on its own training data.

    :param args: feature matrix handed to ``fit`` and ``score``
    :param y: target values
    :return: ``(res, params, dump)`` where ``res`` is ``score(args, y)``,
        ``params`` is ``get_params()`` and ``dump`` is always ``None``
    """
    dect = DecisionTreeRegressor()
    dect.fit(args, y)
    res = dect.score(args, y)
    params = dect.get_params()
    # Serialising the fitted tree is disabled; the slot is kept so the
    # return value stays a 3-tuple.
    dump = None
    return res, params, dump
def search_bestparam_DecisionTreeRegressor(X_train, y_train, df_search_best_param):
    """Run ``search_bestparam`` for a DecisionTreeRegressor over a fixed grid.

    Prints the parameters the estimator supports, then delegates to
    ``search_bestparam`` with the estimator, the grid, the training data
    and ``df_search_best_param``.
    """
    print("Search best params for DecisionTreeRegressor ...")
    estimator = DecisionTreeRegressor()
    print("Supported params", estimator.get_params())

    grid = dict(
        criterion=["mse", "mae"],
        min_samples_split=[10, 20, 40],
        max_depth=[2, 6, 8],
        min_samples_leaf=[20, 40, 100],
        max_leaf_nodes=[5, 20, 100],
    )
    search_bestparam(estimator, grid, X_train, y_train, df_search_best_param)
def DecisionTreeModel(X, y):
    '''
    Defines a decision tree model and fits features X to outputs y.
    In the process plots the learning curve (r2 score against training size,
    mean +/- one standard deviation over the shuffle splits).

    :param X: features
    :param y: outputs
    :return: the fitted regression model
    '''
    n = 5
    regressor = DecisionTreeRegressor(max_depth=n, random_state=0)

    # Nine training sizes, from a single sample up to just under 80% of the rows.
    train_sizes = np.linspace(1, X.shape[0] * 0.8 - 1, 9).astype(int)
    print("train_sizes: ", train_sizes)
    print("train_sizes fracs: ", train_sizes / float(X.shape[0]))

    cv = ShuffleSplit(n_splits=10, train_size=0.8, random_state=0)
    sizes, train_scores, test_scores = curves(regressor, X, y, cv=cv,
                                              train_sizes=train_sizes,
                                              scoring='r2')

    plt.figure()
    # Training curve first (blue band), then the test curve (red band).
    for scores, band_color in ((train_scores, 'b'), (test_scores, 'r')):
        scores_mean = np.mean(scores, axis=1)
        scores_std = np.std(scores, axis=1)
        plt.plot(sizes, scores_mean, '-o')
        plt.fill_between(sizes, scores_mean - scores_std,
                         scores_mean + scores_std, alpha=0.15,
                         color=band_color)
    plt.xlabel('training size')
    plt.ylabel('r2 score')
    plt.show()

    # The function previously ended in ``pass`` without fitting, so the
    # documented contract (fit X to y and return the model) was never met.
    regressor.fit(X, y)
    print(regressor.get_params())
    return regressor
def dtr(days_train):
    """Train a DecisionTreeRegressor, evaluate it and log the run.

    Splits the data with ``divideTrainValid(days_train)``, fits a tree with
    ``min_samples_leaf=NUM_TYPES``, computes the mean squared error on the
    validation part, writes the validation predictions to
    ``DATA_PATH_RESULT + title + "_lr_pred_" + <today>`` and appends a run
    record (timestamp, parameters, mse, elapsed time) to
    ``DATA_PATH_RESULT + "dtr_result"``.

    :param days_train: forwarded unchanged to ``divideTrainValid``
    :return: None
    """
    begin = datetime.datetime.now()
    grid = False  # grid search is switched off; the plain fit below always runs

    train_x, train_y, valid_x, valid_y = divideTrainValid(days_train)
    print("Loading data completed.")
    print("Read time: " + str(datetime.datetime.now() - begin))

    classifier = DecisionTreeRegressor(min_samples_leaf=NUM_TYPES)
    if not grid:
        classifier.fit(train_x, train_y)

    y_pred = classifier.predict(valid_x)
    accuracy = metrics.mean_squared_error(valid_y, y_pred)
    print("mse: " + str(accuracy))

    end = datetime.datetime.now()
    day = datetime.date.today()
    # np.savetxt rejects 0-d input, so passing the scalar mse here always
    # raised. The file name says "pred", so the predictions are saved; the
    # mse is still recorded in the log below.
    # NOTE(review): confirm that predictions are what this file should hold.
    np.savetxt(DATA_PATH_RESULT + title + "_lr_pred_" + str(day), y_pred,
               fmt='%.5f')

    rcd = str(end) + '\n'
    rcd += "lr: " + title + " 130" + '\n'
    rcd += str(classifier.get_params()) + '\n'
    rcd += "mse: " + str(accuracy) + '\n'
    rcd += "time: " + str(end - begin) + '\n' + '\n' + '\n'
    print(rcd)

    # Context manager guarantees the log file is closed even if write fails.
    with open(DATA_PATH_RESULT + "dtr_result", "a") as log_file:
        log_file.write(rcd)
def decision_tree(df, significant_cols, target, cat_cols, num_cols):
    """Grid-search a DecisionTreeRegressor and report CV and hold-out metrics.

    The rows of ``df`` are split 90/10; categorical columns are one-hot
    encoded and numeric columns standardised (both fitted on the training
    part only). A 3-fold grid search picks the best tree, which is then
    cross-validated for r2 and RMSE, refitted on the training part and
    evaluated on the held-out part.

    :return: ``(r2, rmse, r2_variance, rmse_variance, r2_validation,
        rmse_validation, params)``
    """
    features = df[significant_cols]
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.1, random_state=0)

    encoder = OneHotEncoder(drop='first', sparse=False)
    scaler = StandardScaler()
    train_data = np.c_[encoder.fit_transform(X_train[cat_cols]),
                       scaler.fit_transform(X_train[num_cols])]
    test_data = np.c_[encoder.transform(X_test[cat_cols]),
                      scaler.transform(X_test[num_cols])]

    search_space = {
        'criterion': ['mse', 'friedman_mse', 'mae'],
        'max_features': [None, 'log2', 'sqrt'],
        'ccp_alpha': np.arange(0, 1.1, 0.1)
    }
    search = GridSearchCV(
        DecisionTreeRegressor(random_state=0, splitter='best'),
        search_space, scoring='r2', cv=3)
    search.fit(train_data, y_train)
    best_tree = search.best_estimator_

    r2_cv_scores = cross_val_score(best_tree, train_data, y_train,
                                   scoring='r2', cv=3, n_jobs=-1)
    rmse_cv_scores = cross_val_score(best_tree, train_data, y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3, n_jobs=-1)
    best_params = best_tree.get_params()

    r2 = np.mean(r2_cv_scores)
    # The scorer returns negated RMSE values, hence the absolute value.
    rmse = np.abs(np.mean(rmse_cv_scores))
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.abs(np.var(rmse_cv_scores, ddof=1))

    best_tree.fit(train_data, y_train)
    holdout_pred = best_tree.predict(test_data)
    r2_validation = r2_score(y_test, holdout_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, holdout_pred))

    return (r2, rmse, r2_variance, rmse_variance,
            r2_validation, rmse_validation, best_params)
def test_model_finder_set_model_regression(model_finder_regression, seed):
    """Check that set_model() stores the chosen regression Model, its
    parameters and its scores, and that the Model is left unfitted."""
    expected_scores = {
        "mean_squared_error": 1323.0223273506956,
        "r2_score": -1.7265322117249737
    }
    regressor = DecisionTreeRegressor(max_depth=10, criterion="mae",
                                      random_state=seed)

    finder = model_finder_regression
    finder.scoring_functions = [mean_squared_error, r2_score]
    finder.set_model(regressor)

    assert finder._chosen_model.clf.regressor == regressor
    assert finder._chosen_model_params == regressor.get_params()
    assert finder._chosen_model_scores == expected_scores
    assert type(finder._chosen_model) == WrappedModelRegression

    # Predicting must fail because set_model() is not supposed to fit.
    with pytest.raises(NotFittedError):
        finder.predict([1])
#dt from sklearn.tree import DecisionTreeRegressor tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42) tree_reg.fit(fires_prepared, fires_labels) #rf from sklearn.ensemble import RandomForestRegressor forest_reg = RandomForestRegressor(random_state=42) forest_reg.fit(fires_prepared, fires_labels) #2-1 from sklearn.model_selection import GridSearchCV print("sgd_reg.get_params().keys(): ", sgd_reg.get_params().keys()) print("svm_reg.get_params().keys(): ", svm_reg.get_params().keys()) print("tree_reg.get_params().keys(): ", tree_reg.get_params().keys()) print("forest_reg.get_params().keys(): ", forest_reg.get_params().keys()) params_sgd = [ { 'alpha': [0.1, 0.5], 'epsilon': [0.1, 1] }, { 'alpha': [0.5, 0.6], 'epsilon': [0.1, 0.7] }, ] params_svm = { 'kernel': ["linear", "poly", "rbf"],
# Cross-validation is used to estimate the generalization performance.
# Why tune hyperparameters?
# The default hyperparameters are not optimal for every problem; tuning
# them is how the best performance is reached.
'''1)Grid Search 2)Random Search 3)Bayesian Search 4)Genetic Search'''
# Grid Search Cross Validation
# - manually set a grid of discrete hyperparameter values
# - set a metric for scoring model performance
# - for each hyperparameter combination, evaluate the model's CV
#   (cross-validation) score; the combination with the best CV score gives
#   the optimal hyperparameters
# - the bigger the grid, the longer it takes to find the solution
################################################
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.get_params())

from sklearn.model_selection import GridSearchCV
# Define params_dt
params_dt = {'max_depth': [2, 3, 4],
             'min_samples_leaf': [0.12, 0.14, 0.16, 0.18]}
# Alternative configuration: scoring='accuracy', cv=10, n_jobs=1.
grid_dt = GridSearchCV(estimator=dt, param_grid=params_dt,
                       scoring='roc_auc', cv=5, n_jobs=-1)
# best_estimator_ only exists after the search has been fitted.
# NOTE(review): X_train / y_train are assumed to be defined next to the
# X_test / y_test used below -- confirm.
grid_dt.fit(X_train, y_train)

from sklearn.metrics import roc_auc_score
best_model = grid_dt.best_estimator_  # Extract the best estimator
# Predict the test set probabilities of the positive class
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_pred_proba)  # Compute test_roc_auc
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))  # Print test_roc_auc
################################################
# Tuning RF's hyperparameters
# Hyperparameter tuning is expensive and sometimes brings only a slight
# improvement.
# Weigh the impact of tuning the hyperparameters against its cost.
#
# Sequential split: the first `train_percent` records go to the training
# lists, every later record goes to the test lists.
count = 0
for i in range(len(predictiveAttributeNotDegree)):
    row_i = predictiveAttributeNotDegree[i]
    row_features = [row_i[10], row_i[12], row_i[2]]
    row_target = [row_i[20]]
    if count >= train_percent:
        test_set.append(row_features)
        test_result.append(row_target)
        continue
    count = count + 1
    train_set.append(row_features)
    train_result.append(row_target)

regressor = DecisionTreeRegressor(random_state=0, min_samples_leaf=10)
# Cross-validation (cv=10) on the training lists, first record left out.
print(cross_val_score(regressor, train_set[1:], train_result[1:], cv=10))
regressor.fit(train_set, train_result)
print(regressor.score(test_set, test_result))

# Column legend:
# matr cf 2 3 tot cds tipoCds coorte annicarriera annodiploma votodip codschool tipoMat annolaur votolaur erasmus tesi mot_sta sta fc
newStudent = [[85, 11, 30]]
real_value = [1]
predicted = regressor.predict(newStudent)
print("Predicted: ", predicted)
print("MSE: ", mean_squared_error(real_value, predicted))
print("Params: ", regressor.get_params())
print("Feature Importance: ", regressor.feature_importances_)
# NOTE(review): this append looks like the last statement of a train/test
# split loop that begins before this excerpt -- confirm its indentation
# against the preceding lines.
test_result.append([predictiveAttributeNotDegree[i][2]])

regressor = DecisionTreeRegressor(random_state=0, min_samples_leaf=10)
# Cross-validation (cv=10) on the training lists with the first entry left out.
print(cross_val_score(regressor, train_set[1:], train_result[1:], cv=10))
regressor.fit(train_set, train_result)
print(regressor.score(test_set, test_result))

# Predict the test rows one at a time from the first two values of each row.
prediction = []
for item in test_set:
    items = [[item[0], item[1]]]
    prediction.append(regressor.predict(items))

# Copy the first element of every per-row prediction into a flat float array.
pred = np.zeros(len(prediction))
predi = np.array(prediction)
for i in range(len(prediction)):
    pred[i] = predi[i][0]

print(("MSE: {}".format(mean_squared_error(pred, test_result))))
print("Params: ", regressor.get_params())
print("Feature Importance: ", regressor.feature_importances_)

# Banner (in Italian) announcing the section that uses all attributes.
print(
    "\n\n\n----------QUI INIZIA LA SEZIONE CON TUTTI GLI ATTRIBUTI---------- \n\n\n"
)

# Reload both data sets as pandas Series from JSON records.
# The paths are absolute and specific to one machine.
predictiveAttributeDegree = pd.read_json(
    "C:/Users/sebas/PycharmProjects/MachineLearning-Local/DSML/DecisionTree/predictiveDegree.txt",
    orient='records', dtype=True, typ="series")
predictiveAttributeNotDegree = pd.read_json(
    "C:/Users/sebas/PycharmProjects/MachineLearning-Local/DSML/DecisionTree/predictiveNotDegree.txt",
    orient='records', dtype=True, typ="series")
# Labels: the RESULT column, kept as a one-column frame.
y = shots[['RESULT']]

# Split training examples from labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=22)
# Log the first three rows and the length of every split.
logger.info("\nX_train:\n{}\n({})\n".format(X_train[:3], len(X_train)))
logger.info("\nX_test:\n{}\n({})\n".format(X_test[:3], len(X_test)))
logger.info("\ny_train:\n{}\n({})\n".format(y_train[:3], len(y_train)))
logger.info("\ny_test:\n{}\n({})\n".format(y_test[:3], len(y_test)))

# Fit decision tree regressor model
dt = DecisionTreeRegressor(max_depth=4)
dt.fit(X_train, y_train)
dt_params = dt.get_params(deep=True)
# The format string used to contain "\{}": an invalid escape sequence that
# also put a literal backslash in front of the logged parameters.
logger.info("\nDT Regressor Model Parameters\n{}\n".format(dt_params))
joblib.dump(dt, 'dt.pkl')
logger.info("Persisted the DT Regressor model to dt.pkl...")
y_pred_dt = dt.predict(X_test)

# Fit Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_params = lr.coef_
# NOTE(review): `params` is not assigned anywhere in this part of the
# script; confirm it exists, otherwise this line raises NameError.
logger.info("\nLinear Regressor Model Parameters\n{}\n{}\n".format(
    params, lr_params))
joblib.dump(lr, 'lr.pkl')
logger.info("Persisted the Linear Regression model to lr.pkl...")
y_pred_lr = lr.predict(X_test)
# y_2 = regr_2.predict(X_test)
# y_3 = regr_3.predict(X_test)

# Load the four tab-separated tables, then take their raw value arrays.
train_x_pd = pd.read_table('train_x.txt', sep='\t')
train_y_pd = pd.read_table('train_y.txt', sep='\t')
test_x_pd = pd.read_table('test_x.txt', sep='\t')
test_y_pd = pd.read_table('test_y.txt', sep='\t')
train_x, train_y = train_x_pd.values, train_y_pd.values
test_x, test_y = test_x_pd.values, test_y_pd.values

# Depth-20 tree: cross-validate (cv=10) first, then fit on all training rows.
regr_1 = DecisionTreeRegressor(max_depth=20)
print(cross_validate(regr_1, train_x, train_y, cv=10))
print(regr_1.get_params())
regr_1.fit(train_x, train_y)
y_1 = regr_1.predict(test_x)
#print(y_1.shape, test_y.shape)

# Correlation matrix between predictions and the true test values.
res = np.corrcoef(y_1.ravel(), test_y.ravel())
print(res)
#loss_values = regr_1.estimator.loss_curve_
#plt.plot(loss_values)
#plt.savefig('demo.pdf')
import matplotlib.pyplot as plt

# Import final cleaned dataframe
dataframe = pd.read_csv('dataframe210620.csv', index_col=0)

from sklearn.tree import DecisionTreeRegressor
from pprint import pprint
from sklearn.model_selection import train_test_split

# Features: everything except the identifier, the target and the year.
# `axis` is passed by keyword because pandas 2 no longer accepts it
# positionally in DataFrame.drop.
X_rf = dataframe.drop(['Patient ID', 'A1C (%)', 'Year'], axis=1)
y_rf = dataframe['A1C (%)']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, random_state=0)

# Baseline tree with default hyperparameters.
clf_rf = DecisionTreeRegressor(random_state=0).fit(X_train_rf, y_train_rf)
print('Parameters currently in use:\n')
pprint(clf_rf.get_params())

# Persist the training split.
X_train_rf.to_csv('X_train.csv')
y_train_rf.to_csv('y_train.csv')

from sklearn.model_selection import RandomizedSearchCV
# Number of features to consider at every split - for regressor, none is good
max_features = None
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
#den_r2q = np.power(den_r2, 2)
#r2_manual = 1 - (num_r2q.sum()/den_r2q.sum())
#r2_uv = 1 - (v/u)
#print("R2 força bruta: ", round(r2_manual, 2))
#print("R2 força bruta UV: ", round(r2_uv, 8))

#%% ACCURACY br
#print('Accuracy RF: ', round(np.average(scores_br_rf), 2), "%.")
# Mean of every collected metric, rounded to four decimals.
print('Accuracy DT: ', round(np.mean(accuracy_dt), 4))
print('Accuracy R2: ', round(np.mean(accuracy_r2), 4))
print('Accuracy CV: ', round(np.mean(accuracy_cv), 4))
print('Accuracy MSE: ', round(np.mean(accuracy_mse), 4))
print('Accuracy MAE: ', round(np.mean(accuracy_mae), 4))
print('Accuracy MAPE: ', round(np.mean(accuracy_mape.mean()), 4))
print("Parameters: ", dt_br.get_params())

importance_fields_br_dt_t = importance_fields_br_dt

#%% VIMP
# List of (feature, importance) tuples - decision tree
feature_importances_br_dt = [
    (feature, round(importance, 8))
    for feature, importance in zip(features_br_list_oh, importance_fields_br_dt)
]

# Print out the feature and importances
# [print('Variable DT: {:20} Importance DT: {}'.format(*pair)) for pair in feature_importances_br_dt];

#%%# STORING THE VIMP VALUES
I01_AL_DT = importance_fields_al_dt_t[:5]
I02_AL_DT = importance_fields_al_dt_t[5:11]
def _decision_tree_regression_train(
        table, feature_cols, label_col,  # fig_size=np.array([6.4, 4.8]),
        criterion='mse', splitter='best', max_depth=None, min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
        random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, presort=False, sample_weight=None,
        check_input=True, X_idx_sorted=None):
    """Train a DecisionTreeRegressor and build its model dictionary and report.

    Validates the numeric hyperparameters, fits the regressor on
    ``table[feature_cols]`` against ``table[label_col]``, renders the tree
    (when the graphviz tooling is usable) and a feature-importance bar
    chart, and stores everything in a model dictionary.

    Parameters
    ----------
    table : DataFrame-like
        Indexed as ``table[feature_cols]`` and ``table[label_col]``.
    feature_cols : list
        Names of the feature columns.
    label_col : str
        Name of the label column.
    criterion ... presort
        Forwarded to ``DecisionTreeRegressor`` under the same names.
    sample_weight, check_input, X_idx_sorted
        Forwarded to ``DecisionTreeRegressor.fit`` under the same names.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` holds the fitted regressor, its
        parameters and attributes, and the rendered report under
        ``'_repr_brtc_'``.
    """
    param_validation_check = [
        greater_than_or_equal_to(min_samples_split, 2, 'min_samples_split'),
        greater_than_or_equal_to(min_samples_leaf, 1, 'min_samples_leaf'),
        greater_than_or_equal_to(min_weight_fraction_leaf, 0.0,
                                 'min_weight_fraction_leaf')
    ]
    if max_depth is not None:
        param_validation_check.append(
            greater_than_or_equal_to(max_depth, 1, 'max_depth'))
    validate(*param_validation_check)

    # Keyword arguments keep the call independent of the constructor's
    # parameter order; newer scikit-learn releases reject positional use.
    regressor = DecisionTreeRegressor(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        presort=presort)
    regressor.fit(table[feature_cols], table[label_col],
                  sample_weight=sample_weight, check_input=check_input,
                  X_idx_sorted=X_idx_sorted)

    # Drawing the tree is best-effort: any failure falls back to a hint.
    try:
        try:
            from sklearn.externals.six import StringIO
        except ImportError:
            # sklearn.externals.six is absent from newer scikit-learn.
            from io import StringIO
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(regressor, out_file=dot_data,
                        feature_names=feature_cols,
                        filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

        from brightics.common.repr import png2MD
        fig_tree = png2MD(graph.create_png())
    except Exception:
        fig_tree = "Graphviz is needed to draw a Decision Tree graph. Please download it from http://graphviz.org/download/ and install it to your computer."

    # json
    model = _model_dict('decision_tree_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    feature_importance = regressor.feature_importances_
    model['feature_importance'] = feature_importance
    model['max_features'] = regressor.max_features_
    model['n_features'] = regressor.n_features_
    model['n_outputs'] = regressor.n_outputs_
    model['tree'] = regressor.tree_
    get_param = regressor.get_params()
    model['parameters'] = get_param
    model['regressor'] = regressor

    # report: horizontal bar chart, least important feature at the bottom
    indices = np.argsort(feature_importance)
    sorted_feature_cols = np.array(feature_cols)[indices]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), feature_importance[indices], color='b',
             align='center')
    for i, v in enumerate(feature_importance[indices]):
        plt.text(v, i, " {:.2f}".format(v), color='b', va='center',
                 fontweight='bold')
    plt.yticks(range(len(indices)), sorted_feature_cols)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    fig_feature_importances = plt2MD(plt)
    plt.clf()

    params = dict2MD(get_param)

    # Add tree plot
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Decision Tree Regression Train Result
    | ### Decision Tree
    | {fig_tree}
    |
    | ### Feature Importance
    | {fig_feature_importances}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_tree=fig_tree,
               fig_feature_importances=fig_feature_importances,
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
class model:  # Main model class
    """Regression model wrapper with optional PCA preprocessing.

    ``model`` is the wrapped estimator (``None`` until one is built or
    loaded), ``scaler`` and ``pca`` are the fitted preprocessors (``None``
    when absent), ``descrips`` holds the descriptor column names recorded by
    ``convert_arrays`` and ``information`` collects the dictionaries produced
    by ``printinfo`` during training and testing.
    """

    def __init__(self, modeltype, PCA=None, modelparams=None):
        """
        Builds the estimator selected by modeltype

        Parameters
        ----------
        modeltype : string
            One of 'Linear', 'SVM', 'LinearSVM', 'SGD', 'MLP', 'KNN', 'Tree'
            or 'load' ('load' builds nothing; use load_model afterwards)
        PCA : string
            Filepath handed to ``load``; the loaded object is indexed as
            [pca, scaler] (defaults to None, meaning no PCA)
        modelparams : dict
            Parameters forwarded to the estimator's ``set_params``
        """
        self.information = {}
        # Defined up front so later methods never hit a missing attribute.
        self.model = None
        self.scaler = None
        self.descrips = None
        if PCA:
            fitters = load(PCA)
            self.pca = fitters[0]
            self.scaler = fitters[1]
        else:
            self.pca = None

        if modeltype == 'Linear':  # Linear Regression
            self.model = LinearRegression(n_jobs=-1)
        elif modeltype == 'SVM':  # Support Vector Machine
            self.model = svm.SVR(cache_size=750, C=200)
        elif modeltype == 'LinearSVM':  # Linear SVM
            self.model = svm.LinearSVR()
        elif modeltype == 'SGD':  # Stochastic Gradient Descent
            self.model = SGDRegressor()
        elif modeltype == 'MLP':  # Multi-layer Perceptron
            self.model = MLPRegressor(learning_rate='adaptive', max_iter=1000)
        elif modeltype == 'KNN':  # K Nearest Neighbour
            self.model = KNeighborsRegressor(n_neighbors=2, n_jobs=-1)
        elif modeltype == 'Tree':  # Decision Tree
            self.model = DecisionTreeRegressor()
        elif modeltype == 'load':  # Load a pre-existing model
            pass
        else:  # Not supported
            print('Model type not recognised')

        # With 'load' or an unknown type there is no estimator to configure.
        if modelparams and self.model is not None:
            self.model.set_params(**modelparams)

    def _project(self, X):
        """Scale X with the stored scaler, then project it with the stored PCA."""
        return self.pca.transform(self.scaler.transform(X))

    def _preprocess(self, X):
        """Apply scaler + PCA when a PCA is set, otherwise the scaler only."""
        if self.pca:
            return self._project(X)
        return self.scaler.transform(X)

    @staticmethod
    def _rmse(actual, predicted):
        """Root mean squared error.

        Computed as the square root of the MSE instead of passing
        ``squared=False``, which recent scikit-learn releases removed; the
        two agree for single-output targets.
        """
        return np.sqrt(mean_squared_error(actual, predicted))

    def convert_arrays(self, datadf):  # Convert pd dataframes to numpy ndarrays
        """
        Converts pd.Dataframe to np.ndarray

        Parameters
        ----------
        datadf : pd.Dataframe
            Dataframe of MP's and descriptors and SMILES; any other input is
            treated as an [X, Y] pair and returned unchanged

        Returns
        -------
        X : np.ndarray
            Descriptor values array
        Y : np.ndarray
            MP values
        """
        if not isinstance(datadf, pd.DataFrame):
            return datadf[0], datadf[1]

        Y = datadf['MP'].to_numpy()
        descriptors = datadf.drop(['SMILES', 'MP'], axis=1)
        self.descrips = descriptors.keys()
        return descriptors.to_numpy(), Y

    def split_data(self, data, split):
        """
        Splits data into train and test data

        Parameters
        ----------
        data : np.array or pd.Dataframe
            Dataset to split
        split : float between 0 and 1
            Proportion of dataset to create train data

        Returns
        -------
        Training Data : list
            Training data as a list of [X,Y] where X and Y are numpy arrays of the descriptor and MP values respectively
        Test Data : list
            Test data as a list of [X,Y] where X and Y are numpy arrays of the descriptor and MP values respectively
        """
        X, Y = self.convert_arrays(data)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=split)
        return [X_train, Y_train], [X_test, Y_test]

    def printinfo(self, X):
        """
        Prints info about the current model

        Parameters
        ----------
        X : numpy array
            Dataset descriptors array

        Returns
        -------
        Dictionary holding the model type, model parameters, sample count,
        feature count and whether PCA is used (the same values are printed)
        """
        modelname = type(self.model).__name__
        print("Model: " + modelname)
        parameters = self.model.get_params()
        print("Model Parameters: " + str(parameters))
        uses_pca = bool(self.pca)
        print("PCA: True" if uses_pca else "PCA: False")
        samples = np.size(X, 0)
        features = np.size(X, 1)
        print("Dataset # of samples: " + str(samples))
        print("Dataset # of features: " + str(features))
        return {'Model Type': modelname, 'Model Parameters': parameters,
                'Samples': samples, 'Features': features, 'PCA': uses_pca}

    def crossValidate(self, training_data, folds=5):  # Cross Validate model
        """
        Performs cross validation using the current model

        Parameters
        ----------
        training_data : np.array or pd.Dataframe
            Dataset to perform cross validation on
        folds : integer
            Number of folds
        """
        X, Y = self.convert_arrays(training_data)
        if self.pca:
            X = self._project(X)
        # The pipeline standardises inside every fold.
        modelPipeline = make_pipeline(preprocessing.StandardScaler(), self.model)
        kf = KFold(n_splits=folds, shuffle=True)
        cvScore = cross_val_score(modelPipeline, X, Y,
                                  scoring='neg_root_mean_squared_error',
                                  cv=kf, n_jobs=-1, verbose=1)
        print("CROSS VALIDATION")
        self.printinfo(X)
        print("Cross validated score (RMSE): " + str(cvScore))
        print("Mean = " + str(statistics.mean(cvScore)) + "\n")

    def train_model(self, training_data):  # Train model on inputted dataset
        """
        Trains model on inputted dataset

        Parameters
        ----------
        training_data : np.array or pd.Dataframe
            Data to train the model on
        """
        X, Y = self.convert_arrays(training_data)
        if self.pca:
            X = self._project(X)
        else:
            # No PCA: fit a fresh scaler on this data and keep it for later use.
            self.scaler = preprocessing.StandardScaler().fit(X)
            X = self.scaler.transform(X)
        self.model.fit(X, Y)
        print("TRAINING")
        self.information['Training'] = self.printinfo(X)
        print("R^2 Score = " + str(self.model.score(X, Y)))
        predicted = self.model.predict(X)
        RMSE = self._rmse(Y, predicted)
        print("RMSE = " + str(RMSE) + "\n")
        self.information['Training']['RMSE'] = RMSE

    def save_model(self, filepath):  # Input filepath with filename included
        """
        Saves the model to a .joblib file

        Parameters
        ----------
        filepath : string
            The filepath to save the model to ('.joblib' is appended)
        """
        full_model = {'model': self.model, 'scaler': self.scaler,
                      'descriptors': self.descrips, 'PCA': self.pca,
                      'information': self.information}
        dump(full_model, filepath + '.joblib')  # File extension is added automatically

    def load_model(self, file):  # Load saved model
        """
        Loads a model from a .joblib file

        Parameters
        ----------
        file : string
            Filepath to load model from
        """
        models = load(file)
        self.model = models['model']
        self.scaler = models['scaler']
        self.descrips = models['descriptors']
        self.pca = models['PCA']
        self.information = models['information']

    def test_model(self, test_data):  # Test model on test_data and return RMSE
        """
        Tests model on inputted dataset and returns the predicted values

        Parameters
        ----------
        test_data : np.array or pd.Dataframe
            Dataset to test the model on

        Returns
        -------
        Y : np.array
            Actual MP values
        predicted : np.array
            Predicted MP values
        """
        X, Y = self.convert_arrays(test_data)
        X = self._preprocess(X)
        predicted = self.model.predict(X)
        print("TESTING")
        self.information['Testing'] = self.printinfo(X)
        print("R^2 = " + str(r2_score(Y, predicted)))
        RMSE = self._rmse(Y, predicted)
        print("RMSE = " + str(RMSE) + "\n")
        self.information['Testing']['RMSE'] = RMSE
        return Y, predicted

    def gridsearch(self, test_data, params, save=None, graph=False):  # Perform a gridsearch on test_data using params
        """
        Performs a cross validated gridsearch on a dataset with selected parameters

        Parameters
        ----------
        test_data : np.array or pd.Dataframe
            Dataset to use for the gridsearch
        params : dict
            Dictionary of parameter values to test; keys address pipeline
            steps, i.e. have the form '<step>__<parameter>'
        save : string
            Filepath to save results to (defaults to None if not inputted)
        graph : boolean
            If true, creates graph of results (only works when one parameter is being varied)

        Returns
        -------
        Creates graph if graph = True
        Saves .csv of results if a save filepath is given
        """
        modelPipeline = make_pipeline(preprocessing.StandardScaler(), self.model)
        #print(modelPipeline.get_params().keys())
        gridcv = GridSearchCV(modelPipeline, param_grid=params, n_jobs=-1,
                              scoring='neg_root_mean_squared_error', verbose=1)
        X, Y = self.convert_arrays(test_data)
        if self.pca:
            X = self._project(X)
        gridcv.fit(X, Y)
        print("GRIDSEARCH")
        self.printinfo(X)
        results = gridcv.cv_results_
        print("Best Parameter : " + str(results['params'][gridcv.best_index_]))
        print("RMSE: " + str(results['mean_test_score'][gridcv.best_index_]))

        if graph:
            for param in params:
                # Part after the pipeline-step prefix, used as axis label.
                variable = param.split('__')[1]
                x_axis = (results["param_" + param]).filled().astype(np.float64)
                y_axis = results["mean_test_score"]
                std = results["std_test_score"]
                sns.lineplot(x="param_" + param, y="mean_test_score",
                             data=results, color='red')
                plt.title("Gridsearch on " + type(self.model).__name__)
                plt.xlabel(variable)
                plt.ylabel("Negative RMSE /°C")
                plt.fill_between(x=x_axis, y1=y_axis - std, y2=y_axis + std,
                                 alpha=0.2, color='red')
                plt.show()

        if save:  # Input filepath to save to if wanted
            pd.DataFrame.from_dict(results, orient="index").to_csv(save + '.csv')

    def predictSingle(self, mol):  # Return the predicted MP of single mol
        """
        Predicts the MP of a single molecule

        Parameters
        ----------
        mol : array
            Descriptor values for molecule

        Returns
        -------
        prediction : array
            Contains predicted MP of inputted molecule
        """
        return self.model.predict(self._preprocess(mol))

    def getDescriptors(self):  # Returns descriptors used in model
        """
        Returns the descriptors of the model (None until a dataframe has
        been converted or a model has been loaded)
        """
        return self.descrips