from flask import request, render_template
import pandas as pd


def predict():
    # Import the data file
    df = pd.read_csv('refined.csv')

    # Features and labels
    features = df.iloc[:10000, 0]
    labels = df.iloc[:10000, 1]

    # Extract features with CountVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer()
    features = cv.fit_transform(features.values.astype('U')).toarray()

    # Form the train/test split
    from sklearn.model_selection import train_test_split as TTS
    f_train, f_test, l_train, l_test = TTS(features, labels,
                                           random_state=1, test_size=0.33)

    # Train a Multinomial (not Gaussian) Naive Bayes classifier
    del features, labels, df
    from sklearn.naive_bayes import MultinomialNB
    MNB = MultinomialNB()
    MNB.fit(f_train, l_train)
    l_pred = MNB.predict(f_test)

    from sklearn.metrics import classification_report
    print(classification_report(l_test, l_pred))

    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        vect = cv.transform(data).toarray()
        my_prediction = MNB.predict(vect)
        return render_template('result.html', prediction=my_prediction)
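# A minimal sketch of the Flask wiring this view assumes; the route path
# and app object are hypothetical, not taken from the snippet above.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/predict', view_func=predict, methods=['GET', 'POST'])

if __name__ == '__main__':
    app.run(debug=True)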
async def prediction(algo: str, number: float):
    if number > 1 or number < 0:
        result = ("You must choose between: KNN, RF (RandomForest) or "
                  "GB (GradientBoosting), plus a train/test split "
                  "proportion (train_size) between 0 and 1.")
    else:
        X_tr, X_te, Y_tr, Y_te = TTS(
            X, Y,              # features, target
            stratify=Y,        # keep class proportions identical in train and test
            random_state=777,  # fix the seed so results are reproducible across runs
            train_size=number)  # fraction of the data that goes to train
        if algo == 'KNN':
            scoreTr, scoreTe = kn(number, X_tr, X_te, Y_tr, Y_te)
            result = ("For algorithm %s, accuracy is %f on train and %f on test."
                      % (algo, scoreTr, scoreTe))
        elif algo == 'RF':
            scoreTr, scoreTe = rf(number, X_tr, X_te, Y_tr, Y_te)
            result = ("For algorithm %s, accuracy is %f on train and %f on test."
                      % (algo, scoreTr, scoreTe))
        elif algo == 'GB':
            scoreTr, scoreTe = gb(number, X_tr, X_te, Y_tr, Y_te)
            result = ("For algorithm %s, accuracy is %f on train and %f on test."
                      % (algo, scoreTr, scoreTe))
        else:
            result = ("You must choose between: KNN, RF (RandomForest) or "
                      "GB (GradientBoosting), plus a train/test split "
                      "proportion (train_size) between 0 and 1.")
    return result
import sys

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split as TTS
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report


def main():
    fileName = sys.argv[1]
    ourChords = pd.read_csv(fileName)             # load our dataset
    theirChords = pd.read_csv("theirChords.csv")  # load the reference dataset
    # concatenate the two datasets (DataFrame.append was removed in pandas 2.0)
    dataSet = pd.concat([ourChords, theirChords], sort=False)
    dataSet = dataSet.sample(frac=1).reset_index(drop=True)  # shuffle the dataset

    # split the dataset into chords (features) and labels
    X = dataSet.iloc[:, 1:16]
    Y = dataSet.iloc[:, 0]
    print(X)
    print(Y)

    # encode labels as integers
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Y)

    # one-hot encode the categorical features
    X = pd.get_dummies(X)

    # split into train and test sets
    X_train, X_test, Y_train, Y_test = TTS(X, Y, test_size=0.20)

    # train the network
    mlp = MLPClassifier(hidden_layer_sizes=(16, 16, 16), max_iter=1000)
    mlp.fit(X_train, Y_train)

    # make predictions and print the analysis
    predictions = mlp.predict(X_test)
    print(confusion_matrix(Y_test, predictions))
    print(classification_report(Y_test, predictions))
def __two_splits(self, *data, **options):
    """Split data into train, test, and validation datasets."""
    test_size = options['test_size']
    validation_size = 1 - options['train_size'] - test_size
    options['test_size'] = None

    # First split: carve off the training set
    X_train, X_test, y_train, y_test = TTS(*data, **options)

    # Second split: divide the hold-out between test and validation
    options['train_size'] = test_size / (test_size + validation_size)
    X_test, X_validation, y_test, y_validation = TTS(
        X_test, y_test, **options)
    return [X_train, X_test, X_validation,
            y_train, y_test, y_validation]
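# A minimal usage sketch of the same two-step split done by hand, assuming
# train_size=0.6 and test_size=0.2 (so validation ends up with 0.2):
import numpy as np
from sklearn.model_selection import train_test_split as TTS

X = np.arange(100).reshape(50, 2)
y = np.arange(50)
# First split: 60% train, 40% held out
X_train, X_hold, y_train, y_hold = TTS(X, y, train_size=0.6)
# Second split of the hold-out: 0.2 / (0.2 + 0.2) = 50% test, 50% validation
X_test, X_val, y_test, y_val = TTS(X_hold, y_hold, train_size=0.5)
print(len(X_train), len(X_test), len(X_val))  # 30 10 10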
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor as XGBR


def xgboost_demo():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)  # n_estimators = number of trees
    reg.predict(Xtest)
    print(reg.score(Xtest, Ytest))                # R^2 on the test set
    print(MSE(Ytest, reg.predict(Xtest)))         # check the mean squared error
    print(reg.feature_importances_)               # each feature's contribution
    print(CVS(reg, Xtrain, Ytrain, cv=5).mean())  # mean 5-fold cross-validation score
def Model(features, labels):
    # Applying OneHotEncoding (`categorical_features` is the pre-0.22
    # scikit-learn API; a current equivalent is sketched below)
    from sklearn.preprocessing import OneHotEncoder
    col_to_ohe = [6, 7]  # columns to be one-hot encoded
    ohe = OneHotEncoder(categorical_features=col_to_ohe)
    features = ohe.fit_transform(features).toarray()

    # Getting indexes for the columns to be dropped, to avoid the dummy
    # variable trap (`dataset` is assumed to be defined at module level)
    total_col, indexes = 0, []
    for col in col_to_ohe:
        unique_val_count = len(dataset.iloc[:, col].value_counts())
        total_col += unique_val_count
        indexes.append(total_col - unique_val_count)

    # Dropping the dummy variable trap columns
    features = np.delete(features, indexes, axis=1)

    # Splitting the dataset into train and test
    from sklearn.model_selection import train_test_split as TTS
    f_train, f_test, l_train, l_test = TTS(features, labels,
                                           test_size=0.25, random_state=0)

    # Logistic Regression model
    from sklearn.linear_model import LogisticRegression
    reg = LogisticRegression(random_state=0)
    reg = reg.fit(f_train, l_train)
    pred = reg.predict(f_test)  # prediction on test data

    # Preprocessing the new individual's data
    # np.array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 25, 3, 1, 4, 16]).reshape(1, -1)
    val = np.array([3, 25, 3, 1, 4, 16, 4, 2]).reshape(1, -1)
    val = ohe.transform(val).toarray()
    val = np.delete(val, indexes, axis=1)
    val_pred = reg.predict_proba(val)  # predicting the individual's value

    # Confusion matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(l_test, pred)

    # Check the accuracy of the model
    mod_score = reg.score(f_test, l_test)
    return pred, val_pred, cm, mod_score
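# `categorical_features` was removed in scikit-learn 0.22. A hedged sketch
# of the same encoding step on current scikit-learn, assuming the same
# column indexes; drop='first' replaces the manual dummy-trap deletion.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([('ohe', OneHotEncoder(drop='first'), [6, 7])],
                       remainder='passthrough')
# features = ct.fit_transform(features)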
def singleExperiment(cfg):
    """Can only run with access to the config variable."""
    # make data
    X, y = make_classification(n_samples=cfg['n_samples'],
                               n_features=cfg['n_features'])
    # train test split
    X_train, X_test, y_train, y_test = TTS(X, y, test_size=.2)

    # set up RecurrentForest model
    rec_fst_clf = RF.RecurrentForest(X_train, y_train, cfg['T'],
                                     cfg['n_trees'], cfg['p_connect'],
                                     cfg['p_feature'], cfg['p_example'],
                                     cfg['tree_kwargs'])
    # set up RandomForest
    rnd_fst_clf = RFC(**cfg['random_forest_kwargs'])
    # set up AdaBoost
    ada_bst_clf = ABC(**cfg['ada_boost_kwargs'])
    # in a list
    models = [rec_fst_clf, rnd_fst_clf, ada_bst_clf]

    print("<<< training models >>>")
    for m in tqdm(models):
        m.fit(X_train, y_train)  # RecurrentForest ignores args - data present at init

    print("<<< testing models >>>")
    y_hats = np.zeros((3, X_test.shape[0]))
    for i, m in tqdm(enumerate(models)):
        if i == 0:
            y_hats[i, :] = m.predictNew(X_test)
        else:
            y_hats[i, :] = m.predict(X_test)

    # get metrics
    measures = np.zeros((3, 4))
    for i in tqdm(range(3)):
        measures[i, :] = M.binary_metrics(y_test, y_hats[i, :],
                                          model=str(models[i]))
    return measures
def draw_curve():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []   # 1 minus bias
    var = []  # recorded variance (unused here; see draw_curve_2)
    ge = []   # controllable part of the generalization error (unused here)
    for i in axisx:
        reg = XGBR(n_estimators=i)  # number of trees
        # scorers follow "higher is better", hence scoring='neg_mean_squared_error'
        rs.append(CVS(reg, Xtrain, Ytrain, cv=5,
                      scoring='neg_mean_squared_error').mean())
    print(axisx[rs.index(max(rs))], max(rs))
    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='XGB')
    plt.legend()
    plt.show()
def draw_curve_2():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []
    var = []
    ge = []
    for i in axisx:
        reg = XGBR(n_estimators=i, random_state=420)  # number of trees
        cvresult = CVS(reg, Xtrain, Ytrain, cv=5)     # 5-fold cross-validation
        rs.append(cvresult.mean())                    # 1 minus bias
        var.append(cvresult.var())                    # record the variance
        # controllable part of the generalization error: bias^2 + variance
        ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
    # print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
    # where the controllable part of the generalization error is smallest,
    # print R^2, the variance, and the generalization error
    print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))],
          var[ge.index(min(ge))], min(ge))
# (first statement reconstructed from a truncated source line: fit the
# encoder on the training features, then reuse it on the test features)
features_train_data = onehotencoder.fit_transform(features_train_data).toarray()
features_test_data = onehotencoder.transform(features_test_data).toarray()

import numpy as np
features_train_data = np.append(arr=features, values=features_train_data, axis=1)
features_test_data = np.append(arr=features2, values=features_test_data, axis=1)

from sklearn.model_selection import train_test_split as TTS
f_train, f_test, l_train, l_test = TTS(features_train_data, labels_train_data,
                                       test_size=0.3, random_state=0)

model_scores = {}

# Reduce dimensionality with PCA before modelling
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
f_train = pca.fit_transform(f_train)
f_test = pca.transform(f_test)
explained_variance = pca.explained_variance_ratio_

from sklearn.metrics import accuracy_score

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
datatotal = pd.read_csv("Chemistry_f.csv")
X = datatotal.drop("Y", axis=1)
X = preprocessing.scale(X)
y = datatotal.iloc[:, 26]

# Keep the 20 most important features (importances computed previously)
importances = np.loadtxt("importance1.txt")
im = np.sort(importances)[::-1]
arg = np.argsort(importances)[::-1]
i = 20
t = arg[:i]
X = X[:, np.array(t)]
X = preprocessing.scale(X)
y = datatotal.iloc[:, 26]
X_train, X_test, Y_train, Y_test = TTS(X, y, test_size=0.2,
                                       random_state=seed)  # `seed` defined elsewhere

# Identity line for the prediction-vs-truth plots
x = np.arange(-1, 2, step=0.001)
y = x

### XGBoost
fig = plt.figure(figsize=(8, 16))
ax = fig.subplots(3, 2)
reg = XGBR(silent=True, n_estimators=200, max_depth=3, learning_rate=0.26,
           reg_lambda=0.09).fit(X_train, Y_train)
xgb_pre = reg.predict(X_test)
xgb_pre_tr = reg.predict(X_train)
xgb_avg = CVS(reg, X_train, Y_train, scoring="neg_mean_absolute_error", cv=5).mean()
# these metrics take (y_true, y_pred)
xgb_mse = metrics.mean_squared_error(Y_test, xgb_pre)
xgb_r2 = metrics.explained_variance_score(Y_test, xgb_pre)
xgb_mae = metrics.mean_absolute_error(Y_test, xgb_pre)
features = df.iloc[:, :-1].values
labels = df.iloc[:, -1:].values

# Impute missing values in column 1 with the median (SimpleImputer is the
# current replacement for the removed sklearn.preprocessing.Imputer)
import numpy as np
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="median")
features[:, 1:2] = imp.fit_transform(features[:, 1:2])

# Label-encode the categorical column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(features[:, 0])
features[:, 0] = le.transform(features[:, 0])

# One-hot encode column 0 (ColumnTransformer replaces the removed
# `categorical_features` argument of OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder as OHE
ct = ColumnTransformer([('ohe', OHE(), [0])], remainder='passthrough')
features = ct.fit_transform(features)
labels = le.fit_transform(labels.ravel())  # LabelEncoder expects a 1-D array

from sklearn.model_selection import train_test_split as TTS
x_train, x_test, y_train, y_test = TTS(features, labels, test_size=0.4,
                                       random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
# In[29]:
# Drop every row containing NaN values
X = X.dropna(how='any')

# In[30]:
X.shape, Y.shape

# In[31]:
X_tr, X_te, Y_tr, Y_te = TTS(
    X, Y,              # features, target
    stratify=Y,        # keep class proportions identical in train and test
    random_state=777,  # fix the seed so results are reproducible across runs
    train_size=0.8)    # 80% train, 20% test

# In[60]:
knn = KNN(n_neighbors=21, weights='uniform', leaf_size=3)
knn.fit(X_tr, Y_tr)
train_preds = knn.predict(X_tr)
predictions = knn.predict(X_te)
knnTr = accuracy(train_preds, Y_tr)
knnTe = accuracy(predictions, Y_te)

# In[76]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Categorical data (np.float was removed from NumPy; use the builtin float)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ctX = ColumnTransformer([("State", OneHotEncoder(), [3])],
                        remainder='passthrough')
X = np.array(ctX.fit_transform(X), dtype=float)[:, 1:]

# Splitting into test and training sets
from sklearn.model_selection import train_test_split as TTS
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.2, random_state=0)

# Train
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict
y_pred = regressor.predict(X_test)

# Backward elimination: prepend a column of ones as the intercept term
import statsmodels.api as sm
X = np.append(arr=np.ones((50, 1), float), values=X, axis=1)
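# A hedged sketch of one backward-elimination step with statsmodels; the
# column indexes in X_opt are illustrative, not taken from the source:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
ols = sm.OLS(endog=y, exog=X_opt).fit()
print(ols.summary())  # drop the predictor with the highest p-value, then refit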
df = pd.read_csv(dir + 'KNN_Project_Data', index_col=0)
print(df.head())

# PLOTTING
# sns.pairplot(df)
# plt.show()

# SCALING
scaler = SS()
scaler.fit(df.drop('TARGET CLASS', axis=1))
scaled = scaler.transform(df.drop('TARGET CLASS', axis=1))
df_scale = pd.DataFrame(scaled, columns=df.columns[:-1])
print(df_scale.head())

# SPLIT DATA INTO TRAINING AND TESTING
X_train, X_test, y_train, y_test = TTS(df_scale, df['TARGET CLASS'],
                                       test_size=0.3, random_state=101)

# KNN
model = KNC(n_neighbors=1)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred))
print(CM(y_test, pred))

# CHOOSE K VALUE (ELBOW METHOD)
error_rate = []
for i in range(1, 40):
    model = KNC(n_neighbors=i)
    model.fit(X_train, y_train)
    # (loop body assumed: track the misclassification rate for each k)
    error_rate.append(np.mean(model.predict(X_test) != y_test))
## Make a dictionary to store model importances
print_message("Working on RFC")
model_importances = {}
for feature in list(clinical_columns) + list(micro_bio_colums):
    model_importances[feature] = 0

## Store AUC for each validation round
auc_validations = {}
for k in np.unique(y_true):
    auc_validations[k] = []

## Repeated random train/test splits (Monte Carlo cross-validation)
for T in range(20):
    ## Train-test split
    X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.25)

    ## Train the model
    # print("Building RFC model\n")
    rfc = RFC(n_estimators=30, n_jobs=5)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict_proba(X_test)
    y_pred_b = rfc.predict(X_test)

    ## Create a one-hot encoding of the labels for AUC
    y_test_e = label_binarize(y_test, classes=np.unique(y_test))

    # Compute the ROC curve and AUC for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
from xgboost import XGBRegressor as XGBR
import xgboost as xgb
from sklearn.model_selection import cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
import pickle

data = load_boston()
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)  # predict via the sklearn-style interface
score = reg.score(Xtest, Ytest)  # which evaluation metric does this return? (R^2)
MSE(Ytest, reg.predict(Xtest))

dtrain = xgb.DMatrix(Xtrain, Ytrain)
param = {
    'silent': True,
    'objective': 'reg:squarederror',  # native-API name; 'reg:linear' is the deprecated spelling
    "subsample": 1,
    "eta": 0.05,
    "gamma": 20,
    "lambda": 3.5,
    "alpha": 0.2,
    "max_depth": 4,
    "colsample_bytree": 0.4,
    "colsample_bylevel": 0.6,
    # ... (truncated)
}
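# A hedged sketch of the native-API training step the DMatrix above is
# built for; num_boost_round is an arbitrary choice, not from the source:
booster = xgb.train(param, dtrain, num_boost_round=100)
dtest = xgb.DMatrix(Xtest)
print(r2_score(Ytest, booster.predict(dtest)))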
from collections import Counter

# In[2]:
data_wine = load_wine()
data_wine.keys()

# In[3]:
X = pd.DataFrame(data_wine.data, columns=data_wine.feature_names)
y = pd.DataFrame(data_wine.target, columns=['flag'])
pd.concat([X.head(), y.head()], axis=1).shape  # column-wise concat, so (5, 14)

# In[4]:
x_train, x_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=1)

# In[5]:
dtc = tree.DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    random_state=1,
)
dtc.fit(x_train, y_train)
dtc.score(x_test, y_test)

# In[6]:
plt.barh(
    X.columns,
    dtc.feature_importances_)  # assumed second argument: plot each feature's importance
plt.style.use('ggplot')

# GET THE DATA
iris = sns.load_dataset('iris')
print(iris.info())

# PLOTTING
# sns.pairplot(iris,hue='species')
# plt.show()
# sns.kdeplot(iris[['sepal_width','sepal_length']][iris['species'] == "setosa"])
# plt.show()

# SPLIT DATA
X_train, X_test, y_train, y_test = TTS(iris.drop('species', axis=1),
                                       iris['species'],
                                       test_size=0.3, random_state=101)

# TRAIN MODEL
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred), CM(y_test, pred))
print(model)

# GRID SEARCH - "THIS IS NOT NECESSARY, THE MODEL IS PERFECT"
param_grid = {
    'C': list(np.arange(0.1, 10, 0.1)),
    'gamma': [1, 0.1, 0.001, 0.0001]
}
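# A hedged sketch of running the grid search the param_grid above is for
# (GridSearchCV import assumed; verbosity and refit defaults arbitrary):
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(), param_grid, verbose=2)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(CR(y_test, grid.predict(X_test)))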
# In[2]:
# Load the dataset; it comes back as a dict-like Bunch
data = load_boston()
data

# In[3]:
x = data.data
y = data.target
print(x.shape)
print(y.shape)

# In[4]:
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

# In[5]:
reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)

# In[6]:
# Score on the test set; by default this returns R^2
reg.score(xtest, ytest)

# In[7]:
# Mean squared error
MSE(ytest, reg.predict(xtest))
# TFIDF TRANSFORMATION OF THE BOW TRANSFORMATION (THE COUNT-VECTORIZED MESSAGES)
messages_tfidf = tt.transform(messages_bow)

# NAIVE BAYES
spam_detect_model = MNB().fit(messages_tfidf, df['label'])
pred4 = spam_detect_model.predict(tfidf4)[0]
print(pred4)
pred = spam_detect_model.predict(messages_tfidf)
rate = np.mean(pred == df['label'])
print("Rate: {}\n".format(rate))

# TRAIN AND TEST
msg_train, msg_test, label_train, label_test = TTS(df['msg'], df['label'],
                                                   test_size=0.3,
                                                   random_state=64)

# PIPELINE - A WAY TO STORE THE DATA-PREPARATION PIPELINE
pipe = Pipeline([
    ('bow', CV(analyzer=text_process)),  # COUNT VECTORIZER
    ('tfidf', TT()),                     # TFIDF TRANSFORMER
    ('classifier', MNB())
])

# FIT AND PREDICT - BUSINESS AS USUAL
pipe.fit(msg_train, label_train)
pred_pipe = pipe.predict(msg_test)
rate = np.mean(pred_pipe == label_test)
print("Rate: {}\n".format(rate))
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS, KFold
from sklearn.metrics import mean_squared_error as mse
import xgboost
from xgboost import XGBRegressor, DMatrix
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Load the Boston housing dataset
data = load_boston()
X = data.data
y = data.target

# Split into train and test sets
x_train, x_test, y_train, y_test = TTS(X, y, test_size=0.3)

# Initialize the XGBoost regressor and train it
xgbr = XGBRegressor(n_estimators=100).fit(X=x_train, y=y_train)

# Predict on the test set with the trained model
y_test_pre = xgbr.predict(x_test)
print('r2: ', xgbr.score(x_test, y_test))
print('mse: ', mse(y_test_pre, y_test))

# Plot the learning curve (with seaborn styling)
xgbr = XGBRegressor(n_estimators=100)
cv = KFold(n_splits=5, shuffle=True, random_state=23)
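# A hedged sketch of the learning-curve computation the KFold above is set
# up for, using sklearn's learning_curve; the plotting details are assumed:
from sklearn.model_selection import learning_curve
sizes, train_scores, test_scores = learning_curve(xgbr, X, y, cv=cv)
plt.plot(sizes, train_scores.mean(axis=1), label='train')
plt.plot(sizes, test_scores.mean(axis=1), label='test')
plt.legend()
plt.show()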
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
    'Group', ['A', 'B', 'C', 'D'])
# age_bucket = tf.feature_column.bucketized_column('Age',boundaries=[20,30,40,50,60,70,80])
age_column = tf.feature_column.numeric_column('Age')
age_bucket = tf.feature_column.bucketized_column(
    age_column, boundaries=[20, 30, 40, 50, 60, 70, 80])

# ADD THE SPECIAL CASES TO THE LIST
fcols.append(assigned_group)
fcols.append(age_bucket)

# TRAIN TEST SPLIT
X_data = dia.drop('Class', axis=1)
y_data = dia['Class']
X_train, X_test, y_train, y_test = TTS(X_data, y_data, test_size=0.3,
                                       random_state=101)

# INPUT FUNCTION
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train,
                                                 batch_size=10,
                                                 num_epochs=1000,
                                                 shuffle=True,
                                                 num_threads=8)

# ESTIMATOR MODEL
model = tf.estimator.LinearClassifier(feature_columns=fcols, n_classes=2)

# TRAINING
model.train(input_fn=input_func, steps=1000)
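# A hedged sketch of the matching evaluation step (same TF1 estimator API
# and pandas_input_fn pattern as above; batch size arbitrary):
eval_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test,
                                                batch_size=10,
                                                num_epochs=1,
                                                shuffle=False)
print(model.evaluate(input_fn=eval_func))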
# (first statement reconstructed from a truncated source line, mirroring
# the categoric_columns selection below)
numeric_columns = train.drop(columns='Survived').select_dtypes(
    exclude=['object']).columns
categoric_columns = train.drop(columns='Survived').select_dtypes(
    include=['object']).columns

# Add KMeans cluster features to both train and test
df_km = KMeans_Feature().fit(train[numeric_columns]).transform(
    train[numeric_columns])
train = pd.concat([train, df_km], axis=1, sort=True)
df_km = KMeans_Feature().fit(test[numeric_columns]).transform(
    test[numeric_columns])
test = pd.concat([test, df_km], axis=1, sort=True)

y = train.Survived
x = train.drop(columns=['Survived'])
xtrain, xval, ytrain, yval = TTS(x, y, test_size=0.3, random_state=42,
                                 stratify=y)

categoric_transformer = Pipeline(steps=[('MCA', MCA(n_components=2))])
preprocessor = ColumnTransformer(transformers=[('cat',
                                                categoric_transformer,
                                                categoric_columns)])
pipe = Pipeline(
    steps=[('preprocessor', preprocessor),
           ('Scaler', StandardScaler()),
           ('PCA', KernelPCA(n_components=4, kernel='rbf')),
           ('XGB', xgb.XGBClassifier())])

RSCparameter = {
data[i] = data[i].fillna(data[i].mode()[0])
sns.heatmap(data, cmap='viridis')
data.isnull().any(axis=0)

######## Solution for Part 1 ########
# Separating dependent and independent variables as per the problem statement
fe = data[[
    'BIO_SEX', 'age', 'WHITE', 'BLACK', 'HISPANIC', 'NAMERICAN', 'ASIAN',
    'ALCEVR1', 'ALCPROBS1', 'marever1', 'cocever1', 'inhever1', 'cigavail',
    'DEP1', 'ESTEEM1'
]].values
la = data["TREG1"].values

# Splitting the data into train and test
ftr, fte, ltr, lte = TTS(fe, la, test_size=.2, random_state=0)

# Applying DecisionTreeClassifier
classi = DecisionTreeClassifier(criterion="entropy", random_state=0)
classi.fit(ftr, ltr)
pred = classi.predict(fte)

# Building the confusion matrix (confusion_matrix expects (y_true, y_pred))
cm = confusion_matrix(lte, pred)

# Getting the accuracy score of the model
acc_model_part1 = accuracy_score(lte, pred)
print("Accuracy Score of the Model part 1 : " +
      str(round(acc_model_part1 * 100, 2)) + "%")
# from sklearn.metrics import confusion_matrix as CM
plt.style.use('ggplot')

# READ THE DATA
cancer = load_breast_cancer()
df = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])
print(df.info())

# EXPLORATION
# sns.pairplot(df)
# plt.show()

# SPLIT
X_train, X_test, y_train, y_test = TTS(df, cancer['target'],
                                       test_size=0.3, random_state=64)

# MODEL FITTING
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(CR(y_test, pred))
print(CM(y_test, pred))  # VERY BAD!!!

# GRID SEARCH - TRIAL AND ERROR ON VARIOUS MODEL PARAMETERS
param_grid = {
    'C': list(np.arange(50000, 500000, 10000)),
    'gamma': list(np.arange(0.00000001, 0.000005, 0.0000005))
}
""" import numpy as np import pandas as pd from sklearn.model_selection import train_test_split as TTS from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error cancer_df = pd.read_csv( "http://www.stat.cmu.edu/~ryantibs/statcomp/data/pros.dat", delimiter=" ") label = cancer_df['lpsa'].values feature = cancer_df.iloc[:, :-1].values x_train, x_test, y_train, y_test = TTS(feature, label, test_size=0.20, random_state=0) lr_a1 = LinearRegression() lr_a1.fit(x_train, y_train) y_pred = lr_a1.predict(x_test) Compare_pred = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}) MSE = mean_squared_error(y_test, y_pred) score_train = lr_a1.score(x_train, y_train) score_test = lr_a1.score(x_test, y_test) ## Regularized model from sklearn.linear_model import Lasso, Ridge, ElasticNet lr_la = Lasso()
# Finding the car whose mpg value is maximum
car = df.iloc[df['mpg'].argmax(), -1]
print("car with maximum mpg is", car)

# Replacing missing values in the horsepower column, which appear as '?'
# (pd.to_numeric replaces the removed DataFrame.convert_objects)
df['horsepower'] = df['horsepower'].replace('?', df['horsepower'].mode()[0])
df['horsepower'] = pd.to_numeric(df['horsepower'])

features = df.iloc[:, 1:-1].values
labels = df.iloc[:, 0:1].values

# Train/test split
from sklearn.model_selection import train_test_split as TTS
features_train, features_test, labels_train, labels_test = TTS(
    features, labels, test_size=0.2, random_state=0)

# Scaling with StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

# Also scaling the query point to predict
import numpy as np
pred_1 = sc.transform(
    np.array([6, 215, 100, 2630, 22.2, 80, 3]).reshape(1, -1))

# Using a decision tree
from sklearn.tree import DecisionTreeRegressor
# Removing NaN values with the most frequent value of each column
for i in data:
    data[i] = data[i].fillna(data[i].mode()[0])

#--------------------------Solution part1-------------------------------------
# Separating dependent and independent variables as per the problem statement
features = data[[
    'BIO_SEX', 'age', 'WHITE', 'BLACK', 'HISPANIC', 'NAMERICAN', 'ASIAN',
    'ALCEVR1', 'ALCPROBS1', 'marever1', 'cocever1', 'inhever1', 'cigavail',
    'DEP1', 'ESTEEM1'
]].values
labels = data["TREG1"].values

# Splitting the data into train and test
features_train, features_test, labels_train, labels_test = TTS(
    features, labels, test_size=.2, random_state=0)

# Applying DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier.fit(features_train, labels_train)
pred = classifier.predict(features_test)
pd.DataFrame(pred, labels_test)
# confusion_matrix expects (y_true, y_pred)
cm = confusion_matrix(labels_test, pred)
accuracy = accuracy_score(labels_test, pred)

#--------------------------Solution part2-----------------------------------
# Separating dependent and independent variables as per the problem statement
features2 = data[["BIO_SEX", "VIOL1"]].values
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import train_test_split as TTS
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import RFE
# %matplotlib inline
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

## RANDOM FOREST MODEL

### DATA PRE-PROCESSING
X_train, X_test, y_train, y_test = TTS(labels_X, class_y, test_size=0.10,
                                       shuffle=True, random_state=2000)

### EXTRACTING THE BEST FEATURES FROM THE DATASET
rfe = RFE(estimator=RFC(n_estimators=1000, max_depth=30, random_state=100,
                        max_leaf_nodes=1000),
          step=2)
rfe = rfe.fit(X_train, y_train)
sel_features = pd.DataFrame({
    "Feature Name": list(X_train.columns),
    # ... (remaining columns truncated in the source)
})
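# A hedged sketch of the ROC step the roc_curve/auc imports above point to
# (binary labels assumed; predict_proba is delegated to the fitted RFC):
probs = rfe.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, probs)
plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
plt.legend()
plt.show()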