Example #1
def predict():
    """Importing the data file"""
    df = pd.read_csv('refined.csv')

    # Features and Labels
    features = df.iloc[:10000, 0]
    labels = df.iloc[:10000, 1]
    # Extract Feature With CountVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer()
    features = cv.fit_transform(features.values.astype('U')).toarray()
    """forming train test split"""
    from sklearn.model_selection import train_test_split as TTS
    f_train, f_test, l_train, l_test = TTS(features,
                                           labels,
                                           random_state=1,
                                           test_size=0.33)

    # Training a MultinomialNB classifier
    del features, labels, df
    from sklearn.naive_bayes import MultinomialNB
    MNB = MultinomialNB()
    MNB.fit(f_train, l_train)

    l_pred = MNB.predict(f_test)

    from sklearn.metrics import classification_report
    print(classification_report(l_test, l_pred))

    if request.method == 'POST':
        message = request.form['message']
        data = [message]
        vect = cv.transform(data).toarray()
        my_prediction = MNB.predict(vect)
        return render_template('result.html', prediction=my_prediction)
Example #2
async def prediction(algo: str, number: float):
    if number > 1 or number < 0:
        return " You must choose between: KNN, RF (RandomForest) or GB (GradientBoosting), plus a train/test split proportion (train_size) between 0 and 1."

    X_tr, X_te, Y_tr, Y_te = TTS(
        X,
        Y,  # features, target
        stratify=Y,  # keep the class proportions the same in the train and test splits
        random_state=777,  # fix the seed so repeated runs give the same split
        train_size=number)  # proportion of the data used for training

    if algo == 'KNN':
        scoreTr, scoreTe = kn(number, X_tr, X_te, Y_tr, Y_te)
        result = " Pour l\'algorithme %s, on à un score de %f d'accuracy en train et %f d'accuracy en test. " % (
            algo, scoreTr, scoreTe)
    elif algo == 'RF':
        scoreTr, scoreTe = rf(number, X_tr, X_te, Y_tr, Y_te)
        result = " Pour l\'algorithme %s, on à un score de %f d'accuracy en train et %f d'accuracy en test. " % (
            algo, scoreTr, scoreTe)
    elif algo == 'GB':
        scoreTr, scoreTe = gb(number, X_tr, X_te, Y_tr, Y_te)
        result = " Pour l\'algorithme %s, on à un score de %f d'accuracy en train et %f d'accuracy en test. " % (
            algo, scoreTr, scoreTe)
    else:
        result = " Vous devez choisir entre : KNN, RF (RandomForest) ou GB (GradientBoosting) en plus du pourcentage de splitting entre le train et le test (train_size) entre 1 et 0."
    return result
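The helpers kn, rf and gb called above are not shown in this example. As a rough idea of what they are assumed to return, here is a minimal, hypothetical sketch of kn (the signature simply mirrors the call above; KNeighborsClassifier with default settings is an assumption):

from sklearn.neighbors import KNeighborsClassifier

def kn(number, X_tr, X_te, Y_tr, Y_te):
    # `number` (the train_size) is accepted only to match the call above; it is not used here
    model = KNeighborsClassifier()
    model.fit(X_tr, Y_tr)
    # return (train accuracy, test accuracy), since the caller unpacks scoreTr, scoreTe
    return model.score(X_tr, Y_tr), model.score(X_te, Y_te)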
Example #3
def main():
    fileName = sys.argv[1]
    ourChords = pd.read_csv(fileName)  # load our dataset
    theirChords = pd.read_csv("theirChords.csv")
    dataSet = pd.concat([ourChords, theirChords], sort=False)  # concatenate the two datasets
    dataSet = dataSet.sample(frac=1).reset_index(drop=True)  #shuffle dataset

    #split dataset into chords and labels
    X = dataSet.iloc[:, 1:16]
    Y = dataSet.iloc[:, 0]
    print(X)
    print(Y)
    # encode labels as 0s and 1s
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(Y)

    #one hot encode everything
    X = pd.get_dummies(X)

    #split into train and test sets
    X_train, X_test, Y_train, Y_test = TTS(X, Y, test_size=0.20)

    #train the set
    mlp = MLPClassifier(hidden_layer_sizes=(16, 16, 16), max_iter=1000)
    mlp.fit(X_train, Y_train)

    #make predictions
    predictions = mlp.predict(X_test)

    #print analysis
    print(confusion_matrix(Y_test, predictions))
    print(classification_report(Y_test, predictions))
Example #4
    def __two_splits(self, *data, **options):
        """
        Split data in train-test and validation datasets
        """
        test_size = options['test_size']

        validation_size = 1 - options['train_size'] - test_size

        options['test_size'] = None

        # First split
        X_train, X_test, y_train, y_test = TTS(*data, **options)

        options['train_size'] = test_size / (test_size + validation_size)

        # Second split
        X_test, X_validation, y_test, y_validation = TTS(
            X_test, y_test, **options)

        return [X_train, X_test, X_validation, y_train, y_test, y_validation]
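For readers unfamiliar with the two-step arithmetic above, here is a minimal standalone sketch of the same logic with concrete numbers (the 60/20/20 proportions and the toy arrays are assumptions, not part of the original class):

import numpy as np
from sklearn.model_selection import train_test_split as TTS

X = np.arange(100).reshape(50, 2)
y = np.arange(50)

train_size, test_size = 0.6, 0.2
validation_size = 1 - train_size - test_size  # 0.2

# First split: carve off the training set; the remainder holds test + validation
X_train, X_rest, y_train, y_rest = TTS(X, y, train_size=train_size)

# Second split: test_size / (test_size + validation_size) of the remainder goes to the test set
X_test, X_validation, y_test, y_validation = TTS(
    X_rest, y_rest, train_size=test_size / (test_size + validation_size))

print(len(X_train), len(X_test), len(X_validation))  # 30 10 10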
Example #5
def xgboost_demo():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)    # n_estimators = number of trees to build
    reg.predict(Xtest)

    print(reg.score(Xtest, Ytest))
    print(MSE(Ytest, reg.predict(Xtest)))    # mean squared error on the test set

    print(reg.feature_importances_)    # contribution of each feature
    print(CVS(reg, Xtrain, Ytrain, cv=5).mean())    # mean cross-validation score
Example #6
def Model(features, labels):
    # Applying OneHotEncoding
    from sklearn.preprocessing import OneHotEncoder

    col_to_ohe = [6, 7]  # Columns to be OneHotEncoded
    ohe = OneHotEncoder(categorical_features=col_to_ohe)
    features = ohe.fit_transform(features).toarray()

    # Getting indexes for the columns to be dropped, to avoid dummy variable trap
    total_col, indexes = 0, []
    for col in col_to_ohe:
        # `dataset` here refers to the original, un-encoded DataFrame from the enclosing scope
        unique_val_count = len(dataset.iloc[:, col].value_counts())
        total_col += unique_val_count
        indexes.append(total_col - unique_val_count)

    # Dropping the dummy variable trap columns
    features = np.delete(features, indexes, axis=1)

    # Splitting the dataset into train and test
    from sklearn.model_selection import train_test_split as TTS

    f_train, f_test, l_train, l_test = TTS(features,
                                           labels,
                                           test_size=0.25,
                                           random_state=0)

    # Logistic Regression Model
    from sklearn.linear_model import LogisticRegression
    reg = LogisticRegression(random_state=0)
    reg = reg.fit(f_train, l_train)

    pred = reg.predict(f_test)  # Prediction on test data

    # np.array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 25, 3, 1, 4, 16]).reshape(1,-1)
    # Preprocessing the new individual's data
    val = np.array([3, 25, 3, 1, 4, 16, 4, 2]).reshape(1, -1)
    val = ohe.transform(val).toarray()
    val = np.delete(val, indexes, axis=1)

    val_pred = reg.predict_proba(val)  # Predicting Individual's value

    # Confusion Matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(l_test, pred)

    # check the accuracy on the Model
    mod_score = reg.score(f_test, l_test)

    return pred, val_pred, cm, mod_score
Example #7
def singleExperiment(cfg):
    """
    Can only run with access to config variable
    """
    # make data
    X, y = make_classification(n_samples=cfg['n_samples'],
                               n_features=cfg['n_features'])
    # train test split
    X_train, X_test, y_train, y_test = TTS(X, y, test_size=.2)
    # set up RecurrentForest model
    rec_fst_clf = RF.RecurrentForest(X_train,
                             y_train,
                             cfg['T'],
                             cfg['n_trees'],
                             cfg['p_connect'],
                             cfg['p_feature'],
                             cfg['p_example'],
                             cfg['tree_kwargs'])
    # set up RandomForest
    rnd_fst_clf = RFC(**cfg['random_forest_kwargs'])
    # set up AdaBoost
    ada_bst_clf = ABC(**cfg['ada_boost_kwargs'])
    # in a list
    models = [rec_fst_clf,
              rnd_fst_clf,
              ada_bst_clf]
    
    print("<<< training models >>>")
    for m in tqdm(models):
        m.fit(X_train, y_train) # RecurrentForest ignores args - data present at init

    print("<<< testing models >>>")
    y_hats = np.zeros((3, X_test.shape[0]))
    for i, m in tqdm(enumerate(models)):
        if i == 0:
            y_hats[i, :] = m.predictNew(X_test)
        else:
            y_hats[i, :] = m.predict(X_test)

    # get metrics
    measures = np.zeros((3, 4))
    for i in tqdm(range(3)):
        measures[i,:] = M.binary_metrics(y_test, y_hats[i,:], model=str(models[i]))

    return measures
Example #8
def draw_curve():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []    # 1 minus the bias
    var = []    # record the variance
    ge = []    # controllable part of the generalization error
    for i in axisx:
        reg = XGBR(n_estimators=i)  # number of trees to build
        # By convention, higher scores are better, hence scoring='neg_mean_squared_error'
        rs.append(CVS(reg, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean())

    print(axisx[rs.index(max(rs))], max(rs))
    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='XGB')
    plt.legend()
    plt.show()
Example #9
def draw_curve_2():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []
    var = []
    ge = []
    for i in axisx:
        reg = XGBR(n_estimators=i, random_state=420)  # number of trees to build
        cvresult = CVS(reg, Xtrain, Ytrain, cv=5)    # 5-fold cross-validation
        rs.append(cvresult.mean())  # 1 minus the bias
        var.append(cvresult.var())  # record the variance
        ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())  # controllable part of the generalization error

    # print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
    # When the controllable part of the generalization error is smallest, print R^2 and the generalization error
    print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
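The function above only prints the point that minimizes the controllable generalization error. Continuing from the variables computed inside draw_curve_2, a short, assumed plotting step (matplotlib.pyplot is taken to be imported as plt, as in Example #8) could visualize it:

    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='mean CV R^2')
    best_n = axisx[ge.index(min(ge))]                  # n_estimators with the smallest generalization error
    plt.axvline(best_n, c='gray', linestyle='--', label='min generalization error')
    plt.legend()
    plt.show()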
Example #10
features_train_data = onehotencoder.fit_transform(features_train_data).toarray()

features_test_data = onehotencoder.transform(features_test_data).toarray()

import numpy as np

features_train_data = np.append(arr=features,
                                values=features_train_data,
                                axis=1)
features_test_data = np.append(arr=features2,
                               values=features_test_data,
                               axis=1)

from sklearn.model_selection import train_test_split as TTS
f_train, f_test, l_train, l_test = TTS(features_train_data,
                                       labels_train_data,
                                       test_size=0.3,
                                       random_state=0)

model_scores = {}

from sklearn.decomposition import PCA
pca = PCA(n_components=10)
f_train = pca.fit_transform(f_train)
f_test = pca.transform(f_test)
explained_variance = pca.explained_variance_ratio_

from sklearn.metrics import accuracy_score

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
Example #11
datatotal=pd.read_csv("Chemistry_f.csv")
X=datatotal.drop("Y",axis=1)
X=preprocessing.scale(X)
y=datatotal.iloc[:,26]

importances=np.loadtxt("importance1.txt")
im=np.sort(importances)[::-1]
arg=np.argsort(importances)[::-1]


i=20
t=arg[:i]
X=X[:,np.array(t)]
X=preprocessing.scale(X)
y=datatotal.iloc[:,26]
X_train,X_test,Y_train,Y_test=TTS(X,y,test_size=0.2,random_state=seed)

x=np.arange(-1,2,step=0.001)
y=x


###Xgboost
fig = plt.figure(figsize=(8,16))
ax = fig.subplots(3,2)
reg=XGBR(silent=True,n_estimators=200,max_depth=3,learning_rate=0.26,reg_lambda=0.09).fit(X_train,Y_train)
xgb_pre=reg.predict(X_test)
xgb_pre_tr=reg.predict(X_train)
xgb_avg=CVS(reg,X_train,Y_train,scoring="neg_mean_absolute_error",cv=5).mean()
xgb_mse=metrics.mean_squared_error(Y_test,xgb_pre)
xgb_r2=metrics.explained_variance_score(Y_test,xgb_pre)
xgb_mae=metrics.mean_absolute_error(Y_test,xgb_pre)
Example #12
features = df.iloc[:, :-1].values
labels = df.iloc[:, -1:].values

from sklearn.preprocessing import Imputer as ip
imp = ip(missing_values='NaN', strategy="median", axis=0)
imp = imp.fit(features[:, 1:2])

features[:, 1:2] = imp.transform(features[:, 1:2])

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le = le.fit(features[:, 0])
features[:, 0] = le.transform(features[:, 0])

from sklearn.preprocessing import OneHotEncoder as OHE
ohe = OHE(categorical_features=[0])
features = ohe.fit_transform(features).toarray()

labels = le.fit_transform(labels)

from sklearn.model_selection import train_test_split as TTS
x_train, x_test, y_train, y_test = TTS(features,
                                       labels,
                                       test_size=0.4,
                                       random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
Example #13
# In[29]:

# Drop all rows containing NaN values

X = X.dropna(how='any')

# In[30]:

X.shape, Y.shape

# In[31]:

X_tr, X_te, Y_tr, Y_te = TTS(
    X,
    Y,  # features, target
    stratify=Y,  # keep the class proportions the same in the train and test splits
    random_state=777,  # fix the seed so repeated runs give the same split
    train_size=0.8)  # 80% of the data goes to X_tr/Y_tr, the remaining 20% to X_te/Y_te

# In[60]:

knn = KNN(n_neighbors=21, weights='uniform', leaf_size=3)
knn.fit(X_tr, Y_tr)
train_preds = knn.predict(X_tr)
predictions = knn.predict(X_te)
knnTr = accuracy(train_preds, Y_tr)
knnTe = accuracy(predictions, Y_te)

# In[76]:
Example #14
# Importing the dataset
dataset=pd.read_csv('50_Startups.csv')
X=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values


#Categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ctX = ColumnTransformer([("State", OneHotEncoder(), [3])], remainder = 'passthrough')
X = np.array(ctX.fit_transform(X),dtype=float)[:,1:]


#Splitting in test and training
from sklearn.model_selection import train_test_split as TTS
X_train,X_test,y_train,y_test= TTS(X,y, test_size=0.2,random_state=0)


#Train
from sklearn.linear_model import LinearRegression
regressor= LinearRegression()
regressor.fit(X_train, y_train)

# Predict
y_pred= regressor.predict(X_test)


#Backward elimination
import statsmodels.api as sm
X= np.append(arr= np.ones((50,1),float),values=X,axis=1)
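The snippet is cut off right after the intercept column is added. A minimal sketch of one backward-elimination step with statsmodels might look like this (the column subset is an assumption; the idea is to refit after dropping the predictor with the highest p-value):

X_opt = X[:, [0, 1, 2, 3, 4, 5]]          # start with all columns (index 0 is the intercept)
ols = sm.OLS(endog=y, exog=X_opt).fit()   # fit an ordinary least squares model
print(ols.summary())                      # inspect p-values
# drop the column with the highest p-value above the chosen significance level, then refit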
Example #15
df = pd.read_csv(dir+'KNN_Project_Data',index_col=0)
print(df.head())

# PLOTTING
# sns.pairplot(df)
# plt.show()

# SCALING
scaler = SS()
scaler.fit(df.drop('TARGET CLASS',axis=1))
scaled = scaler.transform(df.drop('TARGET CLASS',axis=1))
df_scale = pd.DataFrame(scaled,columns=df.columns[:-1])
print(df_scale.head())

# SPLIT DATA INTO TRAINING AND TESTING
X_train,X_test,y_train,y_test = TTS(df_scale,df['TARGET CLASS'],test_size=0.3,random_state=101)

# KNN
model = KNC(n_neighbors=1)
model.fit(X_train,y_train)
pred = model.predict(X_test)

print(CR(y_test,pred))
print(CM(y_test,pred))

# CHOOSE K VALUE (ELBOW METHOD)
error_rate = []

for i in range(1,40):
	model = KNC(n_neighbors=i)
	model.fit(X_train,y_train)
	pred_i = model.predict(X_test)
	error_rate.append(np.mean(pred_i != y_test))
Example #16
##Make dictionary to store model importances
print_message("Working on RFC")

model_importances = {}
for feature in list(clinical_columns) + list(micro_bio_colums):
    model_importances[feature] = 0

##Store AUC for CV
auc_validations = {}
for k in np.unique(y_true):
    auc_validations[k] = []

##Repeated random train/test splits (20 rounds)
for T in range(20):
    ##Train-Test Split
    X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.25)

    ##Train the Model
    # print("Building RFC model\n")
    rfc = RFC(n_estimators=30, n_jobs=5)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict_proba(X_test)
    y_pred_b = rfc.predict(X_test)

    ##Create encoding for AUC
    y_test_e = label_binarize(y_test, classes=np.unique(y_test))

    # Compute ROC Curve and AUC for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
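The example is truncated before the per-class curves are actually computed. A minimal sketch of that step, assuming roc_curve and auc come from sklearn.metrics and that the columns of y_pred (the predict_proba output) follow np.unique(y_test):

    from sklearn.metrics import roc_curve, auc

    # note: for a binary target, label_binarize returns a single column
    for i, k in enumerate(np.unique(y_test)):
        fpr[k], tpr[k], _ = roc_curve(y_test_e[:, i], y_pred[:, i])
        roc_auc[k] = auc(fpr[k], tpr[k])
        auc_validations[k].append(roc_auc[k])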
Example #17
from xgboost import XGBRegressor as XGBR
import xgboost as xgb
from sklearn.model_selection import cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
import pickle

data = load_boston()

X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)  # predict via the sklearn-style interface
score = reg.score(Xtest, Ytest)  # which evaluation metric is returned here? (R^2 for regressors)
MSE(Ytest, reg.predict(Xtest))

dtrain = xgb.DMatrix(Xtrain, Ytrain)
param = {
    'silent': True,
    'obj': 'reg:linear',
    "subsample": 1,
    "eta": 0.05,
    "gamma": 20,
    "lambda": 3.5,
    "alpha": 0.2,
    "max_depth": 4,
    "colsample_bytree": 0.4,
    "colsample_bylevel": 0.6,
Example #18
from collections import Counter

# In[2]:

data_wine = load_wine()
data_wine.keys()

# In[3]:

X = pd.DataFrame(data_wine.data, columns=data_wine.feature_names)
y = pd.DataFrame(data_wine.target, columns=['flag'])
pd.concat([X.head(), y.head()]).shape

# In[4]:

x_train, x_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=1)

# In[5]:

dtc = tree.DecisionTreeClassifier(
    criterion='entropy',
    splitter='random',
    random_state=1,
)
dtc.fit(x_train, y_train)
dtc.score(x_test, y_test)

# In[6]:

plt.barh(
    X.columns,
    dtc.feature_importances_)
Example #19
plt.style.use('ggplot')

# GET THE DATA
iris = sns.load_dataset('iris')
print(iris.info())

# PLOTTING
# sns.pairplot(iris,hue='species')
# plt.show()

# sns.kdeplot(iris[['sepal_width','sepal_length']][iris['species'] == "setosa"])
# plt.show()

# SPLIT DATA
X_train, X_test, y_train, y_test = TTS(iris.drop('species', axis=1),
                                       iris['species'],
                                       test_size=0.3,
                                       random_state=101)

# TRAIN MODEL
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(CR(y_test, pred), CM(y_test, pred))
print(model)

# GRID SEARCH - "THIS IS NOT NECESSARY, THE MODEL IS PERFECT"
param_grid = {
    'C': list(np.arange(0.1, 10, 0.1)),
    'gamma': [1, 0.1, 0.001, 0.0001]
}
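The snippet ends with the parameter grid; a minimal sketch of how it would typically be fed to GridSearchCV (cv=5 and verbose=1 are assumptions):

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(SVC(), param_grid, cv=5, verbose=1)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(CR(y_test, grid.predict(X_test)))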
Example #20
# In[2]:

# Load the dataset (returned as a dict-like Bunch object)
data = load_boston()
data

# In[3]:

x = data.data
y = data.target
print(x.shape)
print(y.shape)

# In[4]:

xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

# In[5]:

reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)

# In[6]:

# Score on the test set; R^2 is returned by default
reg.score(xtest, ytest)

# In[7]:

# Mean squared error
MSE(ytest, reg.predict(xtest))
Example #21
# TFIDF TRANSFORMATION OF BOW TRANSFORMATION (THE COUNT VECTORIZED MESSAGES)
messages_tfidf = tt.transform(messages_bow)

# NAIVE BAYES
spam_detect_model = MNB().fit(messages_tfidf,df['label'])
pred4 = spam_detect_model.predict(tfidf4)[0]
print(pred4)

pred = spam_detect_model.predict(messages_tfidf)

rate = np.mean(pred == df['label'])
print("Rate: {}\n".format(rate))

# TRAIN AND TEST
msg_train,msg_test,label_train,label_test = TTS(df['msg'],df['label'],test_size=0.3,random_state=64)

# PIPELINE - A WAY TO STORE DATA PREPARATION PIPELINE
pipe = Pipeline([
	('bow',CV(analyzer=text_process)), # COUNT VECTORIZER
	('tfidf',TT()), # TFIDF TRANSFORMER
	('classifier',MNB())
])

# FIT AND PREDICT - BUSINESS AS USUAL
pipe.fit(msg_train,label_train)

pred_pipe = pipe.predict(msg_test)

rate = np.mean(pred_pipe == label_test)
print("Rate: {}\n".format(rate))
Example #22
from sklearn.metrics import mean_squared_error as mse
import xgboost
from xgboost import XGBRegressor, DMatrix
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Load the Boston housing dataset
data = load_boston()

X = data.data
y = data.target

# Split into training and test sets
x_train, x_test, y_train, y_test = TTS(X, y, test_size=0.3)

# Initialize the XGBoost regressor and train it
xgbr = XGBRegressor(n_estimators=100).fit(X=x_train, y=y_train)

# Predict on the test set with the trained model
y_test_pre = xgbr.predict(x_test)

print('r2: ', xgbr.score(x_test, y_test))
print('mse: ', mse(y_test_pre, y_test))

# Plot the learning curve
xgbr = XGBRegressor(n_estimators=100)
cv = KFold(n_splits=5, shuffle=True, random_state=23)

# Plot with seaborn
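The snippet stops at the seaborn plotting step. A minimal sketch of what it might look like, using sklearn's learning_curve with the KFold object defined above (the exact plot layout is an assumption):

from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(xgbr, X, y, cv=cv, n_jobs=-1)
lc = pd.DataFrame({'train_size': train_sizes,
                   'train_score': train_scores.mean(axis=1),
                   'cv_score': test_scores.mean(axis=1)})
sns.lineplot(data=lc.melt(id_vars='train_size'), x='train_size', y='value', hue='variable')
plt.show()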
Example #23
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
    'Group', ['A', 'B', 'C', 'D'])
# age_bucket = tf.feature_column.bucketized_column('Age',boundaries=[20,30,40,50,60,70,80])
age_column = tf.feature_column.numeric_column('Age')
age_bucket = tf.feature_column.bucketized_column(
    age_column, boundaries=[20, 30, 40, 50, 60, 70, 80])

# ADD THE SPECIAL CASES TO THE LIST
fcols.append(assigned_group)
fcols.append(age_bucket)

# TRAIN TEST SPLIT
X_data = dia.drop('Class', axis=1)
y_data = dia['Class']
X_train, X_test, y_train, y_test = TTS(X_data,
                                       y_data,
                                       test_size=0.3,
                                       random_state=101)

# INPUT FUNCTION
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,
                                                 y=y_train,
                                                 batch_size=10,
                                                 num_epochs=1000,
                                                 shuffle=True,
                                                 num_threads=8)

# ESTIMATOR MODEL
model = tf.estimator.LinearClassifier(feature_columns=fcols, n_classes=2)

# TRAINING
model.train(input_fn=input_func, steps=1000)
Example #24
numeric_columns = train.drop(columns='Survived').select_dtypes(
    exclude=['object']).columns
categoric_columns = train.drop(columns='Survived').select_dtypes(
    include=['object']).columns

df_km = KMeans_Feature().fit(train[numeric_columns]).transform(
    train[numeric_columns])
train = pd.concat([train, df_km], axis=1, sort=True)
df_km = KMeans_Feature().fit(test[numeric_columns]).transform(
    test[numeric_columns])
test = pd.concat([test, df_km], axis=1, sort=True)

y = train.Survived
x = train.drop(columns=['Survived'])
xtrain, xval, ytrain, yval = TTS(x,
                                 y,
                                 test_size=0.3,
                                 random_state=42,
                                 stratify=y)

categoric_transformer = Pipeline(steps=[('MCA', MCA(n_components=2))])

preprocessor = ColumnTransformer(transformers=[('cat', categoric_transformer,
                                                categoric_columns)])

pipe = Pipeline(
    steps=[('preprocessor', preprocessor), ('Scaler', StandardScaler(
    )), ('PCA',
         KernelPCA(n_components=4, kernel='rbf')), ('XGB',
                                                    xgb.XGBClassifier())])

RSCparameter = {
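The snippet stops at the opening brace of RSCparameter, presumably a search space for a randomized search over the pipeline. A minimal sketch of how that search might be completed; the parameter names and ranges below are assumptions, keyed to the pipeline steps with sklearn's step__param syntax:

from sklearn.model_selection import RandomizedSearchCV

RSCparameter = {                              # hypothetical search space
    'PCA__n_components': [2, 4, 6],
    'XGB__n_estimators': [100, 200, 400],
    'XGB__max_depth': [3, 5, 7],
}

search = RandomizedSearchCV(pipe, param_distributions=RSCparameter,
                            n_iter=10, cv=5, random_state=42)
search.fit(xtrain, ytrain)
print(search.best_params_)
print(search.score(xval, yval))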
Example #25
# Removing NaN values with the most frequent value of each column
for i in data:
    data[i] = data[i].fillna(data[i].mode()[0])

sns.heatmap(data, cmap='viridis')
data.isnull().any(axis=0)
######## Solution for Part 1 ########

# Separating Dependent and Independent variables as per Problem Statement
fe = data[[
    'BIO_SEX', 'age', 'WHITE', 'BLACK', 'HISPANIC', 'NAMERICAN', 'ASIAN',
    'ALCEVR1', 'ALCPROBS1', 'marever1', 'cocever1', 'inhever1', 'cigavail',
    'DEP1', 'ESTEEM1'
]].values
la = data["TREG1"].values

# Splitting the Data into Test and Train
ftr, fte, ltr, lte = TTS(fe, la, test_size=.2, random_state=0)

# Applying DecisionTreeClassifier
classi = DecisionTreeClassifier(criterion="entropy", random_state=0)
classi.fit(ftr, ltr)
pred = classi.predict(fte)

# Building Confusion Matrix
cm = confusion_matrix(lte, pred)

# Getting Accuracy Score of the Model
acc_model_part1 = accuracy_score(lte, pred)

print("Accuracy Score of the Model part 1 : " +
      str(round(acc_model_part1 * 100, 2)) + "%")
Example #26
# from sklearn.metrics import confusion_matrix as CM

plt.style.use('ggplot')

# READ THE DATA
cancer = load_breast_cancer()
df = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])
print(df.info())

# EXPLORATION
# sns.pairplot(df)
# plt.show()

# SPLIT
X_train, X_test, y_train, y_test = TTS(df,
                                       cancer['target'],
                                       test_size=0.3,
                                       random_state=64)

# MODEL FITTING
model = SVC()
model.fit(X_train, y_train)
pred = model.predict(X_test)

print(CR(y_test, pred))
print(CM(y_test, pred))
# VERY BAD!!!

# GRID SEARCH - TRIAL AND ERROR ON VARIOUS MODEL PARAMTERS
param_grid = {
    'C': list(np.arange(50000, 500000, 10000)),
    'gamma': list(np.arange(0.00000001, 0.000005, 0.0000005))
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as TTS
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

cancer_df = pd.read_csv(
    "http://www.stat.cmu.edu/~ryantibs/statcomp/data/pros.dat", delimiter=" ")
label = cancer_df['lpsa'].values
feature = cancer_df.iloc[:, :-1].values

x_train, x_test, y_train, y_test = TTS(feature,
                                       label,
                                       test_size=0.20,
                                       random_state=0)

lr_a1 = LinearRegression()
lr_a1.fit(x_train, y_train)

y_pred = lr_a1.predict(x_test)
Compare_pred = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
MSE = mean_squared_error(y_test, y_pred)

score_train = lr_a1.score(x_train, y_train)
score_test = lr_a1.score(x_test, y_test)

## Regularized model
from sklearn.linear_model import Lasso, Ridge, ElasticNet
lr_la = Lasso()
Example #28
#finding car whose mpg value is maximum
car = df.iloc[df['mpg'].argmax(), -1]
print("car with maximum mpg is", car)

#replacing missing values in the horsepower column, which appear as '?'
df['horsepower'][df['horsepower'] == '?'] = df['horsepower'].mode()[0]
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')

features = df.iloc[:, 1:-1].values
labels = df.iloc[:, 0:1].values

#train test and split
from sklearn.model_selection import train_test_split as TTS
features_train, features_test, labels_train, labels_test = TTS(features,
                                                               labels,
                                                               test_size=0.2,
                                                               random_state=0)

#scaling using StandardScaler
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

#also scaling the single observation we want to predict on
import numpy as np
pred_1 = sc.transform(
    np.array([6, 215, 100, 2630, 22.2, 80, 3]).reshape(1, -1))

#using decision tree
from sklearn.tree import DecisionTreeRegressor
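The example is cut off at the import. A minimal sketch of the remaining steps it appears to be building toward (the hyperparameters are assumptions):

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(features_train, labels_train)
print(regressor.score(features_test, labels_test))  # R^2 on the held-out split
print(regressor.predict(pred_1))                    # predicted mpg for the scaled single observation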
Example #29
# Removing NaN values with Most Frequent value of the column
for i in data:
    data[i] = data[i].fillna(data[i].mode()[0])

#--------------------------Solution part1-------------------------------------
# Separating Dependent and Independent variables as per Problem Statement
features = data[[
    'BIO_SEX', 'age', 'WHITE', 'BLACK', 'HISPANIC', 'NAMERICAN', 'ASIAN',
    'ALCEVR1', 'ALCPROBS1', 'marever1', 'cocever1', 'inhever1', 'cigavail',
    'DEP1', 'ESTEEM1'
]].values
labels = data["TREG1"].values

# Splitting the Data into Test and Train
features_train, features_test, labels_train, labels_test = TTS(features,
                                                               labels,
                                                               test_size=.2,
                                                               random_state=0)

# Applying DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier.fit(features_train, labels_train)
pred = classifier.predict(features_test)
pd.DataFrame(pred, labels_test)

cm = confusion_matrix(labels_test, pred)
accuracy = accuracy_score(labels_test, pred)

#--------------------------Solution part2-----------------------------------

# Separating Dependent and Independent variables as per Problem Statement
features2 = data[["BIO_SEX", "VIOL1"]].values
Example #30
from sklearn.metrics import f1_score as f1
from sklearn.model_selection import train_test_split as TTS
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
# %matplotlib inline
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

## RANDOM FOREST MODEL

### DATA PRE-PROCESSING

X_train, X_test, y_train, y_test = TTS(labels_X,
                                       class_y,
                                       test_size=0.10,
                                       shuffle=True,
                                       random_state=2000)

### EXTRACTING BEST FEATURES FROM DATASET

rfe = RFE(estimator=RFC(n_estimators=1000,
                        max_depth=30,
                        random_state=100,
                        max_leaf_nodes=1000),
          step=2)

rfe = rfe.fit(X_train, y_train)

sel_features = pd.DataFrame({
    "Feature Name": list(X_train.columns),