l1=LabelEncoder()
data1['sex']=l1.fit_transform(data['Sex'])

l2=LabelEncoder()
data1['embarked']=l2.fit_transform(data['Embarked'].astype(str))

data1 = data1.drop(['Sex','Embarked'],axis=1)

cater_cols=['Pclass','SibSp','Parch','sex','embarked']
contin_cols=['Age','Fare']
total_df=cater_cols+contin_cols

conti=pd.DataFrame(data1,columns=contin_cols)
cater=pd.DataFrame(data1,columns=cater_cols)

impca=Imputer(missing_values='NaN',strategy='most_frequent')
impca_out=impca.fit_transform(cater)

impca_df=pd.DataFrame(impca_out,columns=cater_cols)


impco=Imputer(missing_values='NaN',strategy='mean')
impo_out = impco.fit_transform(conti)
impco_df=pd.DataFrame(impo_out,columns=contin_cols)

total_df_data=pd.concat([impca_df,impco_df],axis=1)

sl1=StandardScaler()
sl2=sl1.fit_transform(total_df_data)
sl3=pd.DataFrame(sl2,columns=total_df)
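# Note: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22. A minimal
# sketch of the same preprocessing with the current API (sklearn.impute.SimpleImputer
# plus a ColumnTransformer), assuming data1 still contains the columns listed above:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

preprocess = ColumnTransformer([
    ('cat', SimpleImputer(strategy='most_frequent'), cater_cols),
    ('num', SimpleImputer(strategy='mean'), contin_cols),
])
total_imputed = pd.DataFrame(preprocess.fit_transform(data1[total_df]), columns=total_df)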
Example 2
def main():
    #*************************************************************************************
    #1.load data (training and test) and preprocessing data(replace NA,98,96,0(age) with NaN)
    #read data using pandas
    #replace 98, 96 with NAN for NOTime30-59,90,60-90
    #replace  0 with NAN for age
    #*************************************************************************************
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', \
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', \
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', \
                [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)

    dftrain = pd.read_csv("cs-training.csv", names=colnames, \
                          na_values=col_na_values, skiprows=[0])
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.values  # as_matrix() was removed in newer pandas

    dftest = pd.read_csv("cs-test.csv", names=colnames, \
                         na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.values

    #*************************************************************************************
    #2.split training data into training_new  and test_new (for validation model)
    # to keep the class ratio using StratifiedShuffleSplit to do the split
    #*************************************************************************************

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new

    #*****************************************************************************************
    #3.impute the data with imputer: replace MVs with Mean
    #*****************************************************************************************
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)

    #*****************************************************************************************
    #4.Build RF model using the training_new data:
    #   a. handle imbalanced data distribution by
    #      setting class_weight="balanced"/"balanced_subsample"
    #      n_samples / (n_classes * np.bincount(y))
    #*****************************************************************************************
    #  Initialize the model:
    #*****************************************************************************************
    rf = RandomForestClassifier(n_estimators=100, \
                                oob_score=True, \
                                min_samples_split=2, \
                                min_samples_leaf=50, \
                                n_jobs=-1, \
                                #class_weight="balanced",\
                                class_weight="balanced_subsample", \
                                bootstrap=True\
                                )
    #*************************************************************************************
    #   b. perform parameter tuning using grid search with CrossValidation
    #*************************************************************************************

    #param_grid={"max_features": [2,3,4,5],\
    #	 "min_samples_leaf": [30,40,50,100],\
    #	 "criterion": ["gini", "entropy"]}
    param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]}
    grid_search = GridSearchCV(rf,
                               cv=10,
                               scoring='roc_auc',
                               param_grid=param_grid,
                               iid=False)

    #*************************************************************************************
    #   c. output the best model and make predictions for test data
    #       - Use best parameter to build model with training_new data
    #*************************************************************************************
    grid_search.fit(x_train, y_train)
    print "the best parameter:", grid_search.best_params_
    print "the best score:", grid_search.best_score_
    #print "the parameters used:",grid_search.get_params

    #*************************************************************************************
    #   To see how well the model fits the training_new data
    #       - Use the trained model to make predictions for the train_new data
    #*************************************************************************************

    predicted_probs_train = grid_search.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)

    #*************************************************************************************
    #   To see how well the model performs on the test_new data
    #    - Use the trained model to make predictions for the validation data (test_new)
    #*************************************************************************************
    predicted_probs_test_new = grid_search.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)

    #*************************************************************************************
    #  use the model to predict for test and output submission file
    #*************************************************************************************
    predicted_probs_test = grid_search.predict_proba(x_test)
    predicted_probs_test = ["%.9f" % x[1] for x in predicted_probs_test]
    submission = pd.DataFrame({
        'ID': test_id,
        'Probabilities': predicted_probs_test
    })
    submission.to_csv("rf_benchmark.csv", index=False)
Example 3
class insan:
    boy = 180

    def kosmak(self, b):
        return b + 10


ali = insan()
print(ali.boy)
print(ali.kosmak(90))

# missing values
# scikit-learn
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

Yas = veriler.iloc[:, 1:4].values
print(Yas)
# Yas already holds columns 1-3 of veriler, so fit and transform the whole array
imputer = imputer.fit(Yas)
Yas = imputer.transform(Yas)
print(Yas)

ulke = veriler.iloc[:, 0:1].values
print(ulke)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ulke[:, 0] = le.fit_transform(ulke[:, 0])
print(ulke)
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features='all')
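# The snippet stops after building the encoder; a plausible continuation (an
# assumption, mirroring the other examples here) is to one-hot encode the
# label-encoded country column:
ulke = ohe.fit_transform(ulke).toarray()
print(ulke)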
Example 4
bestAUUC = 1000
bestPredictors = []
for j in range(len(predictorBank)):

    predictors = predictorBase + [predictorBank[j]]

    iterations = 50
    auucs = np.zeros(iterations)
    auucs2 = np.zeros(iterations)
    for i in range(iterations):
        df = df.sample(frac=1.)
        #df = df.sort_values(by = 'timesec')
        dfTrain = df[trainIndex]
        dfTest = df[testIndex]
        dfVal = df[valIndex]
        imputer = Imputer()
        scaler = StandardScaler()
        xTrain = imputer.fit_transform(dfTrain[predictorBase +
                                               predictorsExtra].values)
        xVal = imputer.transform(dfVal[predictorBase + predictorsExtra].values)
        xTest = imputer.transform(dfTest[predictorBase +
                                         predictorsExtra].values)
        xTrain = pd.DataFrame(scaler.fit_transform(xTrain),
                              columns=predictorBase + predictorsExtra)
        xVal = pd.DataFrame(scaler.transform(xVal),
                            columns=predictorBase + predictorsExtra)
        xTest = pd.DataFrame(scaler.transform(xTest),
                             columns=predictorBase + predictorsExtra)
        #        xTrainPoly = pieceFeature(dfTrain['surgical'].values, pieceFeature(dfTrain['icuatalert'].values, xTrain))
        #        xTestPoly = pieceFeature(dfTest['surgical'].values, pieceFeature(dfTest['icuatalert'].values, xTest))
Example 5
#plt.tight_layout(h_pad = 2.5)

#plt.show()

# Make a new dataframe for polynomial features
poly_features = app_train[[
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET'
]]
poly_features_test = app_test[[
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'
]]

# imputer is for handling missing values
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='median')

poly_target = poly_features['TARGET']

poly_features = poly_features.drop(columns=['TARGET'])

# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

from sklearn.preprocessing import PolynomialFeatures

# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree=3)
# Train the polynomial features
poly_transformer.fit(poly_features)
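# A sketch of the usual next step: apply the fitted transformer to both sets.
# (In older scikit-learn the generated column names were available through
# poly_transformer.get_feature_names; newer releases use get_feature_names_out.)
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial features shape:', poly_features.shape)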
Example 6
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("C:\\Users\\B!ade\\Downloads\\expdata.csv")

dataset_corr = dataset.corr()

dataset_final = dataset[[
    "Overall", "International Reputation", "Reactions", "Value", "Wage"
]]
dataset_final.info()

dataset_final = dataset_final.iloc[:, :].values

from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values="NaN", axis=0)
imputer = imputer.fit(dataset_final[:, :])
dataset_final[:, :] = imputer.transform(dataset_final[:, :])

X = dataset_final[:, 0:4]
y = dataset_final[:, 4]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
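# The snippet ends after scaling; a minimal follow-up (not part of the original,
# and assuming the selected columns are numeric) would be to fit a simple regressor:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
print('R^2 on the test set:', reg.score(X_test, y_test))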
Example 7
# ## Another Example
#
# Here are partial plots from a very simple model on the Titanic data.

# In[ ]:

titanic_data = pd.read_csv('../input/titanic/train.csv')
titanic_y = titanic_data.Survived
clf = GradientBoostingClassifier()
titanic_X_colns = [
    'PassengerId',
    'Age',
    'Fare',
]
titanic_X = titanic_data[titanic_X_colns]
my_imputer = Imputer()
imputed_titanic_X = my_imputer.fit_transform(titanic_X)

clf.fit(imputed_titanic_X, titanic_y)
titanic_plots = plot_partial_dependence(clf,
                                        features=[1, 2],
                                        X=imputed_titanic_X,
                                        feature_names=titanic_X_colns,
                                        grid_resolution=8)

# These might seem surprising at first glance.  But they show some interesting insights:
# * Being young increased your odds of survival. This is consistent with historical accounts that women and children were taken off the Titanic first.
# * People who paid more had better odds of survival.  It turns out that higher fares got you a cabin that was closer to the top of the boat, and may have given you better odds of getting a life-boat.
#
# # Conclusion
# Partial dependence plots are a great way (though not the only way) to extract insights from complex models.  These can be incredibly powerful for communicating those insights to colleagues or non-technical users.

# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Split using ALL data in sample_df
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[['numeric', 'with_missing', 'text']],
    pd.get_dummies(sample_df['label']),
    random_state=22)

# Create a FeatureUnion with nested pipeline: process_and_join_features
process_and_join_features = FeatureUnion(
    transformer_list=[('numeric_features',
                       Pipeline([('selector',
                                  get_numeric_data), ('imputer', Imputer())])),
                      ('text_features',
                       Pipeline([(
                           'selector',
                           get_text_data), ('vectorizer',
                                            CountVectorizer())]))])

# Instantiate nested pipeline: pl
pl = Pipeline([('union', process_and_join_features),
               ('clf', OneVsRestClassifier(LogisticRegression()))])

# Fit pl to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
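# get_numeric_data and get_text_data are used above but not defined in this
# excerpt; they are presumably FunctionTransformer selectors along these lines
# (an assumption based on the column names in sample_df):
from sklearn.preprocessing import FunctionTransformer

get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)
get_numeric_data = FunctionTransformer(
    lambda x: x[['numeric', 'with_missing']], validate=False)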
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR



dataset = pd.read_csv('E:/LINEARREGRESSION/Vijay/Titanic Dataset/INPUT/train.csv')
test_data = pd.read_csv('E:/LINEARREGRESSION/Vijay/Titanic Dataset/INPUT/test.csv')

y_train = dataset.iloc[:, 1].values
X_train = dataset.iloc[:, [2, 4, 5, 6]].values
X_test = test_data.iloc[:, [1, 3, 4, 5]].values

imp_mean = Imputer()
imp_mean = imp_mean.fit(X_train[:, 2:4])
X_train[:, 2:4] = imp_mean.transform(X_train[:, 2:4])

# apply the imputer fitted on the training data to the test set
X_test[:, 2:4] = imp_mean.transform(X_test[:, 2:4])

labelencoder_x = LabelEncoder()
X_train[:, 1] = labelencoder_x.fit_transform(X_train[:, 1].astype(str))

X_test[:, 1] = labelencoder_x.transform(X_test[:, 1].astype(str))


# #Grid Search   

# Random Forest Classifier
import numpy as np
import pandas as pd

# Making the splits
training = pd.read_csv('train.csv')
X_train = training.iloc[:, [2, 4, 5, 6, 7, 9]].values
y_train = training.iloc[:, 1].values
testing = pd.read_csv('test.csv')
X_test = testing.iloc[:, [1, 3, 4, 5, 6, 8]].values

# Reshaping to a matrix
X_train = X_train.reshape(-1, 6)
X_test = X_test.reshape(-1, 6)

# Filling in missing data
from sklearn.preprocessing import Imputer
train_imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)
train_imputer = train_imputer.fit(X_train[:, 2:3])
X_train[:, 2:3] = train_imputer.transform(X_train[:, 2:3])

test_imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)
test_imputer = test_imputer.fit(X_test[:, 2:6])
X_test[:, 2:6] = test_imputer.transform(X_test[:, 2:6])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X_train[:, 1] = labelencoder_X.fit_transform(X_train[:, 1])
X_test[:, 1] = labelencoder_X.transform(X_test[:, 1])

# Feature scaling
from sklearn.preprocessing import StandardScaler
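# The snippet is cut off at the import; presumably (an assumption, following the
# other Titanic examples above) the features are then scaled:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)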
Example 11
 def __init__(self):
     self.data = None
     self.X_train = None
     self.Y_train = None
     self.X_test = None
     self.Y_test = None
     self.clf = None
     category_binarizer = OnceFittedLabelBinarizer()
     country_binarizer = OnceFittedLabelBinarizer()
     state_binarizer = OnceFittedLabelBinarizer()
     self.category_mapper = DataFrameMapper([
         (['category_code'], [CategoricalImputer(), category_binarizer]),
         (['country_code'], [CategoricalImputer(), country_binarizer]),
         (['state_code'], [CategoricalImputer(), state_binarizer]),
     ])
     self.mapper = DataFrameMapper([
         (['category_code'], [CategoricalImputer(), category_binarizer], {
             'alias': 'category'
         }),
         (['country_code'], [CategoricalImputer(), country_binarizer], {
             'alias': 'country'
         }),
         (['state_code'], [CategoricalImputer(), state_binarizer], {
             'alias': 'state'
         }),
         (['mba_degree'], [ValueImputer(0),
                           StandardScaler()]),
         (['phd_degree'], [ValueImputer(0),
                           StandardScaler()]),
         (['ms_degree'], [ValueImputer(0),
                          StandardScaler()]),
         (['other_degree'], [ValueImputer(0)]),
         (['age'], [Imputer(), StandardScaler()]),
         (['offices'], [ValueImputer(1.0),
                        StandardScaler()]),
         (['products_number'], [ValueImputer(1.0),
                                StandardScaler()]),
         (['average_funded', 'average_participants'],
          [ParticipantsImputer(), StandardScaler()], {
              'alias': 'average_participants'
          }),
         (['total_rounds'], None),
         (['ipo'], None),
         (['is_closed'], None),
         (['total_rounds',
           'average_funded'], [FundImputer(),
                               StandardScaler()], {
                                   'alias': 'average_funded'
                               }),
         (['acquired_companies'], [ValueImputer(0)]),
     ])
     SVC_C_grid = [10**i for i in range(-3, 4)]
     SVC_gamma_grid = [10**i for i in range(-3, 1)] + ['auto']
     MLP_hidden_layer_sizes = [[25], [50], [75], [100], [50, 25], [75, 50],
                               [100, 75], [75, 50, 25], [100, 75, 50]]
     MLP_activation = ['logistic', 'tanh', 'relu']
     self.grid = [{
         'clf': [GradientBoostingClassifier()],
         'clf__n_estimators': [20 * i for i in range(5, 8)],
         'clf__max_depth': [i + 3 for i in range(2, 6)]
     }, {
         'clf': [SVC(kernel='rbf', class_weight='balanced')],
         'clf__C': SVC_C_grid,
         'clf__gamma': SVC_gamma_grid
     }, {
         'clf': [SVC(kernel='poly', class_weight='balanced')],
         'clf__C': SVC_C_grid,
         'clf__gamma': SVC_gamma_grid,
         'clf__degree': list(range(3, 6))
     }, {
         'clf': [MLPClassifier()],
         'clf__hidden_layer_sizes': MLP_hidden_layer_sizes,
         'clf__activation': MLP_activation,
         'clf__alpha': [10**i for i in range(-1, 3)]
     }]
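# A sketch (an assumption, not part of the original class) of how such a mapper
# and parameter grid are typically combined for model selection; the 'clf' step
# name matches the 'clf__...' keys used in the grid above:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

def fit_best_model(mapper, grid, X_train, Y_train):
    pipeline = Pipeline([('mapper', mapper), ('clf', GradientBoostingClassifier())])
    search = GridSearchCV(pipeline, grid, scoring='roc_auc', cv=5)
    search.fit(X_train, Y_train)
    return search.best_estimator_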
Example 12
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# read the dataset and split it into features X and target y
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, 0:3].values
y = dataset.iloc[:, -1].values

dataset.describe()  # basic summary statistics of the dataset

#removing null/NaN values#
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding the categorical values
from sklearn.preprocessing import LabelEncoder
#for country names in X#
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
#for yes/no in y#
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

#removing "Dumy variable trap" by creating a sparse matrix #
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X)
X = X.toarray()
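# To actually avoid the dummy variable trap, one of the one-hot columns is
# usually dropped before modelling; a short sketch (an assumption):
X = X[:, 1:]  # drop the first dummy column
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)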
        return self

    def transform(self, X, y=None):
        X = X.copy()

        for col in X.columns:
            X.loc[:, col] = X.loc[:, col].astype('category')

        return X


num_pipeline = Pipeline([
    ('WordToNum', ConvertWordToNum()),
    ('DtypeCV', DtypeConverter()),
    ('selector', DataFrameSelector(num_features)),
    ('Imputer', Imputer(strategy="median")),
    ('StdScaler', StandardScaler()),
])


def full_pipeline_encoder(X, X_train, X_test, y_train, y_test):
    X_train_num = pd.DataFrame(num_pipeline.fit_transform(X_train), index=X_train.index,
                               columns=X_train[num_features].columns)
    X_test_num = pd.DataFrame(num_pipeline.transform(X_test), index=X_test.index,
                              columns=X_test[num_features].columns)

    X_OHE = pd.get_dummies(X[cat_features])
    X_train_ohe = X_OHE.loc[X_train.index]
    X_test_ohe = X_OHE.loc[X_test.index]

    X_train = pd.concat([X_train_num, X_train_ohe], axis=1)
Example 14
print(" positive cases " + str((num_true / len(df['diabetes'])) * 100))
print(" Negative cases " + str((num_false / len(df['diabetes'])) * 100))

X = df.loc[:, df.columns != 'diabetes'].values
y = df['diabetes'].values
split_test_size = 0.30

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=split_test_size,
                                                    random_state=42)

print("{0:0.2f}% in training set ".format(
    (len(X_train) / len(df.index)) * 100))

fill_imputed = Imputer(missing_values=0, strategy="mean", axis=0)
X_train = fill_imputed.fit_transform(X_train)
X_test = fill_imputed.transform(X_test)  # reuse the training-set means

nb_model = GaussianNB()
nb_model.fit(X_train, y_train.ravel())
# print(X_test)
nb_predict_train = nb_model.predict(X_train)
# print(X_test)
# tryli = [];
# for tet in X_test:
#     tryli.append(tet[0])
#
# print("Score:" ,nb_model.score(X_test, y_test))
# plt.scatter(tryli,y_test, c=nb_predict_train);
# # plt.scatter()
Example 15
# -*-coding:utf8-*-

import numpy as np
import pandas as pd

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0)
dataset = pd.read_csv('dataset/data.csv')
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:, 3].values

imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
x = onehotencoder.fit_transform(x).toarray()
print(x)
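# categorical_features was removed from OneHotEncoder in newer scikit-learn
# releases; a sketch of the modern equivalent of the last three lines, using a
# ColumnTransformer on column 0:
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('country', OneHotEncoder(), [0])], remainder='passthrough')
x = ct.fit_transform(x)
print(x.shape)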
Example 16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        default=1.0,
                        help='inverse of L1 / L2 regularization')
    parser.add_argument('--l1', dest='l2', action='store_false')
    parser.add_argument('--l2', dest='l2', action='store_true')
    parser.set_defaults(l2=True)
    parser.add_argument('--period',
                        type=str,
                        default='all',
                        help='specifies which period extract features from',
                        choices=[
                            'first4days', 'first8days', 'last12hours',
                            'first25percent', 'first50percent', 'all'
                        ])
    parser.add_argument('--features',
                        type=str,
                        default='all',
                        help='specifies what features to extract',
                        choices=['all', 'len', 'all_but_len'])
    parser.add_argument('--data',
                        type=str,
                        help='Path to the data of in-hospital mortality task',
                        default=os.path.join(
                            os.path.dirname(__file__),
                            '../../../data/in-hospital-mortality/'))
    parser.add_argument(
        '--output_dir',
        type=str,
        help='Directory relative which all output files are stored',
        default='.')
    args = parser.parse_args()
    print(args)
    print("Path")
    print(os.path)
    train_reader = InHospitalMortalityReader(
        dataset_dir=
        "/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train",
        listfile=
        '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train/train_listfile.csv',
        period_length=48.0)

    val_reader = InHospitalMortalityReader(
        dataset_dir=
        '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train',
        listfile=
        '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train/train_listfile.csv',
        period_length=48.0)

    test_reader = InHospitalMortalityReader(
        dataset_dir=
        '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/test',
        listfile=
        '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/test/test_listfile.csv',
        period_length=48.0)

    print('Reading data and extracting features ...')
    (train_X, train_y,
     train_names) = read_and_extract_features(train_reader, args.period,
                                              args.features)
    (val_X, val_y,
     val_names) = read_and_extract_features(val_reader, args.period,
                                            args.features)
    (test_X, test_y,
     test_names) = read_and_extract_features(test_reader, args.period,
                                             args.features)
    print('  train data shape = {}'.format(train_X.shape))
    print('  validation data shape = {}'.format(val_X.shape))
    print('  test data shape = {}'.format(test_X.shape))
    print("---------------")
    print(train_X[0])

    print("---------------")
    print(train_names[0])
    print('Imputing missing values ...')
    imputer = Imputer(missing_values=np.nan,
                      strategy='mean',
                      axis=0,
                      verbose=0,
                      copy=True)
    imputer.fit(train_X)
    train_X = np.array(imputer.transform(train_X), dtype=np.float32)
    val_X = np.array(imputer.transform(val_X), dtype=np.float32)
    test_X = np.array(imputer.transform(test_X), dtype=np.float32)

    print('Normalizing the data to have zero mean and unit variance ...')
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    val_X = scaler.transform(val_X)
    test_X = scaler.transform(test_X)

    penalty = ('l2' if args.l2 else 'l1')
    file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty,
                                      args.C)

    decision_tree = DecisionTreeClassifier(random_state=42, max_depth=5)
    decision_tree.fit(train_X, train_y)

    result_dir = os.path.join(args.output_dir, 'results')
    common_utils.create_directory(result_dir)

    with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(train_y,
                                   decision_tree.predict_proba(train_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(val_y, decision_tree.predict_proba(val_X))
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    prediction = decision_tree.predict_proba(test_X)[:, 1]

    with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)),
              'w') as res_file:
        ret = print_metrics_binary(test_y, prediction)
        ret = {k: float(v) for k, v in ret.items()}
        json.dump(ret, res_file)

    save_results(
        test_names, prediction, test_y,
        os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
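# The --C and --l1/--l2 flags parsed above suggest this script started from a
# logistic-regression baseline; a minimal sketch of that variant (an assumption,
# reusing the imputed and scaled arrays built in main()):
def fit_logistic_baseline(train_X, train_y, C=1.0, penalty='l2'):
    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(penalty=penalty, C=C, solver='liblinear', random_state=42)
    logreg.fit(train_X, train_y)
    return logreg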
Example 17
    if value == 'N':
        return 0
    if value == 'Y':
        return 1
    if value == 'X':
        return 2


X1['md_trial'] = X1['md_trial'].apply(xyn_to_number)
X2['md_trial'] = X2['md_trial'].apply(xyn_to_number)
logging.debug("Normalized X1, X2")

X_all, y_all = pd.concat([X1, X2]), pd.concat([y1, y2])

logging.debug("Imputing & scaling X1, X2")
imputer = Imputer(missing_values='NaN')
X_all = imputer.fit_transform(X_all)
min_max_scaler = MinMaxScaler()
X_all = min_max_scaler.fit_transform(X_all)
logging.debug("Imputed & scaled X1, X2")

X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                    y_all,
                                                    test_size=0.30)

# Parameters
learning_rate = 0.000001
training_epochs = 2000
batch_size = 256
test_step = 10
Example 18
 def __init__(self):
     self.reg = make_pipeline(Imputer(strategy='median'),
                              ExtraTreesRegressor(n_estimators=10))
def model_training(data,
                   feat_key,
                   le,
                   remove_nan,
                   perc_train_size,
                   output_file,
                   model_file,
                   sov_encoder_file,
                   n_estimators=500,
                   min_samples_leaf=1):

    #import seaborn as sns
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import check_random_state
    from sklearn.externals import joblib
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import tree

    data_index = data.index  # keep data_index so the output can be written later
    y_ = np.array(data.pop('IssuerRating'))
    X_ = np.array(data[feat_key["Key"]])

    # Remove observations with no output
    ind_valid_out = [is_string(yi) for yi in y_]
    X = X_[ind_valid_out]
    y = y_[ind_valid_out]
    data_index = data_index[ind_valid_out]
    # Encode y values,
    y = np.array(
        [list(le.loc[yi])[0] if is_string(yi) else float('NaN') for yi in y])

    # Encode Sovereig rating
    sr = feat_key[feat_key["Key"] == 'SovereignRating']
    if len(sr) > 0:
        pos_sr = feat_key.index.get_loc(
            sr.index[0])  # Position sovereign rating
        pos_str = [is_string(x) for x in X[:, pos_sr]]
        labels = np.unique(X[pos_str, pos_sr])
        le_X = LabelEncoder()
        le_X.fit(labels)
        X[pos_str, pos_sr] = le_X.transform(X[pos_str, pos_sr])
        joblib.dump(le_X, sov_encoder_file)  # Save sovereign label encoder

    # Remove NaN
    if remove_nan:
        ind_not_na = [not np.isnan(np.sum(x)) for x in X]
        X = X[ind_not_na]
        y = y[ind_not_na]
        data_index = data_index[ind_not_na]
    else:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(X)  # fit on X itself; X_train is not defined yet at this point
        X = imp.transform(X)

    # Data permutation:
    random_state = check_random_state(0)
    permutation = random_state.permutation(X.shape[0])

    X = X[permutation]
    y = y[permutation]
    data_index = data_index[permutation]

    # Train and test samples:

    train_size = int(X.shape[0] * perc_train_size)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size,
                                                        shuffle=False)

    print('Muestra de entrenamiento: %d' % X_train.shape[0])
    print('Muestra de testing: %d' % X_test.shape[0])
    print('')

    # Model fitting:
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_features="auto",
                                 min_samples_leaf=min_samples_leaf)
    clf.fit(X_train, y_train)

    # Save model
    joblib.dump(clf, model_file)

    score = clf.score(X_test, y_test)
    print('Score sobre muestra de testing:')
    print(score)
    print('')

    # output file:

    pred_calif = np.array([
        le.iloc[x == list(le.iloc[:, 0]), 0].index[0]
        for x in clf.predict(X_test)
    ])
    y_test_calif = np.array(
        [le.iloc[x == list(le.iloc[:, 0]), 0].index[0] for x in y_test])

    if len(sr) > 0:
        X_test[:, pos_sr] = le_X.inverse_transform(
            X_test[:,
                   pos_sr].astype('int'))  # inverse transform of sovereign ratings

    data_test = pd.DataFrame(
        np.column_stack((np.column_stack((X_test, y_test_calif)), pred_calif)),
        columns=list(feat_key.index) + ['Rating Test', 'Rating Predicc'],
        index=data_index[np.arange(train_size, data_index.shape[0])])

    # Output file:
    data_test.to_csv(output_file)

    # Variables importances:
    importances = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print('')
    print("Ranking:")
    for f in range(X_train.shape[1]):
        print("%d. %s (%f)" %
              (f + 1, feat_key.index[indices[f]], importances[indices[f]]))

    # Plot importances:
    print('')
    plt.figure()
    plt.title("Importancias")
    plt.bar(range(X.shape[1]),
            importances[indices],
            color="r",
            yerr=std[indices],
            align="center")

    plt.xticks(range(X.shape[1]), np.arange(X.shape[1]) + 1)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()

    return (None)
def run_method():
    global db
    
    sets = file_all  # file_sets 
    
    mlp_para = list(range(2,21))
    rf_para = list(range(2,21))
    
    columns_cla = ["method", "parameter(# of trees or # of layers)", "evaluation", "value"]
    columns_regr = ["method", "parameter(# of layers)", "evaluation", "value"]
    df_load_cla = pd.DataFrame(columns=columns_cla)
    df_load_regr = pd.DataFrame(columns=columns_regr)
    df_perf_cla = pd.DataFrame(columns=columns_cla)
    df_perf_regr = pd.DataFrame(columns=columns_regr)
    
    for turn,file in enumerate(sets):
        db = pd.read_csv('../csv/' + file + '.csv')
        print('open file ','../csv/' + file + '.csv')
        
        # Preprocessing
        # Imputation of missing values
        db_values = db.values
        instance_db = db_values[:, 4:-4]
        for i in range(instance_db.shape[0]):
            for j in range(instance_db.shape[1]):
                # process the null
                if instance_db[i][j] == 'null':
                    # assign NaN to null for future process
                    instance_db[i][j] = 'NaN'
        # tackle column whose members are all NaN
        for j in range(instance_db.shape[1]):
            if list(instance_db[:,j]) == list(['NaN']*instance_db.shape[0]):
                instance_db[:,j] = [0]*instance_db.shape[0]
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=1)
        imp.fit(instance_db)
        instance_db = imp.transform(instance_db)
        # Normalization
        scaler = StandardScaler()
        scaler.fit(instance_db)
        instance_db = scaler.transform(instance_db)
        db_values[:, 4:-4] = instance_db
        db = pd.DataFrame(db_values,columns = db.columns)
        
        # target 
        target_db = db_values[:, -4:]
        
        # load scores as target
        scores_load = target_db[:, 0]
        
        # load levels as target
        levels_load = target_db[:, 1]
        
        # convert 'A'... to 0... 
        levels_load = [ord(x) - ord('A') for x in levels_load] 
        
        # feature selection disabled; use the full feature set
        # instance_data = db[select_load]
        instance_data = db[feature]
        
        # regression process
        for i in range(6):
            for para in mlp_para:
                mlp_result, y_test = regression(instance_data, scores_load, para)
                _mse, _R2 = regression_valuate(mlp_result, y_test)
                df_load_regr.loc[df_load_regr.shape[0]] = ["mlp", para, "mse", _mse]
                df_load_regr.loc[df_load_regr.shape[0]] = ["mlp", para, "R2", _R2]
        
        # classification process
        count = Counter(levels_load)
        balance = np.array([count[i] for i in count])/len(levels_load) > 0.1
        _select = len(set(balance)) > 1
        if len(set(levels_load)) > 1 and _select:
            for i in range(6):
                for para in rf_para:
                    rf_result, y_test = classification_rf(instance_data, levels_load, para)
                    p, r, f = precision_recall_fscore_support(y_test, rf_result, average = 'macro')[:3]
                    df_load_cla.loc[df_load_cla.shape[0]] = ["rf", para, "precision", p]
                    df_load_cla.loc[df_load_cla.shape[0]] = ["rf", para, "recall", r]
                    df_load_cla.loc[df_load_cla.shape[0]] = ["rf", para, "fscore", f]
                for para in mlp_para:
                    mlp_result, y_test = classification_mlpc(instance_data, levels_load, para)
                    p, r, f = precision_recall_fscore_support(y_test, mlp_result, average = 'macro')[:3]
                    df_load_cla.loc[df_load_cla.shape[0]] = ["mlpc", para, "precision", p]
                    df_load_cla.loc[df_load_cla.shape[0]] = ["mlpc", para, "recall", r]
                    df_load_cla.loc[df_load_cla.shape[0]] = ["mlpc", para, "fscore", f]
        
        # performance scores as target
        scores_perf = target_db[:, 2]
        
        # performance levels as target
        levels_perf = target_db[:,3]
        
        # convert 'A'... to 0... 
        levels_perf = [ord(x) - ord('A') for x in levels_perf]
        
        # feature selection disabled; use the full feature set
        # instance_data = db[select_perf]
        instance_data = db[feature]
        
        # regression process
        for i in range(6):
            for para in mlp_para:
                mlp_result, y_test = regression(instance_data, scores_perf, para)
                _mse, _R2 = regression_valuate(mlp_result, y_test)
                df_perf_regr.loc[df_perf_regr.shape[0]] = ["mlp", para, "mse", _mse]
                df_perf_regr.loc[df_perf_regr.shape[0]] = ["mlp", para, "R2", _R2]
        
        # classification process
        count = Counter(levels_perf)
        balance = np.array([count[i] for i in count])/len(levels_perf) > 0.1
        _select = len(set(balance)) > 1
        if len(set(levels_perf)) > 1 and _select:
            for i in range(6):
                for para in rf_para:
                    rf_result, y_test = classification_rf(instance_data, levels_perf, para)
                    p, r, f = precision_recall_fscore_support(y_test, rf_result, average = 'macro')[:3]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["rf", para, "precision", p]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["rf", para, "recall", r]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["rf", para, "fscore", f]
                for para in mlp_para:
                    mlp_result, y_test = classification_mlpc(instance_data, levels_perf, para)
                    p, r, f = precision_recall_fscore_support(y_test, mlp_result, average = 'macro')[:3]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["mlpc", para, "precision", p]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["mlpc", para, "recall", r]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["mlpc", para, "fscore", f]
    return df_load_cla, df_load_regr, df_perf_cla, df_perf_regr
Example 21

# In[6]:

datacopy = data.copy()
data = apply_thresholding(data, thres=THRESH_BINARY_AND_THRESH_OTSU)
data.head()

# In[7]:

training_features = data.copy()

# In[8]:

from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median")
imputer.fit(training_features)

# In[9]:

from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scalar.fit(training_features)

# In[10]:

from sklearn.decomposition import PCA

# In[11]:

training_features = imputer.transform(training_features)
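# PCA is imported above but not applied in this excerpt; the presumable next
# steps (an assumption) are to scale the imputed features and then reduce them:
scalar.fit(training_features)  # refit on the imputed values
training_features = scalar.transform(training_features)
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(training_features)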
Example 22
import csv
import numpy
from sklearn.preprocessing import Imputer

filename = 'parkinson.csv'
raw_data = open(filename, 'rt')
reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
td = list(reader)
data = numpy.array(td).astype('str')



x = data[:, 1:23]  # select columns 1 through end


x = numpy.array(x).astype('float')

w = data[:, 23]   # select column 23, the target (status) column
print(w)
imp = Imputer(missing_values="NaN", strategy='median', axis=0)
x = imp.fit_transform(x)
w = numpy.array(w).astype('float')




for i in range(0, len(x)):

    if (w[i] == 0):
        w[i]=1

    else:
        w[i] = 0

print("total no.of rows : {0}".format(len(data)))
print("total no. of missing rows of Pregnancies : {0}".format(len(data.loc[data['Pregnancies']==0])))
print("total no. of missing rows of Glucose : {0}".format(len(data.loc[data['Glucose']==0])))
print("total no. of missing rows of BloodPressure : {0}".format(len(data.loc[data['BloodPressure']==0])))
print("total no. of missing rows of SkinThickness : {0}".format(len(data.loc[data['SkinThickness']==0])))
print("total no. of missing rows of Insulin: {0}".format(len(data.loc[data['Insulin']==0])))
print("total no. of missing rows of BMI: {0}".format(len(data.loc[data['BMI']==0])))
print("total no. of missing rows of DiabetesPedigreeFunction: {0}".format(len(data.loc[data['DiabetesPedigreeFunction']==0])))
print("total no. of missing rows of Age: {0}".format(len(data.loc[data['Age']==0])))


# In[19]:


from sklearn.preprocessing import Imputer
fill_values = Imputer(missing_values=0, strategy='mean', axis=0)
X_train = fill_values.fit_transform(X_train)
X_test = fill_values.transform(X_test)  # reuse the training-set statistics


# In[20]:


from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state = 10)
random_forest_model.fit(X_train, y_train.ravel())


# In[18]:


predict_train_data = random_forest_model.predict(X_test)
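# A minimal way to evaluate the prediction above (assuming y_test comes from the
# notebook's earlier train_test_split):
from sklearn import metrics
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, predict_train_data)))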
Example 24
def build_audit_na(classifier, name, with_proba=True, **kwargs):
    employment_mapping = {
        "CONSULTANT": "PRIVATE",
        "PSFEDERAL": "PUBLIC",
        "PSLOCAL": "PUBLIC",
        "PSSTATE": "PUBLIC",
        "SELFEMP": "PRIVATE",
        "PRIVATE": "PRIVATE"
    }
    gender_mapping = {"FEMALE": 0, "MALE": 1}
    mapper = DataFrameMapper([(["Age"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"),
              name="flag_missing(Age, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Hours"], [
        ContinuousDomain(missing_values=None, with_data=False),
        Alias(ExpressionTransformer(
            "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"),
              name="flag_missing(Hours, -999)"),
        Imputer(missing_values=-999)
    ])] + [(["Income"], [
        ContinuousDomain(missing_values=None,
                         outlier_treatment="as_missing_values",
                         low_value=5000,
                         high_value=200000,
                         with_data=False),
        Imputer()
    ])] + [(["Employment"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(employment_mapping, "OTHER"),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ])] + [([column], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(missing_values=None),
        StringNormalizer(function="lowercase"),
        PMMLLabelBinarizer()
    ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [
        CategoricalDomain(missing_values=None, with_data=False),
        CategoricalImputer(),
        StringNormalizer(function="uppercase"),
        LookupTransformer(gender_mapping, None)
    ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_na_X, audit_na_y)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_na_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    if isinstance(classifier, DecisionTreeClassifier):
        Xt = pipeline_transform(pipeline, audit_na_X)
        adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"])
        adjusted = pandas.concat((adjusted, adjusted_apply), axis=1)
    store_csv(adjusted, name + ".csv")
Example 25
import numpy as np
from sklearn.preprocessing import Imputer


def convertMissingValues(data):
    # replace NaN values with the mean of the values in the column
    imp = Imputer(missing_values=np.nan, strategy='mean')
    imp.fit(data)
    impdata = imp.transform(data)
    return impdata
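# Example usage of the helper above, with illustrative values:
sample = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, np.nan]])
print(convertMissingValues(sample))  # NaNs become the respective column means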
Example 26
def make_dataset(train_df, test_df):
    print("\n****************************************************")
    print("make dataset")
    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))
    
    # One Hot Encoding
    # yes/No -> 1/0
    le = LabelEncoder()
    le_count = 0

    # only label encode those variables with 2 or less categories
    for col in train_df:
        if train_df[col].dtype == "object":
            # if 2 or fewer unique categories
            if len(list(train_df[col].unique())) <= 2:
                print(col)
                # Train on the training data
                le.fit(train_df[col])
                # Transform both training and testing data
                train_df[col] = le.transform(train_df[col])
                test_df[col] = le.transform(test_df[col])

                # keep track of how many columns were label encoded
                le_count += 1

    print("{} columns were labeld encoded.\n".format(le_count))
    
    train_df = pd.get_dummies(train_df)
    test_df = pd.get_dummies(test_df)

    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))
    
    target = train_df["TARGET"]
    train_df, test_df = train_df.align(test_df, join="inner", axis=1)
    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))
    
    if "TARGET" in train_df:
        train_df = train_df.drop("TARGET", axis=1)

    features = list(train_df.columns)

    # Median imputation of missing values
    imputer = Imputer(strategy="median")
    print("DONE: Imputation")
    
    # Scale each feature 0 - 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    print("DONE: Scale")

    # Fit on the training data
    imputer.fit(train_df)
    print("DONE: Fit")

    # Transform both training and test data
    train_df = imputer.transform(train_df.astype(np.float32))
    test_df = imputer.transform(test_df.astype(np.float32))
    print("DONE: Transform\n")
    
    # Repeat with the scaler
    scaler.fit(train_df)  # fit on the float data; casting to int here would skew the statistics
    train_df = scaler.transform(train_df)
    test_df = scaler.transform(test_df)
    print("DONE: Scalar Transform")

    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))
    
    np.save("../../all/train_X", train_df)
    np.save("../../all/train_target", target)
    np.save("../../all/test", test_df)
    print("DONE! save train.npy, target.npy, test.npy") 
    
    # X_train, X_val, y_train, y_val = train_test_split(train_df, target, test_size=0.2, random_state=0)

    # print("X_train shape: {}".format(X_train.shape))
    # print("X_val shape: {}".format(X_val.shape))
    # print("y_train shape: {}".format(y_train.shape))
    # print("y_val shape: {}".format(y_val.shape))
    
    # np.save("../../all/X_train", X_train)
    # np.save("../../all/X_val", X_val)
    # np.save("../../all/y_train", y_train)
    # np.save("../../all/y_val", y_val)
    
    print("\n*********** DONE! ***************")
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Necessary to use SMOTE in the same pipeline as sklearn
from imblearn.pipeline import Pipeline as imb_pipeline

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Added later so results can be replicable
# Set to None and functions that use this will use their default random state
RANDOM_STATE = 1

# Variables
base_pipeline = [('imputer', Imputer(strategy='median')),
                 ('resampling', SMOTE(random_state=RANDOM_STATE)),
                 ('selection', SelectKBest(score_func=f_classif)),
                 ('scaler', StandardScaler()), ('pca', PCA())]

# Stuff we want to test for each model before doing careful tuning
base_param_grid = {
    'scaler': [None, StandardScaler()],
    'selection__k': [7, 10, 15],
    'pca':
    [None, PCA(n_components=2),
     PCA(n_components=4),
     PCA(n_components=6)],
}

financial_features = [
Example 28
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
import keras
from keras.utils import np_utils

# PART 1 - DATA PREPROCESSING

data = pd.read_csv('cancer.data')
# Datasets sometimes contain corrupt or missing values.
# As a first step, missing entries marked '?' are replaced with -99999. (Optional)
data.replace('?', -99999, inplace=True)

# The Imputer class then fills those missing values using the mean (or another
# statistic) of the corresponding column (feature).
imp = Imputer(missing_values=-99999, strategy="mean", axis=0)
data = pd.DataFrame(imp.fit_transform(data))
data = data.drop(0, axis=1)

# Split the loaded data into inputs and outputs.
output_data = np.array(data.iloc[:, 9])
input_data = np.array(data.iloc[:, :9])

# The output is categorical, so convert it to a one-hot numeric representation.
output_data = np_utils.to_categorical(output_data)

# Split the data into 80% training and 20% test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(input_data,
                                                    output_data,
                                                    test_size=0.2,
Example 29

# In[3]:


design_matrix = cross_section.loc[:,['Region','RegionName','AveragePrice','Population','dist_from_lon',
                                     'Unemployment_rate','GDP_per_cap','Inflation_index','Inflation',
                                     'Local_pshs','Outstanding_perc','Good_perc',
                                     'Inad_perc','Non_UK_born','Migrant_inflow_monthly',
                                     'Migrant_outflow_monthly','Net_immigration_monthly',
                                     'LFS_active_perc']]
design_matrix['Real_GVA_per_cap'] = design_matrix['GDP_per_cap'] *100/ design_matrix['Inflation_index']

from sklearn.preprocessing import Imputer

design_matrix[['Outstanding_perc','Good_perc']] =Imputer(strategy = 'median').fit_transform(design_matrix[['Outstanding_perc','Good_perc']])
design_matrix[['Inad_perc']] =Imputer(strategy = 'mean').fit_transform(design_matrix[['Inad_perc']])

design_matrix[['Local_pshs']] = Imputer(strategy = 'median').fit_transform(design_matrix[['Local_pshs']])

cols_to_create = ['Migrant_inflow_per_cap','Migrant_outflow_per_cap','Net_immig_per_cap','Non_UK_per_cap','Local_pshs_per_cap']
cols_to_use = ['Migrant_inflow_monthly','Migrant_outflow_monthly','Net_immigration_monthly','Non_UK_born','Local_pshs']
    
for new, old in zip(cols_to_create, cols_to_use):
    design_matrix[new] = design_matrix[old]/design_matrix['Population']



region_dummies = pd.get_dummies(cross_section.loc[:,['Region']],drop_first = True)
dummy_names = region_dummies.columns.values
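# A plausible next step (an assumption) is to attach the region dummies to the
# design matrix and drop the raw categorical columns before modelling:
design_matrix = pd.concat(
    [design_matrix.drop(columns=['Region', 'RegionName']), region_dummies], axis=1)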
Example 30
def Submit():
    p = preg.get()
    g = gl.get()
    bp = BP.get()
    st = ST.get()
    i = insulin.get()
    b = bmi.get()
    d = dpf.get()
    a = age.get()
    if (p == " " or g == " " or bp == " " or st == " " or i == " " or b == " "
            or d == " " or a == " "):
        messageBox.showinfo("ERROR", "Please fill all the entries")
    else:

        l1 = [[p, g, bp, st, i, b, d, a]]
        print(l1)
        # Logistic Regression
        # Importing the libraries
        import numpy as np
        #import matplotlib.pyplot as plt
        import pandas as pd

        # Importing the dataset
        dataset = pd.read_csv('diabetes.csv')
        X = np.array(dataset.drop('Outcome', axis=1))
        y = np.array(dataset['Outcome'])
        l1 = np.array(l1)

        # Splitting the dataset into the Training set and Test set
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)

        # Data Preprocessing
        # Taking care of missing data
        from sklearn.preprocessing import Imputer
        imputer = Imputer(missing_values=0, strategy='mean', axis=0)
        # fit on columns 1-8 (the upper bound of the slice is excluded)
        imputer = imputer.fit(X[:, 1:9])
        X[:, 1:9] = imputer.transform(X[:, 1:9])

        # Feature Scaling
        from sklearn.preprocessing import StandardScaler
        sc_X = StandardScaler()
        X_train = sc_X.fit_transform(X_train)
        X_test = sc_X.transform(X_test)
        l1_test = sc_X.transform(l1)

        # Fitting Logistic Regression to the Training set
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(random_state=0)
        classifier.fit(X_train, y_train)

        # Predicting the Test set results
        y_pred = classifier.predict(X_test)

        # Making the Confusion Matrix
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_test, y_pred)

        # cm is a 2x2 array counting correct and incorrect predictions
        # example of making a single class prediction

        from sklearn.datasets.samples_generator import make_blobs
        # generate 2d classification dataset
        X, y = make_blobs(n_samples=100,
                          centers=2,
                          n_features=8,
                          random_state=1)
        # fit final model
        model = LogisticRegression()
        model.fit(X, y)
        # define one new instance
        Xnew = l1_test
        # make a prediction
        ynew = model.predict(Xnew)
        print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))
        out = ynew[0]
        if out == 0:
            messagebox.showinfo("Result", "You don't have diabetes")
        elif out == 1:
            messagebox.showinfo("Result", "You have diabetes")