l1=LabelEncoder() data1['sex']=l1.fit_transform(data['Sex']) l2=LabelEncoder() data1['embarked']=l2.fit_transform(data['Embarked'].astype(str)) data1 = data1.drop(['Sex','Embarked'],axis=1) cater_cols=['Pclass','SibSp','Parch','sex','embarked'] contin_cols=['Age','Fare'] total_df=cater_cols+contin_cols conti=pd.DataFrame(data1,columns=contin_cols) cater=pd.DataFrame(data1,columns=cater_cols) impca=Imputer(missing_values='NaN',strategy='most_frequent') impca_out=impca.fit_transform(cater) impca_df=pd.DataFrame(impca_out,columns=cater_cols) impco=Imputer(missing_values='NaN',strategy='mean') impo_out = impco.fit_transform(conti) impco_df=pd.DataFrame(impo_out,columns=contin_cols) total_df_data=pd.concat([impca_df,impco_df],axis=1) sl1=StandardScaler() sl2=sl1.fit_transform(total_df_data) sl3=pd.DataFrame(sl2,columns=total_df)
def main(): #************************************************************************************* #1.load data (training and test) and preprocessing data(replace NA,98,96,0(age) with NaN) #read data using pandas #replace 98, 96 with NAN for NOTime30-59,90,60-90 #replace 0 with NAN for age #************************************************************************************* colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', \ 'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', \ 'NORealEstate', 'NOTime60-89', 'NODependents'] col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', \ [98, 96], 'NA', [98, 96], 'NA'] col_na_values = creatDictKV(colnames, col_nas) dftrain = pd.read_csv("cs-training.csv", names=colnames, \ na_values=col_na_values, skiprows=[0]) train_id = [int(x) for x in dftrain.pop("ID")] y_train = np.asarray([int(x) for x in dftrain.pop("label")]) x_train = dftrain.as_matrix() dftest = pd.read_csv("cs-test.csv", names=colnames, \ na_values=col_na_values, skiprows=[0]) test_id = [int(x) for x in dftest.pop("ID")] y_test = np.asarray(dftest.pop("label")) x_test = dftest.as_matrix() #************************************************************************************* #2.split training data into training_new and test_new (for validation model) # to keep the class ratio using StratifiedShuffleSplit to do the split #************************************************************************************* sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0) for train_index, test_index in sss.split(x_train, y_train): print("TRAIN:", train_index, "TEST:", test_index) x_train_new, x_test_new = x_train[train_index], x_train[test_index] y_train_new, y_test_new = y_train[train_index], y_train[test_index] y_train = y_train_new x_train = x_train_new #***************************************************************************************** #3.impute the data with imputer: replace MVs with Mean #***************************************************************************************** imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit(x_train) x_train = imp.transform(x_train) x_test_new = imp.transform(x_test_new) x_test = imp.transform(x_test) #***************************************************************************************** #4.Build RF model using the training_new data: # a. handle imbalanced data distribution by # setting class_weight="balanced"/"balanced_subsample" # n_samples / (n_classes * np.bincount(y)) #***************************************************************************************** # Initialize the model: #***************************************************************************************** rf = RandomForestClassifier(n_estimators=100, \ oob_score=True, \ min_samples_split=2, \ min_samples_leaf=50, \ n_jobs=-1, \ #class_weight="balanced",\ class_weight="balanced_subsample", \ bootstrap=True\ ) #************************************************************************************* # b. perform parameter tuning using grid search with CrossValidation #************************************************************************************* #param_grid={"max_features": [2,3,4,5],\ # "min_samples_leaf": [30,40,50,100],\ # "criterion": ["gini", "entropy"]} param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]} grid_search = GridSearchCV(rf, cv=10, scoring='roc_auc', param_grid=param_grid, iid=False) #************************************************************************************* # c. 
    # output the best model and make predictions for test data
    # - Use the best parameters to build the model with the training_new data
    #*************************************************************************************
    grid_search.fit(x_train, y_train)
    print("the best parameter:", grid_search.best_params_)
    print("the best score:", grid_search.best_score_)
    #print("the parameters used:", grid_search.get_params)
    #*************************************************************************************
    # To see how well the model fits the training_new data
    # - Use the trained model to make predictions for the train_new data
    #*************************************************************************************
    predicted_probs_train = grid_search.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)
    #*************************************************************************************
    # To see how well the model performs on the test_new data
    # - Use the trained model to make predictions for the validation data (test_new)
    #*************************************************************************************
    predicted_probs_test_new = grid_search.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)
    #*************************************************************************************
    # use the model to predict for the test set and write the submission file
    #*************************************************************************************
    predicted_probs_test = grid_search.predict_proba(x_test)
    predicted_probs_test = ["%.9f" % x[1] for x in predicted_probs_test]
    submission = pd.DataFrame({
        'ID': test_id,
        'Probabilities': predicted_probs_test
    })
    submission.to_csv("rf_benchmark.csv", index=False)
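# The main() above calls two helpers, creatDictKV and computeAUC, that are not shown
# in this snippet. A minimal sketch of what they might look like, assuming creatDictKV
# simply pairs column names with their sentinel NA values for pd.read_csv(na_values=...)
# and computeAUC wraps sklearn's roc_auc_score (both are assumptions, not the original code):
from sklearn.metrics import roc_auc_score

def creatDictKV(keys, values):
    # map each column name to the value(s) that read_csv should treat as missing
    return dict(zip(keys, values))

def computeAUC(y_true, y_scores):
    # report the ROC AUC for a vector of predicted probabilities
    auc = roc_auc_score(y_true, y_scores)
    print("AUC: %.6f" % auc)
    return auc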
class insan:
    boy = 180

    def kosmak(self, b):
        return b + 10


ali = insan()
print(ali.boy)
print(ali.kosmak(90))

# missing values
# scikit-learn
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
Yas = veriler.iloc[:, 1:4].values
print(Yas)
imputer = imputer.fit(Yas[:, 1:4])
Yas[:, 1:4] = imputer.transform(Yas[:, 1:4])
print(Yas)

ulke = veriler.iloc[:, 0:1].values
print(ulke)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ulke[:, 0] = le.fit_transform(ulke[:, 0])
print(ulke)

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features='all')
bestAUUC = 1000 bestPredictors = [] for j in range(len(predictorBank)): predictors = predictorBase + [predictorBank[j]] iterations = 50 auucs = np.zeros(iterations) auucs2 = np.zeros(iterations) for i in range(iterations): df = df.sample(frac=1.) #df = df.sort_values(by = 'timesec') dfTrain = df[trainIndex] dfTest = df[testIndex] dfVal = df[valIndex] imputer = Imputer() scaler = StandardScaler() xTrain = imputer.fit_transform(dfTrain[predictorBase + predictorsExtra].values) xVal = imputer.transform(dfVal[predictorBase + predictorsExtra].values) xTest = imputer.transform(dfTest[predictorBase + predictorsExtra].values) xTrain = pd.DataFrame(scaler.fit_transform(xTrain), columns=predictorBase + predictorsExtra) xVal = pd.DataFrame(scaler.transform(xVal), columns=predictorBase + predictorsExtra) xTest = pd.DataFrame(scaler.transform(xTest), columns=predictorBase + predictorsExtra) # xTrainPoly = pieceFeature(dfTrain['surgical'].values, pieceFeature(dfTrain['icuatalert'].values, xTrain)) # xTestPoly = pieceFeature(dfTest['surgical'].values, pieceFeature(dfTest['icuatalert'].values, xTest))
#plt.tight_layout(h_pad = 2.5) #plt.show() # Make a new dataframe for polynomial features poly_features = app_train[[ 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET' ]] poly_features_test = app_test[[ 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH' ]] # imputer is for handling missing values from sklearn.preprocessing import Imputer imputer = Imputer(strategy='median') poly_target = poly_features['TARGET'] poly_features = poly_features.drop(columns=['TARGET']) # Need to impute missing values poly_features = imputer.fit_transform(poly_features) poly_features_test = imputer.transform(poly_features_test) from sklearn.preprocessing import PolynomialFeatures # Create the polynomial object with specified degree poly_transformer = PolynomialFeatures(degree=3) # Train the polynomial features poly_transformer.fit(poly_features)
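# The snippet above stops right after fitting the transformer. A likely next step,
# sketched here as an assumption (not part of the original), is to transform both
# arrays and inspect the generated feature names via the pre-1.0 get_feature_names API:
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial features shape:', poly_features.shape)
print(poly_transformer.get_feature_names(
    input_features=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])[:15])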
import pandas as pd from sklearn.model_selection import train_test_split dataset = pd.read_csv("C:\\Users\\B!ade\\Downloads\\expdata.csv") dataset_corr = dataset.corr() dataset_final = dataset[[ "Overall", "International Reputation", "Reactions", "Value", "Wage" ]] dataset_final.info() dataset_final = dataset_final.iloc[:, :].values from sklearn.preprocessing import Imputer imputer = Imputer(missing_values="NaN", axis=0) imputer = imputer.fit(dataset_final[:, :]) dataset_final[:, :] = imputer.transform(dataset_final[:, :]) X = dataset_final[:, 0:4] y = dataset_final[:, 4] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) from sklearn.preprocessing import StandardScaler sc_X = StandardScaler() X_train = sc_X.fit_transform(X_train) X_test = sc_X.transform(X_test)
# ## Another Example # # Here are partial plots from a very simple model on the Titanic data. # In[ ]: titanic_data = pd.read_csv('../input/titanic/train.csv') titanic_y = titanic_data.Survived clf = GradientBoostingClassifier() titanic_X_colns = [ 'PassengerId', 'Age', 'Fare', ] titanic_X = titanic_data[titanic_X_colns] my_imputer = Imputer() imputed_titanic_X = my_imputer.fit_transform(titanic_X) clf.fit(imputed_titanic_X, titanic_y) titanic_plots = plot_partial_dependence(clf, features=[1, 2], X=imputed_titanic_X, feature_names=titanic_X_colns, grid_resolution=8) # These might seem surprising at first glance. But they show some interesting insights: # * Being young increased your odds of survival. This is consistent with historical recountings that they got women and children off the Titanic first. # * People who paid more had better odds of survival. It turns out that higher fares got you a cabin that was closer to the top of the boat, and may have given you better odds of getting a life-boat. # # # Conclusion # Partial dependence plots are a great way (though not the only way) to extract insights from complex models. These can be incredibly powerful for communicating those insights to colleagues or non-technical users.
''' # Import FeatureUnion from sklearn.pipeline import FeatureUnion # Split using ALL data in sample_df X_train, X_test, y_train, y_test = train_test_split( sample_df[['numeric', 'with_missing', 'text']], pd.get_dummies(sample_df['label']), random_state=22) # Create a FeatureUnion with nested pipeline: process_and_join_features process_and_join_features = FeatureUnion( transformer_list=[('numeric_features', Pipeline([('selector', get_numeric_data), ('imputer', Imputer())])), ('text_features', Pipeline([( 'selector', get_text_data), ('vectorizer', CountVectorizer())]))]) # Instantiate nested pipeline: pl pl = Pipeline([('union', process_and_join_features), ('clf', OneVsRestClassifier(LogisticRegression()))]) # Fit pl to the training data pl.fit(X_train, y_train) # Compute and print accuracy accuracy = pl.score(X_test, y_test)
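# The FeatureUnion above relies on two selectors, get_numeric_data and get_text_data,
# that are defined elsewhere. A plausible definition (an assumption, following the usual
# FunctionTransformer column-selector pattern over sample_df) would be:
from sklearn.preprocessing import FunctionTransformer

get_numeric_data = FunctionTransformer(
    lambda x: x[['numeric', 'with_missing']], validate=False)
get_text_data = FunctionTransformer(
    lambda x: x['text'], validate=False)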
from sklearn.svm import SVC from sklearn import svm from sklearn.model_selection import KFold from sklearn.preprocessing import MinMaxScaler from sklearn.svm import SVR dataset = pd.read_csv('E:/LINEARREGRESSION/Vijay/Titanic Dataset/INPUT/train.csv') test_data = pd.read_csv('E:/LINEARREGRESSION/Vijay/Titanic Dataset/INPUT/test.csv') y_train = dataset.iloc[:, 1].values X_train = dataset.iloc[:, [2, 4, 5, 6]].values X_test = test_data.iloc[:, [1, 3, 4, 5]].values imp_mean = Imputer() imp_mean = imp_mean.fit(X_train[:, 2:4]) X_train[:, 2:4] = imp_mean.transform(X_train[:, 2:4]) imp_mean = imp_mean.fit(X_test[:, 2:4]) X_test[:, 2:4] = imp_mean.transform(X_test[:, 2:4]) labelencoder_x = LabelEncoder() X_train[:, 1] = labelencoder_x.fit_transform(X_train[:, 1].astype(str)) X_test[:, 1] = labelencoder_x.fit_transform(X_test[:, 1].astype(str)) # #Grid Search # Random Forest Classifier
import pandas as pd # Making the splits training = pd.read_csv('train.csv') X_train = training.iloc[:, [2, 4, 5, 6, 7, 9]].values y_train = training.iloc[:, 1].values testing = pd.read_csv('test.csv') X_test = testing.iloc[:, [1, 3, 4, 5, 6, 8]].values # Reshaping to a matrix X_train = X_train.reshape(-1, 6) X_test = X_test.reshape(-1, 6) # Filling in missing data from sklearn.preprocessing import Imputer train_imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) train_imputer = train_imputer.fit(X_train[:, 2:3]) X_train[:, 2:3] = train_imputer.transform(X_train[:, 2:3]) test_imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) test_imputer = test_imputer.fit(X_test[:, 2:6]) X_test[:, 2:6] = test_imputer.transform(X_test[:, 2:6]) # Encoding categorical data from sklearn.preprocessing import LabelEncoder labelencoder_X = LabelEncoder() X_train[:, 1] = labelencoder_X.fit_transform(X_train[:, 1]) X_test[:, 1] = labelencoder_X.transform(X_test[:, 1]) # Feature scaling from sklearn.preprocessing import StandardScaler
def __init__(self): self.data = None self.X_train = None self.Y_train = None self.X_test = None self.Y_test = None self.clf = None category_binarizer = OnceFittedLabelBinarizer() country_binarizer = OnceFittedLabelBinarizer() state_binarizer = OnceFittedLabelBinarizer() self.category_mapper = DataFrameMapper([ (['category_code'], [CategoricalImputer(), category_binarizer]), (['country_code'], [CategoricalImputer(), country_binarizer]), (['state_code'], [CategoricalImputer(), state_binarizer]), ]) self.mapper = DataFrameMapper([ (['category_code'], [CategoricalImputer(), category_binarizer], { 'alias': 'category' }), (['country_code'], [CategoricalImputer(), country_binarizer], { 'alias': 'country' }), (['state_code'], [CategoricalImputer(), state_binarizer], { 'alias': 'state' }), (['mba_degree'], [ValueImputer(0), StandardScaler()]), (['phd_degree'], [ValueImputer(0), StandardScaler()]), (['ms_degree'], [ValueImputer(0), StandardScaler()]), (['other_degree'], [ValueImputer(0)]), (['age'], [Imputer(), StandardScaler()]), (['offices'], [ValueImputer(1.0), StandardScaler()]), (['products_number'], [ValueImputer(1.0), StandardScaler()]), (['average_funded', 'average_participants'], [ParticipantsImputer(), StandardScaler()], { 'alias': 'average_participants' }), (['total_rounds'], None), (['ipo'], None), (['is_closed'], None), (['total_rounds', 'average_funded'], [FundImputer(), StandardScaler()], { 'alias': 'average_funded' }), (['acquired_companies'], [ValueImputer(0)]), ]) SVC_C_grid = [10**i for i in range(-3, 4)] SVC_gamma_grid = [10**i for i in range(-3, 1)] + ['auto'] MLP_hidden_layer_sizes = [[25], [50], [75], [100], [50, 25], [75, 50], [100, 75], [75, 50, 25], [100, 75, 50]] MLP_activation = ['logistic', 'tanh', 'relu'] self.grid = [{ 'clf': [GradientBoostingClassifier()], 'clf__n_estimators': [20 * i for i in range(5, 8)], 'clf__max_depth': [i + 3 for i in range(2, 6)] }, { 'clf': [SVC(kernel='rbf', class_weight='balanced')], 'clf__C': SVC_C_grid, 'clf__gamma': SVC_gamma_grid }, { 'clf': [SVC(kernel='poly', class_weight='balanced')], 'clf__C': SVC_C_grid, 'clf__gamma': SVC_gamma_grid, 'clf__degree': list(range(3, 6)) }, { 'clf': [MLPClassifier()], 'clf__hidden_layer_sizes': MLP_hidden_layer_sizes, 'clf__activation': MLP_activation, 'clf__alpha': [10**i for i in range(-1, 3)] }]
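# Several transformers used in the mapper above (OnceFittedLabelBinarizer, ValueImputer,
# ParticipantsImputer, FundImputer) are project-specific and not shown here. Purely as an
# illustration of the idea, a constant-fill ValueImputer could be as small as this sketch:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ValueImputer(BaseEstimator, TransformerMixin):
    """Hypothetical sketch: replace NaNs with a fixed constant."""

    def __init__(self, value):
        self.value = value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float).copy()
        X[np.isnan(X)] = self.value
        return X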
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# reading the dataset and treating it as a mapping f(X) = y
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, 0:3].values
y = dataset.iloc[:, -1].values
dataset.describe()  # gives a basic summary of the dataset

# removing null/NaN values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encoding the categorical values
from sklearn.preprocessing import LabelEncoder
# for country names in X
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# for yes/no in y
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# one-hot encoding the country column (beware the "dummy variable trap")
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X)
X = X.toarray()
return self def transform(self, X, y=None): X = X.copy() for col in X.columns: X.loc[:, col] = X.loc[:, col].astype('category') return X num_pipeline = Pipeline([ ('WordToNum', ConvertWordToNum()), ('DtypeCV', DtypeConverter()), ('selector', DataFrameSelector(num_features)), ('Imputer', Imputer(strategy="median")), ('StdScaler', StandardScaler()), ]) def full_pipeline_encoder(X, X_train, X_test, y_train, y_test): X_train_num = pd.DataFrame(num_pipeline.fit_transform(X_train), index=X_train.index, columns=X_train[num_features].columns) X_test_num = pd.DataFrame(num_pipeline.transform(X_test), index=X_test.index, columns=X_test[num_features].columns) X_OHE = pd.get_dummies(X[cat_features]) X_train_ohe = X_OHE.loc[X_train.index] X_test_ohe = X_OHE.loc[X_test.index] X_train = pd.concat([X_train_num, X_train_ohe], axis=1)
print(" Positive cases " + str((num_true / len(df['diabetes'])) * 100))
print(" Negative cases " + str((num_false / len(df['diabetes'])) * 100))

X = df.loc[:, df.columns != 'diabetes'].values
y = df['diabetes'].values
split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=split_test_size,
                                                    random_state=42)
print("{0:0.2f}% in training set ".format((len(X_train) / len(df.index)) * 100))

# impute zero entries with the column mean learned on the training split only
fill_imputed = Imputer(missing_values=0, strategy="mean", axis=0)
X_train = fill_imputed.fit_transform(X_train)
X_test = fill_imputed.transform(X_test)

nb_model = GaussianNB()
nb_model.fit(X_train, y_train.ravel())
# print(X_test)
nb_predict_train = nb_model.predict(X_train)
# print(X_test)
# tryli = []
# for tet in X_test:
#     tryli.append(tet[0])
# print("Score:", nb_model.score(X_test, y_test))
# plt.scatter(tryli, y_test, c=nb_predict_train)
# plt.scatter()
# -*-coding:utf8-*- import numpy as np import pandas as pd from sklearn.preprocessing import Imputer from sklearn.preprocessing import LabelEncoder, OneHotEncoder imputer = Imputer(missing_values = "NaN", strategy = "mean", axis = 0) dataset = pd.read_csv('dataset/data.csv') x = dataset.iloc[:,:-1].values y = dataset.iloc[:, 3].values imputer = imputer.fit(x[:, 1:3]) x[:, 1:3] = imputer.transform(x[:, 1:3]) labelencoder_x = LabelEncoder() x[:, 0] = labelencoder_x.fit_transform(x[:, 0]) onehotencoder = OneHotEncoder(categorical_features=[0]) x = onehotencoder.fit_transform(x).toarray() print(x)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--C', type=float, default=1.0, help='inverse of L1 / L2 regularization') parser.add_argument('--l1', dest='l2', action='store_false') parser.add_argument('--l2', dest='l2', action='store_true') parser.set_defaults(l2=True) parser.add_argument('--period', type=str, default='all', help='specifies which period extract features from', choices=[ 'first4days', 'first8days', 'last12hours', 'first25percent', 'first50percent', 'all' ]) parser.add_argument('--features', type=str, default='all', help='specifies what features to extract', choices=['all', 'len', 'all_but_len']) parser.add_argument('--data', type=str, help='Path to the data of in-hospital mortality task', default=os.path.join( os.path.dirname(__file__), '../../../data/in-hospital-mortality/')) parser.add_argument( '--output_dir', type=str, help='Directory relative which all output files are stored', default='.') args = parser.parse_args() print(args) print("Path") print(os.path) train_reader = InHospitalMortalityReader( dataset_dir= "/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train", listfile= '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train/train_listfile.csv', period_length=48.0) val_reader = InHospitalMortalityReader( dataset_dir= '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train', listfile= '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/train/train_listfile.csv', period_length=48.0) test_reader = InHospitalMortalityReader( dataset_dir= '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/test', listfile= '/home/sunitha/Documents/7thSem/DA/mimic-iii-clinical-database-demo-1.4/src/data/in-hospital-mortality/test/test_listfile.csv', period_length=48.0) print('Reading data and extracting features ...') (train_X, train_y, train_names) = read_and_extract_features(train_reader, args.period, args.features) (val_X, val_y, val_names) = read_and_extract_features(val_reader, args.period, args.features) (test_X, test_y, test_names) = read_and_extract_features(test_reader, args.period, args.features) print(' train data shape = {}'.format(train_X.shape)) print(' validation data shape = {}'.format(val_X.shape)) print(' test data shape = {}'.format(test_X.shape)) print("---------------") print(train_X[0]) print("---------------") print(train_names[0]) print('Imputing missing values ...') imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0, verbose=0, copy=True) imputer.fit(train_X) train_X = np.array(imputer.transform(train_X), dtype=np.float32) val_X = np.array(imputer.transform(val_X), dtype=np.float32) test_X = np.array(imputer.transform(test_X), dtype=np.float32) print('Normalizing the data to have zero mean and unit variance ...') scaler = StandardScaler() scaler.fit(train_X) train_X = scaler.transform(train_X) val_X = scaler.transform(val_X) test_X = scaler.transform(test_X) penalty = ('l2' if args.l2 else 'l1') file_name = '{}.{}.{}.C{}'.format(args.period, args.features, penalty, args.C) decision_tree = DecisionTreeClassifier(random_state=42, max_depth=5) decision_tree.fit(train_X, train_y) result_dir = os.path.join(args.output_dir, 'results') common_utils.create_directory(result_dir) with open(os.path.join(result_dir, 'train_{}.json'.format(file_name)), 'w') as res_file: ret = 
print_metrics_binary(train_y, decision_tree.predict_proba(train_X)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) with open(os.path.join(result_dir, 'val_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(val_y, decision_tree.predict_proba(val_X)) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) prediction = decision_tree.predict_proba(test_X)[:, 1] with open(os.path.join(result_dir, 'test_{}.json'.format(file_name)), 'w') as res_file: ret = print_metrics_binary(test_y, prediction) ret = {k: float(v) for k, v in ret.items()} json.dump(ret, res_file) save_results( test_names, prediction, test_y, os.path.join(args.output_dir, 'predictions', file_name + '.csv'))
if value == 'N': return 0 if value == 'Y': return 1 if value == 'X': return 2 X1['md_trial'] = X1['md_trial'].apply(xyn_to_number) X2['md_trial'] = X2['md_trial'].apply(xyn_to_number) logging.debug("Normalized X1, X2") X_all, y_all = pd.concat([X1, X2]), pd.concat([y1, y2]) logging.debug("Imputing & scaling X1, X2") imputer = Imputer(missing_values='NaN') X_all = imputer.fit_transform(X_all) min_max_scaler = MinMaxScaler() X_all = min_max_scaler.fit_transform(X_all) logging.debug("Imputed & scaled X1, X2") X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.30) # Parameters learning_rate = 0.000001 training_epochs = 2000 batch_size = 256 test_step = 10
def __init__(self): self.reg = make_pipeline(Imputer(strategy='median'), ExtraTreesRegressor(n_estimators=10))
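# For context, a pipeline built this way can be fit directly on data that still contains
# NaNs, because the Imputer step fills them before the trees see the data. A minimal usage
# sketch with made-up data (the surrounding class and its fit/predict wrappers are assumed):
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesRegressor

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan], [4.0, 5.0]])
y = np.array([1.0, 2.0, 3.0, 2.5])

reg = make_pipeline(Imputer(strategy='median'), ExtraTreesRegressor(n_estimators=10))
reg.fit(X, y)            # NaNs are median-imputed inside the pipeline
print(reg.predict(X[:2]))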
def model_training(data, feat_key, le, remove_nan, perc_train_size, output_file,
                   model_file, sov_encoder_file, n_estimators=500, min_samples_leaf=1):
    #import seaborn as sns
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils import check_random_state
    from sklearn.externals import joblib
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import tree

    data_index = data.index  # data_index is kept so the output can be published later.
    y_ = np.array(data.pop('IssuerRating'))
    X_ = np.array(data[feat_key["Key"]])

    # Remove observations with no output
    ind_valid_out = [is_string(yi) for yi in y_]
    X = X_[ind_valid_out]
    y = y_[ind_valid_out]
    data_index = data_index[ind_valid_out]

    # Encode y values
    y = np.array(
        [list(le.loc[yi])[0] if is_string(yi) else float('NaN') for yi in y])

    # Encode Sovereign rating
    sr = feat_key[feat_key["Key"] == 'SovereignRating']
    if len(sr) > 0:
        pos_sr = feat_key.index.get_loc(sr.index[0])  # Position of the sovereign rating
        pos_str = [is_string(x) for x in X[:, pos_sr]]
        labels = np.unique(X[pos_str, pos_sr])
        le_X = LabelEncoder()
        le_X.fit(labels)
        X[pos_str, pos_sr] = le_X.transform(X[pos_str, pos_sr])
        joblib.dump(le_X, sov_encoder_file)  # Save sovereign label encoder

    # Remove NaN
    if remove_nan:
        ind_not_na = [not np.isnan(np.sum(x)) for x in X]
        X = X[ind_not_na]
        y = y[ind_not_na]
        data_index = data_index[ind_not_na]
    else:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(X)
        X = imp.transform(X)

    # Data permutation:
    random_state = check_random_state(0)
    permutation = random_state.permutation(X.shape[0])
    X = X[permutation]
    y = y[permutation]
    data_index = data_index[permutation]

    # Train and test samples:
    train_size = int(X.shape[0] * perc_train_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=train_size,
                                                        shuffle=False)
    print('Training sample: %d' % X_train.shape[0])
    print('Test sample: %d' % X_test.shape[0])
    print('')

    # Model fitting:
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_features="auto",
                                 min_samples_leaf=min_samples_leaf)
    clf.fit(X_train, y_train)

    # Save model
    joblib.dump(clf, model_file)

    score = clf.score(X_test, y_test)
    print('Score on the test sample:')
    print(score)
    print('')

    # output file:
    pred_calif = np.array([
        le.iloc[x == list(le.iloc[:, 0]), 0].index[0]
        for x in clf.predict(X_test)
    ])
    y_test_calif = np.array(
        [le.iloc[x == list(le.iloc[:, 0]), 0].index[0] for x in y_test])
    if len(sr) > 0:
        X_test[:, pos_sr] = le_X.inverse_transform(
            X_test[:, pos_sr].astype('int'))  # Inverse transform of sov. ratings
    data_test = pd.DataFrame(
        np.column_stack((np.column_stack((X_test, y_test_calif)), pred_calif)),
        columns=list(feat_key.index) + ['Rating Test', 'Rating Predicc'],
        index=data_index[np.arange(train_size, data_index.shape[0])])

    # Output file:
    data_test.to_csv(output_file)

    # Variable importances:
    importances = clf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print('')
    print("Ranking:")
    for f in range(X_train.shape[1]):
        print("%d. 
%s (%f)" % (f + 1, feat_key.index[indices[f]], importances[indices[f]]))

    # Plot importances:
    print('')
    plt.figure()
    plt.title("Importances")
    plt.bar(range(X.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), np.arange(X.shape[1]) + 1)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()
    return None
def run_method(): global db sets = file_all # file_sets mlp_para = list(range(2,21)) rf_para = list(range(2,21)) columns_cla = ["method", "parameter(# of trees or # of layers)", "evaluation", "value"] columns_regr = ["method", "parameter(# of layers)", "evaluation", "value"] df_load_cla = pd.DataFrame(columns=columns_cla) df_load_regr = pd.DataFrame(columns=columns_regr) df_perf_cla = pd.DataFrame(columns=columns_cla) df_perf_regr = pd.DataFrame(columns=columns_regr) for turn,file in enumerate(sets): db = pd.read_csv('../csv/' + file + '.csv') print('open file ','../csv/' + file + '.csv') # Preprocessingm # Imputation of missing values db_values = db.values instance_db = db_values[:, 4:-4] for i in range(instance_db.shape[0]): for j in range(instance_db.shape[1]): # process the null if instance_db[i][j] == 'null': # assign NaN to null for future process instance_db[i][j] = 'NaN' # tackle column whose members are all NaN for j in range(instance_db.shape[1]): if list(instance_db[:,j]) == list(['NaN']*instance_db.shape[0]): instance_db[:,j] = [0]*instance_db.shape[0] imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=1) imp.fit(instance_db) instance_db = imp.transform(instance_db) # Normalization scaler = StandardScaler() scaler.fit(instance_db) instance_db = scaler.transform(instance_db) db_values[:, 4:-4] = instance_db db = pd.DataFrame(db_values,columns = db.columns) # target target_db = db_values[:, -4:] # load scores as target scores_load = target_db[:, 0] # load levels as target levels_load = target_db[:, 1] # convert 'A'... to 0... levels_load = [ord(x) - ord('A') for x in levels_load] # invalidate feature selection # instance_data = db[select_load] instance_data = db[feature] # regression process for i in range(6): for para in mlp_para: mlp_result, y_test = regression(instance_data, scores_load, para) _mse, _R2 = regression_valuate(mlp_result, y_test) df_load_regr.loc[df_load_regr.shape[0]] = ["mlp", para, "mse", _mse] df_load_regr.loc[df_load_regr.shape[0]] = ["mlp", para, "R2", _R2] # classification process count = Counter(levels_load) balance = np.array([count[i] for i in count])/len(levels_load) > 0.1 _select = len(set(balance)) > 1 if len(set(levels_load)) > 1 and _select: for i in range(6): for para in rf_para: rf_result, y_test = classification_rf(instance_data, levels_load, para) p, r, f = precision_recall_fscore_support(y_test, rf_result, average = 'macro')[:3] df_load_cla.loc[df_load_cla.shape[0]] = ["rf", para, "precision", p] df_load_cla.loc[df_load_cla.shape[0]] = ["rf", para, "recall", r] df_load_cla.loc[df_load_cla.shape[0]] = ["rf", para, "fscore", f] for para in mlp_para: mlp_result, y_test = classification_mlpc(instance_data, levels_load, para) p, r, f = precision_recall_fscore_support(y_test, mlp_result, average = 'macro')[:3] df_load_cla.loc[df_load_cla.shape[0]] = ["mlpc", para, "precision", p] df_load_cla.loc[df_load_cla.shape[0]] = ["mlpc", para, "recall", r] df_load_cla.loc[df_load_cla.shape[0]] = ["mlpc", para, "fscore", f] # performance scores as target scores_perf = target_db[:, 2] # performance levels as target levels_perf = target_db[:,3] # convert 'A'... to 0... 
        levels_perf = [ord(x) - ord('A') for x in levels_perf]

        # invalidate feature selection
        # instance_data = db[select_perf]
        instance_data = db[feature]

        # regression process
        for i in range(6):
            for para in mlp_para:
                mlp_result, y_test = regression(instance_data, scores_perf, para)
                _mse, _R2 = regression_valuate(mlp_result, y_test)
                df_perf_regr.loc[df_perf_regr.shape[0]] = ["mlp", para, "mse", _mse]
                df_perf_regr.loc[df_perf_regr.shape[0]] = ["mlp", para, "R2", _R2]

        # classification process
        count = Counter(levels_perf)
        balance = np.array([count[i] for i in count]) / len(levels_perf) > 0.1
        _select = len(set(balance)) > 1
        if len(set(levels_perf)) > 1 and _select:
            for i in range(6):
                for para in rf_para:
                    rf_result, y_test = classification_rf(instance_data, levels_perf, para)
                    p, r, f = precision_recall_fscore_support(y_test, rf_result, average='macro')[:3]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["rf", para, "precision", p]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["rf", para, "recall", r]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["rf", para, "fscore", f]
                for para in mlp_para:
                    mlp_result, y_test = classification_mlpc(instance_data, levels_perf, para)
                    p, r, f = precision_recall_fscore_support(y_test, mlp_result, average='macro')[:3]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["mlpc", para, "precision", p]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["mlpc", para, "recall", r]
                    df_perf_cla.loc[df_perf_cla.shape[0]] = ["mlpc", para, "fscore", f]

    return df_load_cla, df_load_regr, df_perf_cla, df_perf_regr
# In[6]: datacopy = data.copy() data = apply_thresholding(data, thres=THRESH_BINARY_AND_THRESH_OTSU) data.head() # In[7]: training_features = data.copy() # In[8]: from sklearn.preprocessing import Imputer imputer = Imputer(strategy="median") imputer.fit(training_features) # In[9]: from sklearn.preprocessing import StandardScaler scalar = StandardScaler() scalar.fit(training_features) # In[10]: from sklearn.decomposition import PCA # In[11]: training_features = imputer.transform(training_features)
filename = 'parkinson.csv'
raw_data = open(filename, 'rt')
reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
td = list(reader)
data = numpy.array(td).astype('str')

x = data[:, 1:23]  # select columns 1 through 22, the features
x = numpy.array(x).astype('float')
w = data[:, 23]  # select column 23, the target label
print(w)

imp = Imputer(missing_values="NaN", strategy='median', axis=0)
x = imp.fit_transform(x)

w = numpy.array(w).astype('float')
# invert the labels: 0 -> 1, anything else -> 0
for i in range(0, len(x)):
    if w[i] == 0:
        w[i] = 1
    else:
        w[i] = 0
print("total no. of rows : {0}".format(len(data)))
print("total no. of missing rows of Pregnancies : {0}".format(len(data.loc[data['Pregnancies'] == 0])))
print("total no. of missing rows of Glucose : {0}".format(len(data.loc[data['Glucose'] == 0])))
print("total no. of missing rows of BloodPressure : {0}".format(len(data.loc[data['BloodPressure'] == 0])))
print("total no. of missing rows of SkinThickness : {0}".format(len(data.loc[data['SkinThickness'] == 0])))
print("total no. of missing rows of Insulin : {0}".format(len(data.loc[data['Insulin'] == 0])))
print("total no. of missing rows of BMI : {0}".format(len(data.loc[data['BMI'] == 0])))
print("total no. of missing rows of DiabetesPedigreeFunction : {0}".format(len(data.loc[data['DiabetesPedigreeFunction'] == 0])))
print("total no. of missing rows of Age : {0}".format(len(data.loc[data['Age'] == 0])))

# In[19]:

# treat zeros as missing and replace them with the column mean
from sklearn.preprocessing import Imputer
fill_values = Imputer(missing_values=0, strategy='mean', axis=0)
X_train = fill_values.fit_transform(X_train)
X_test = fill_values.transform(X_test)

# In[20]:

from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state=10)
random_forest_model.fit(X_train, y_train.ravel())

# In[18]:

predict_train_data = random_forest_model.predict(X_test)
def build_audit_na(classifier, name, with_proba=True, **kwargs): employment_mapping = { "CONSULTANT": "PRIVATE", "PSFEDERAL": "PUBLIC", "PSLOCAL": "PUBLIC", "PSSTATE": "PUBLIC", "SELFEMP": "PRIVATE", "PRIVATE": "PRIVATE" } gender_mapping = {"FEMALE": 0, "MALE": 1} mapper = DataFrameMapper([(["Age"], [ ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer( "numpy.where(pandas.notnull(X[:, 0]), X[:, 0], -999)"), name="flag_missing(Age, -999)"), Imputer(missing_values=-999) ])] + [(["Hours"], [ ContinuousDomain(missing_values=None, with_data=False), Alias(ExpressionTransformer( "numpy.where(pandas.isnull(X[:, 0]), -999, X[:, 0])"), name="flag_missing(Hours, -999)"), Imputer(missing_values=-999) ])] + [(["Income"], [ ContinuousDomain(missing_values=None, outlier_treatment="as_missing_values", low_value=5000, high_value=200000, with_data=False), Imputer() ])] + [(["Employment"], [ CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(), StringNormalizer(function="uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function="lowercase"), PMMLLabelBinarizer() ])] + [([column], [ CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(missing_values=None), StringNormalizer(function="lowercase"), PMMLLabelBinarizer() ]) for column in ["Education", "Marital", "Occupation"]] + [(["Gender"], [ CategoricalDomain(missing_values=None, with_data=False), CategoricalImputer(), StringNormalizer(function="uppercase"), LookupTransformer(gender_mapping, None) ])]) pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)]) pipeline.fit(audit_na_X, audit_na_y) customize(classifier, **kwargs) store_pkl(pipeline, name + ".pkl") adjusted = DataFrame(pipeline.predict(audit_na_X), columns=["Adjusted"]) if with_proba == True: adjusted_proba = DataFrame( pipeline.predict_proba(audit_na_X), columns=["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis=1) if isinstance(classifier, DecisionTreeClassifier): Xt = pipeline_transform(pipeline, audit_na_X) adjusted_apply = DataFrame(classifier.apply(Xt), columns=["nodeId"]) adjusted = pandas.concat((adjusted, adjusted_apply), axis=1) store_csv(adjusted, name + ".csv")
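# The Age and Hours mappings above use a "flag missing as -999, then impute -999" pattern.
# A small standalone illustration of the same idea with plain numpy/pandas and the old
# Imputer (not the sklearn2pmml transformers used above; the toy values are made up):
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer

age = pd.Series([33.0, None, 51.0, None])
flagged = np.where(pd.notnull(age), age, -999).reshape(-1, 1)  # mark missing entries as -999
imputed = Imputer(missing_values=-999).fit_transform(flagged)  # mean-impute the -999 flags
print(imputed.ravel())  # [33. 42. 51. 42.]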
def convertMissingValues(data):
    # replace NaN values with the mean of the column values
    imp = Imputer(missing_values=np.nan, strategy='mean')
    imp.fit(data)
    impdata = imp.transform(data)
    return impdata
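# A quick usage sketch (the toy array below is made up for illustration):
import numpy as np

X = np.array([[1.0, np.nan], [3.0, 4.0], [np.nan, 6.0]])
print(convertMissingValues(X))
# NaNs are replaced by the column means (2.0 and 5.0 here)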
def make_dataset(train_df, test_df):
    print("\n****************************************************")
    print("make dataset")
    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))

    # Label / One Hot Encoding
    # yes/No -> 1/0
    le = LabelEncoder()
    le_count = 0
    # only label encode those variables with 2 or fewer categories
    for col in train_df:
        if train_df[col].dtype == "object":
            # if 2 or fewer unique categories
            if len(list(train_df[col].unique())) <= 2:
                print(col)
                # Train on the training data
                le.fit(train_df[col])
                # Transform both training and testing data
                train_df[col] = le.transform(train_df[col])
                test_df[col] = le.transform(test_df[col])
                # keep track of how many columns were label encoded
                le_count += 1
    print("{} columns were label encoded.\n".format(le_count))

    train_df = pd.get_dummies(train_df)
    test_df = pd.get_dummies(test_df)
    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))

    target = train_df["TARGET"]
    train_df, test_df = train_df.align(test_df, join="inner", axis=1)
    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))

    if "TARGET" in train_df:
        train_df = train_df.drop("TARGET", axis=1)
    features = list(train_df.columns)

    # Median imputation of missing values
    imputer = Imputer(strategy="median")
    print("DONE: Imputation")
    # Scale each feature 0 - 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    print("DONE: Scale")

    # Fit on the training data
    imputer.fit(train_df)
    print("DONE: Fit")
    # Transform both training and test data
    train_df = imputer.transform(train_df.astype(np.float32))
    test_df = imputer.transform(test_df.astype(np.float32))
    print("DONE: Transform\n")

    # Repeat with the scaler
    scaler.fit(train_df)
    train_df = scaler.transform(train_df)
    test_df = scaler.transform(test_df)
    print("DONE: Scaler Transform")
    print("Train data features shape: {}".format(train_df.shape))
    print("Test data features shape: {}".format(test_df.shape))

    np.save("../../all/train_X", train_df)
    np.save("../../all/train_target", target)
    np.save("../../all/test", test_df)
    print("DONE! saved train_X.npy, train_target.npy, test.npy")

    # X_train, X_val, y_train, y_val = train_test_split(train_df, target, test_size=0.2, random_state=0)
    # print("X_train shape: {}".format(X_train.shape))
    # print("X_val shape: {}".format(X_val.shape))
    # print("y_train shape: {}".format(y_train.shape))
    # print("y_val shape: {}".format(y_val.shape))
    # np.save("../../all/X_train", X_train)
    # np.save("../../all/X_val", X_val)
    # np.save("../../all/y_train", y_train)
    # np.save("../../all/y_val", y_val)

    print("\n*********** DONE! ***************")
from sklearn.metrics import precision_score from sklearn.metrics import recall_score # Necessary to use SMOTE in the same pipeline as sklearn from imblearn.pipeline import Pipeline as imb_pipeline # Plotting import seaborn as sns import matplotlib.pyplot as plt # Added later so results can be replicable # Set to None and functions that use this will use their default random state RANDOM_STATE = 1 # Variables base_pipeline = [('imputer', Imputer(strategy='median')), ('resampling', SMOTE(random_state=RANDOM_STATE)), ('selection', SelectKBest(score_func=f_classif)), ('scaler', StandardScaler()), ('pca', PCA())] # Stuff we want to test for each model before doing careful tuning base_param_grid = { 'scaler': [None, StandardScaler()], 'selection__k': [7, 10, 15], 'pca': [None, PCA(n_components=2), PCA(n_components=4), PCA(n_components=6)], } financial_features = [
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
import keras
from keras.utils import np_utils

# PART 1 - DATA PREPROCESSING
data = pd.read_csv('cancer.data')

# The data may contain lost or missing values.
# As a first step, missing values are replaced with the value -99999. (Optional)
data.replace('?', -99999, inplace=True)

# With the Imputer class, missing values are filled in using methods such as the mean,
# standard deviation, etc. of the column (feature) they belong to.
imp = Imputer(missing_values=-99999, strategy="mean", axis=0)
data = pd.DataFrame(imp.fit_transform(data))
data = data.drop(0, 1)

# The loaded data is separated into inputs and outputs.
output_data = np.array(data.iloc[:, 9])
input_data = np.array(data.iloc[:, :9])

# Since the output is categorical, it has to be converted to a numeric representation.
output_data = np_utils.to_categorical(output_data)

# The data is split so that 80% is training data and 20% is test data.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2,
# In[3]: design_matrix = cross_section.loc[:,['Region','RegionName','AveragePrice','Population','dist_from_lon', 'Unemployment_rate','GDP_per_cap','Inflation_index','Inflation', 'Local_pshs','Outstanding_perc','Good_perc', 'Inad_perc','Non_UK_born','Migrant_inflow_monthly', 'Migrant_outflow_monthly','Net_immigration_monthly', 'LFS_active_perc']] design_matrix['Real_GVA_per_cap'] = design_matrix['GDP_per_cap'] *100/ design_matrix['Inflation_index'] from sklearn.preprocessing import Imputer design_matrix[['Outstanding_perc','Good_perc']] =Imputer(strategy = 'median').fit_transform(design_matrix[['Outstanding_perc','Good_perc']]) design_matrix[['Inad_perc']] =Imputer(strategy = 'mean').fit_transform(design_matrix[['Inad_perc']]) design_matrix[['Local_pshs']] = Imputer(strategy = 'median').fit_transform(design_matrix[['Local_pshs']]) cols_to_create = ['Migrant_inflow_per_cap','Migrant_outflow_per_cap','Net_immig_per_cap','Non_UK_per_cap','Local_pshs_per_cap'] cols_to_use = ['Migrant_inflow_monthly','Migrant_outflow_monthly','Net_immigration_monthly','Non_UK_born','Local_pshs'] for new, old in zip(cols_to_create, cols_to_use): design_matrix[new] = design_matrix[old]/design_matrix['Population'] region_dummies = pd.get_dummies(cross_section.loc[:,['Region']],drop_first = True) dummy_names = region_dummies.columns.values
def Submit():
    p = preg.get()
    g = gl.get()
    bp = BP.get()
    st = ST.get()
    i = insulin.get()
    b = bmi.get()
    d = dpf.get()
    a = age.get()
    if (p == " " or g == " " or bp == " " or st == " " or i == " "
            or b == " " or d == " " or a == " "):
        messagebox.showinfo("ERROR", "Please fill all the entries")
    else:
        l1 = [[p, g, bp, st, i, b, d, a]]
        print(l1)

        # Logistic Regression
        # Importing the libraries
        import numpy as np
        #import matplotlib.pyplot as plt
        import pandas as pd

        # Importing the dataset
        dataset = pd.read_csv('diabetes.csv')
        X = np.array(dataset.drop('Outcome', 1))
        y = np.array(dataset['Outcome'])
        l1 = np.array(l1)

        # Splitting the dataset into the Training set and Test set
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

        # Data Preprocessing
        # Taking care of missing data
        from sklearn.preprocessing import Imputer
        imputer = Imputer(missing_values=0, strategy='mean', axis=0)
        imputer = imputer.fit(X[:, 1:9])  # columns 1 through 8; the upper bound is excluded
        X[:, 1:9] = imputer.transform(X[:, 1:9])

        # Feature Scaling
        from sklearn.preprocessing import StandardScaler
        sc_X = StandardScaler()
        X_train = sc_X.fit_transform(X_train)
        X_test = sc_X.transform(X_test)
        l1_test = sc_X.transform(l1)

        # Fitting Logistic Regression to the Training set
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(random_state=0)
        classifier.fit(X_train, y_train)

        # Predicting the Test set results
        y_pred = classifier.predict(X_test)

        # Making the Confusion Matrix
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_test, y_pred)
        # cm is a 2x2 array of correct and incorrect predictions per class

        # example of making a single class prediction
        from sklearn.datasets.samples_generator import make_blobs
        # generate a synthetic classification dataset
        X, y = make_blobs(n_samples=100, centers=2, n_features=8, random_state=1)
        # fit final model
        model = LogisticRegression()
        model.fit(X, y)
        # define one new instance
        Xnew = l1_test
        # make a prediction
        ynew = model.predict(Xnew)
        print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

        out = ynew[0]
        if out == 0:
            messagebox.showinfo("Result", "You don't have diabetes")
        elif out == 1:
            messagebox.showinfo("Result", "You have diabetes")