# import modules import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from keras.models import load_model import matplotlib.pyplot as plt from one_hot_encoding import one_hot_encoder import feature_index import csv testdata = pd.read_csv('./data/criminal_test.csv') perid = testdata.iloc[:, 0:1].values testdata = testdata.drop(['PERID'], axis=1) data = testdata[feature_index.categorical] data_one_hot_encoded = one_hot_encoder(data) X = pd.concat([testdata, data_one_hot_encoded], axis=1) # Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X = sc.fit_transform(X) # Load model model = load_model('./model/checkpoints/criminal-ann-009-loss0.098-acc0.958.hdf5') scores = model.predict(np.array(X)) myData = [["PERID", "Criminal"]] i=0
# Training-set preprocessing: finish collecting categorical columns, handle
# missing values, one-hot encode, and begin the train/test split.
# NOTE(review): this chunk begins mid-file — the first two statements appear
# to be the tail of a loop over traindata's columns (cf. the matching
# test-set script); their original indentation is not visible here.
categorical_cols.append(col)
print(traindata[col].value_counts())

# Taking care of missing values in Columns with categorical data where Imputation wont work
if (traindata.isnull().sum().sum()):
    traindata.dropna(inplace=True)
# NOTE(review): after dropna above, this forward-fill is a no-op — confirm
# whether one of the two missing-value strategies was meant to be removed.
traindata.fillna(method='ffill', inplace=True)

# Target is the last column; 'id' and target column 'P' are removed from the
# feature set and from the numeric-column list.
y = traindata.iloc[:, -1].values
train_set = traindata.drop(['id', 'P'], axis=1)
numeric_cols.remove('id')
numeric_cols.remove('P')

# one-hot encoding
data = train_set[categorical_cols]
one_hot_encoded_data_train = one_hot_encoder(data, categorical_cols)
X = pd.concat([train_set[numeric_cols], one_hot_encoded_data_train], axis=1)

# Encoding the Independent Variable (superseded by one_hot_encoder above)
#from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#labelencoder = LabelEncoder()
#catg_index = [0,3,4,5,6,8,9,11,12]
#for item in catg_index:
#    train_set[:, item] = labelencoder.fit_transform(train_set[:, item])
#onehotencoder = OneHotEncoder(categorical_features = catg_index)
#X = onehotencoder.fit_transform(train_set).toarray()

# ====================== Splitting the dataset into the Training set and Test set =============================
# NOTE(review): this call is truncated here — its remaining arguments
# continue beyond this chunk.
X_train, X_test, y_train, y_test = train_test_split(X, y,
print(sys.version) # load data loan = pd.read_csv('./data/loan.csv') # pre-process data drop_null_columns(loan) loan_in_progress = split_loan_in_progress(loan) loan = categorize_target(loan) # Feature Engineering by EDA trim_features(loan) # one-hot encoding loan = loan[feature_index.features] loan_one_hot_encoded = one_hot_encoder(loan) # Train-Test split y = loan_one_hot_encoded.loan_status_coded X = loan_one_hot_encoded.drop("loan_status_coded", axis=1) x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y) # oversample_SMOTE #x_train, y_train = oversample_smote(x_train, y_train) # Neural Network model y_train = encode_neural_net_y(y_train) y_test = encode_neural_net_y(y_test) model = Sequential() model.add(Dense(34, input_dim=66, activation='relu'))
# Criminal-classification training preprocessing: z-score outlier removal,
# one-hot encoding, train/test split, and feature scaling; the ANN layer
# definitions continue beyond this chunk.
labels = list(traindata)
# Drop any row whose value in a column is more than 3.8 standard deviations
# from that column's mean (per-column z-score outlier filter, applied
# column by column).
for label in labels:
    traindata = traindata[np.abs(traindata[label]-traindata[label].mean()) <= (3.8*traindata[label].std())]

# pre-process data
# NOTE(review): the result of isnull().sum() is discarded — probably meant
# to be printed/inspected.
traindata.isnull().sum()
num_cols = traindata._get_numeric_data().columns
# Inspect value distributions of every column.
for col in traindata:
    print(traindata[col].value_counts())

# Target is the last column; drop the identifier and the target from the
# feature matrix.
y = traindata.iloc[:, -1].values
traindata = traindata.drop(['PERID', 'Criminal'], axis=1)
# Except VESTR and ANALWT_C all others are Categorical data. Hence they need feature scaling.

# one-hot encoding
data = traindata[feature_index.categorical]
one_hot_encoded_data = one_hot_encoder(data)

# Splitting the dataset into the Training set and Test set
X = pd.concat([traindata, one_hot_encoded_data], axis=1)
# stratify=y preserves the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply the same
# transformation to the held-out split (no leakage).
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# ========================================================Neural Network model==================================================================
# Initialising the ANN
model = Sequential()
# Adding the input layer and the first hidden layer
# Test-set preprocessing: mirror the training pipeline's one-hot encoding and
# align the test feature columns with the training feature columns before
# scaling.  (Uses one_hot_encoded_data_train produced by the training script.)
categorical_cols = []
# NOTE(review): loop/if nesting reconstructed from the matching training-set
# chunk — original indentation is not visible here.
for col in testdata:
    if (col not in numeric_cols):
        categorical_cols.append(col)
        print(testdata[col].value_counts())

# Taking care of missing values in Columns with categorical data where Imputation wont work
testdata.fillna(method='ffill', inplace=True)
test_set = testdata.drop(['id'], axis=1)
numeric_cols.remove('id')

# one-hot encoding
data = test_set[categorical_cols]
one_hot_encoded_data_test = one_hot_encoder(data, categorical_cols)
# Dummy columns present in the training encoding but absent from the test
# encoding (categories unseen in the test set) are added as all-zero columns
# so the test matrix has the same feature set as the training matrix.
feature_difference = set(one_hot_encoded_data_train) - set(
    one_hot_encoded_data_test)
feature_difference_df = pd.DataFrame(data=np.zeros(
    (one_hot_encoded_data_test.shape[0], len(feature_difference))),
    columns=list(feature_difference))
one_hot_encoded_data_test = one_hot_encoded_data_test.join(
    feature_difference_df)
X = pd.concat([test_set[numeric_cols], one_hot_encoded_data_test], axis=1)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# NOTE(review): the scaler is re-fit on the test features instead of reusing
# the scaler fitted on the training set — confirm this matches the training
# script before trusting the predictions below.
X = sc.fit_transform(X)

# ========================================= Predicting the Test set results ======================================