Ejemplo n.º 1
0
def main_deprecated():
    """Deprecated entry point: clean the raw diabetes CSV and pickle the result.

    Kept for reference only -- do not call this from new code.

    The function reads ``diabetic_data.csv`` (treating "n/a", "na", "--" and
    "?" as missing), passes the frame through the ``DataCleaning`` helpers
    (``clean_columns`` with ``missing_bound=.2``, then ``fill_missing_values``
    on the columns reported by ``get_cols_having_missing_values``), removes a
    fixed list of identifier-like columns and finally stores the frame via
    ``Util.save_df``. Progress is reported with two ``print`` calls.
    """
    # This is deprecated, never use this please.
    print("This is main, alhumdulliah")

    # ---- data cleaning ----
    na_tokens = ["n/a", "na", "--", "?"]
    frame = pd.read_csv(
        '../dataset_diabetes/diabetic_data.csv',
        delimiter=',',
        na_values=na_tokens,
    )

    cleaner = DataCleaning()
    # NOTE(review): presumably drops columns whose missing ratio exceeds the
    # bound -- confirm against DataCleaning.clean_columns.
    frame = cleaner.clean_columns(frame, missing_bound=.2)

    # Columns that still contain missing values after the step above.
    incomplete_cols = cleaner.get_cols_having_missing_values(frame, False)
    frame = cleaner.fill_missing_values(frame, incomplete_cols)

    unwanted_cols = [
        "encounter_id",
        "patient_nbr",
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
        "num_procedures",
    ]
    frame = cleaner.just_remove_columns(frame, columns=unwanted_cols)

    Util().save_df(frame, "../only_calculated_datasets/cleaned_df.pkl")
    print("Filled the missing values either by the mode or mean value")
Ejemplo n.º 2
0
def clean():
    """Produce ``cleaned_df.pkl`` from the raw diabetes CSV.

    Pipeline, in order:

    1. Load ``../dataset_diabetes/diabetic_data.csv``, mapping the tokens
       "n/a", "na", "--" and "?" to missing values.
    2. ``DataCleaning.clean_columns`` with ``missing_bound=.2``.
    3. ``DataCleaning.fill_missing_values`` on every column that
       ``get_cols_having_missing_values`` still reports.
    4. ``DataCleaning.just_remove_columns`` for six fixed columns.
    5. Persist the frame with ``Util.save_df``.

    Returns nothing; the only output is the pickle file.
    """
    source_csv = '../dataset_diabetes/diabetic_data.csv'
    target_pickle = "../only_calculated_datasets/cleaned_df.pkl"
    placeholders = ["n/a", "na", "--", "?"]
    columns_to_drop = [
        "encounter_id", "patient_nbr", "admission_type_id",
        "discharge_disposition_id", "admission_source_id", "num_procedures",
    ]

    table = pd.read_csv(source_csv, delimiter=',', na_values=placeholders)

    dc = DataCleaning()
    table = dc.clean_columns(table, missing_bound=.2)
    # Second argument is passed through unchanged from the original call;
    # its meaning is defined by DataCleaning.
    still_missing = dc.get_cols_having_missing_values(table, False)
    table = dc.fill_missing_values(table, still_missing)
    table = dc.just_remove_columns(table, columns=columns_to_drop)

    saver = Util()
    saver.save_df(table, target_pickle)
Ejemplo n.º 3
0
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from visualization import Visualization
from data_cleaning import DataCleaning

# Script body: load the diabetes CSV, run it through the DataCleaning helpers
# and dump the resulting NumPy array to stdout.

# Tokens in the CSV that should be read as missing values.
missing_values = ["n/a", "na", "--", "?"]

data = pd.read_csv(
    '../dataset_diabetes/diabetic_data.csv',
    delimiter=',',
    na_values=missing_values,
)

data_cleaning = DataCleaning()

# `data` is rebound after every cleaning stage.
data = data_cleaning.clean_columns(data, missing_bound=0.2)
colsMissingValues = data_cleaning.get_cols_having_missing_values(data, False)
data = data_cleaning.fill_missing_values(data, colsMissingValues)

# An earlier experiment that gathered the unique values of each string column
# used to sit here inside an inert string literal; it never executed.

# Convert the cleaned frame to a plain array and show it.
data = data.to_numpy()
print(data)
Ejemplo n.º 4
0
def main():
    """Clean the diabetes data, tune a decision tree and report its accuracy.

    Steps:

    1. Load ``diabetic_data.csv`` and run it through the ``DataCleaning``
       helpers (``clean_columns`` with ``missing_bound=0.2``, then
       ``fill_missing_values``).
    2. Convert the frame to a NumPy array and label-encode every column whose
       first-row value is a ``str``.
    3. Use the first 80000 rows for training and the remaining rows for
       testing; the last column is the target, all others are features.
    4. Grid-search ``DecisionTreeClassifier`` hyper-parameters with 5-fold
       cross-validation scored on accuracy.
    5. Refit a tree with the best parameters and print the test accuracy and
       the value returned by ``kFoldVal``.

    All output goes to stdout; nothing is returned.

    Raises:
        ValueError: if the cleaned data has no rows left for the test split.
    """
    # ---- Data cleaning ----
    missing_values = ["n/a", "na", "--", "?"]
    data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                       delimiter=',',
                       na_values=missing_values)

    data_cleaning = DataCleaning()
    data = data_cleaning.clean_columns(data, missing_bound=0.2)

    cols_missing_values = data_cleaning.get_cols_having_missing_values(
        data, False)
    data = data_cleaning.fill_missing_values(data, cols_missing_values)
    # ---- Data cleaning done ----

    data = data.to_numpy()

    # Take the column count from the array itself: the cleaning step may have
    # removed columns, so a hard-coded 50 could index past the end or pick the
    # wrong target column.
    n_cols = data.shape[1]
    label_col = n_cols - 1

    # Replace string-valued columns by integer codes. Only the first row is
    # inspected to decide whether a column holds strings.
    le = LabelEncoder()
    for i in range(n_cols):
        if isinstance(data[0][i], str):
            data[:, i] = le.fit_transform(data[:, i])

    print(data)
    print(data.shape)

    train_rows = 80000
    if data.shape[0] <= train_rows:
        # Fail before the expensive grid search rather than at predict time.
        raise ValueError(
            "expected more than {} rows, got {}".format(
                train_rows, data.shape[0]))

    X_train, X_test = data[:train_rows, :label_col], data[train_rows:, :label_col]
    # Targets are kept as (n, 1) column slices, as before, so their printed
    # shape and what kFoldVal receives are unchanged.
    Y_train, Y_test = data[:train_rows, label_col:], data[train_rows:, label_col:]
    Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

    print(X_train)
    print(X_train.shape)
    print(Y_train)
    print(Y_train.shape)

    grid_params = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [2, 4, 6],
        'min_samples_leaf': [0.02, 0.04],
        'min_samples_split': [0.2, 0.5, 0.8]
    }

    dt = DecisionTreeClassifier(random_state=50)

    # One model is built per hyper-parameter combination, each evaluated with
    # 5-fold cross-validation; the best one is chosen by accuracy.
    grid_object = GridSearchCV(estimator=dt,
                               param_grid=grid_params,
                               scoring='accuracy',
                               cv=5,
                               n_jobs=-1)

    # Every print below takes a single argument so the output is the same
    # whether or not print is a function in the running interpreter.
    print("\nHyper Parameter Tuning Begins\n")
    grid_object.fit(X_train, Y_train)

    print("\n\nBest Param Values \t\t\n\n")
    print(grid_object.best_params_)
    # ---- Hyper-parameter tuning ends ----

    # ---- Report accuracy on the test set with the best parameters ----
    # best_params_ holds exactly the keys of grid_params, all of which are
    # DecisionTreeClassifier keyword arguments.
    best_params = grid_object.best_params_
    dt = DecisionTreeClassifier(random_state=50, **best_params)

    dt.fit(X_train, Y_train)
    Y_pred = dt.predict(X_test)

    # The two spaces after "=" reproduce the original statement's output
    # (label ending in a space, plus print's separator).
    print("Accuracy score Test =  {}".format(
        accuracy_score(Y_test, Y_pred) * 100))

    print("Accuracy score 5-Fold =  {}".format(
        kFoldVal(X_train, Y_train, dt, 5)))