Example no. 1
def upper_region_classifier():
    ur_train_set = pd.read_csv("../resources/datasets/ur_train_set.csv",
                               na_values='?',
                               dtype='category')
    ur_test_set = pd.read_csv("../resources/datasets/ur_test_set.csv",
                              na_values='?',
                              dtype='category')

    # Separate training features & training labels
    X_train = ur_train_set.drop(['class'], axis=1)
    y_train = ur_train_set['class']

    # Separate testing features & testing labels
    X_test = ur_test_set.drop(['class'], axis=1)
    y_test = ur_test_set['class']

    # get_baseline_performance(X_train, y_train, X_test, y_test)
    # spot_check_algorithms(X_train, y_train)

    model = RandomForestClassifier()
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_train)
    print_evaluation_results(y_train, predictions)

    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(model, filename='../resources/models/ur_classifier.pkl')
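The snippets in this collection omit their imports; they assume the usual stack (pandas as pd, numpy as np, scikit-learn, imblearn, joblib, time). They also share a print_evaluation_results helper that is never shown. A minimal sketch, assuming it simply reports standard classification metrics:

from sklearn.metrics import accuracy_score, classification_report

def print_evaluation_results(y_true, predictions, train=True):
    # Hypothetical sketch of the shared helper used throughout these examples
    label = "Training" if train else "Test"
    print(f"{label} accuracy: {accuracy_score(y_true, predictions):.3f}")
    print(classification_report(y_true, predictions))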
Example no. 2
def thoracic_region_classifier():
    tr_train_set = pd.read_csv("../resources/datasets/tr_train_set.csv",
                               na_values='?',
                               dtype='category')
    tr_test_set = pd.read_csv("../resources/datasets/tr_test_set.csv",
                              na_values='?',
                              dtype='category')

    # Separate training features & training labels
    X_train = tr_train_set.drop(['class'], axis=1)
    y_train = tr_train_set['class']

    # Separate testing features & testing labels
    X_test = tr_test_set.drop(['class'], axis=1)
    y_test = tr_test_set['class']

    get_baseline_performance(X_train, y_train, X_test, y_test)

    model = RandomForestClassifier()
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_train)
    print_evaluation_results(y_train, predictions)

    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(model, filename='../resources/models/tr_classifier.pkl')
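get_baseline_performance is another assumed helper; a plausible sketch that measures a majority-class baseline to beat:

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def get_baseline_performance(X_train, y_train, X_test, y_test):
    # Hypothetical sketch: always predict the most frequent class
    baseline = DummyClassifier(strategy="most_frequent")
    baseline.fit(X_train, y_train)
    print("Baseline test accuracy:",
          accuracy_score(y_test, baseline.predict(X_test)))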
Example no. 3
def ep_region_classifier():
    ep_dataset = pd.read_csv("../resources/datasets/ep_train_set.csv", na_values='?', dtype='category')
    # ep_test_set = pd.read_csv("../resources/datasets/ep_test_set.csv", na_values='?', dtype='category')

    # Separate training features & training labels
    X_train = ep_dataset.drop(['class'], axis=1)
    y_train = ep_dataset['class']

    # Separate testing features & testing labels
    # NOTE: the test set below reuses the training file, so the final
    # test evaluation is on data the model was trained on
    X_test = ep_dataset.drop(['class'], axis=1)
    y_test = ep_dataset['class']

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.1, random_state=42, shuffle=True)

    model = RandomForestClassifier()

    ########################################### Hyper-parameter Tuning ##########################################
    # Perform grid search on the classifier using micro-averaged F1 as the scoring metric
    grid_obj = GridSearchCV(
        estimator=model,
        param_grid={
            'n_estimators': [10, 20, 30],
            'max_depth': [6, 10, 20, 30],
            'min_samples_split': [2, 10, 100]  # min_samples_split must be >= 2
        },
        n_jobs=-1,
        scoring="f1_micro",
        cv=5,
        verbose=3
    )

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X_train, y_train)

    # Get the best estimator
    best_clf = grid_fit.best_estimator_
    print(best_clf)

    predictions = best_clf.predict(X_val)
    print_evaluation_results(y_val, predictions, train=False)

    model = best_clf



    ########################################### Final Model ###########################################
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_train)
    print_evaluation_results(y_train, predictions)

    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(model, filename='../resources/models/ep_classifier.pkl', protocol=2)
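spot_check_algorithms, referenced (mostly commented out) across these snippets, is also not shown. A minimal sketch, assuming it compares a few candidate models with cross-validation:

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

def spot_check_algorithms(X, y):
    # Hypothetical sketch: 5-fold CV over a handful of default models
    models = {
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "kNN": KNeighborsClassifier(),
    }
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=5, scoring="f1_micro")
        print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")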
Example no. 4
def classify_by_region(data_frame):
    get_details(data_frame)
    print("Before Oversampling By Region\n", data_frame.groupby('region').size())
    # sns.countplot(data_frame['region'], label="Count")
    # plt.show()

    # sns.heatmap(data_frame.drop('region', axis=1), cmap='cool', annot=True)
    # plt.show()

    # get_feature_correlations(data_frame, plot=True, return_resulst=False)


    X = data_frame.drop(['region', 'class'], axis=1)  # Features - drop region and class
    y = data_frame['region']  # Labels


    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    # X_validation, X_test, y_validation, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True)


    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    print("After Oversampling By Region\n", (pd.DataFrame(y_resampled)).groupby('region').size())
    # X_resampled.to_csv('resources/data/X_resampled.csv', index=False)
    # y_resampled.to_csv('resources/data/y_resampled.csv', header=['region'], index=False)



    ###############################################################################
    #                               4. Scale data                                 #
    ###############################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)



    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()
    #
    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    # NOTE: k='all' keeps every feature, so these transformed copies are
    # effectively identical to the inputs (and are unused below)
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)



    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)



    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)



    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes = [100]*5, random_state=42))

    # NOTE: this pipeline is defined but never used; the grid search below
    # tunes a bare GradientBoostingClassifier instead (see the sketch after
    # this example for wiring the pipeline in)
    pipeline = Pipeline(
            [
                # ('selector', SelectKBest(f_classif)),
                ('model', RandomForestClassifier(n_jobs=-1))
            ]
    )

    # Perform grid search on the classifier using micro-averaged F1 as the scoring metric
    grid_obj = GridSearchCV(
            estimator= GradientBoostingClassifier(),
            param_grid={
                # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                'n_estimators': [10, 20, 30],
                'max_depth': [6, 10, 20, 30],
                # 'max_depth': [1, 10, 20, 30],
                'min_samples_split': [2, 10, 100]  # min_samples_split must be >= 2
                # 'model__n_estimators': np.arange(10, 200, 10)
                # 'C': [1, 10, 100]
            },

            n_jobs=-1,
            scoring="f1_micro",
            cv=5,
            verbose=3
    )

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit = grid_obj.fit(X_resampled, y_resampled)

    # Get the best estimator
    best_clf = grid_fit.best_estimator_
    print(best_clf)




    # Get the final model
    parent_model = best_clf # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    # custom helper, not sklearn.metrics.confusion_matrix (see sketch below)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)
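The Pipeline defined above never reaches the grid search, which tunes a bare GradientBoostingClassifier instead. A sketch of searching the pipeline itself, assuming the same step name 'model' (parameters then need the model__ prefix):

pipeline_grid = GridSearchCV(
    estimator=pipeline,
    param_grid={
        'model__n_estimators': [10, 20, 30],
        'model__max_depth': [6, 10, 20, 30],
        'model__min_samples_split': [2, 10, 100],
    },
    n_jobs=-1,
    scoring="f1_micro",
    cv=5,
)

The confusion_matrix called at the end is a project-local helper, not sklearn.metrics.confusion_matrix (the call signature differs). A minimal sketch, assuming it just prints the train and test matrices:

from sklearn.metrics import confusion_matrix as sk_confusion_matrix

def confusion_matrix(model, X_train, y_train, X_test, y_test):
    # Hypothetical sketch of the local helper; it shadows sklearn's
    # confusion_matrix, hence the aliased import
    print("Train:\n", sk_confusion_matrix(y_train, model.predict(X_train)))
    print("Test:\n", sk_confusion_matrix(y_test, model.predict(X_test)))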
Example no. 5
def thoracic_region_classifier():
    data_frame = pd.read_csv("../resources/datasets/thoracic_region.csv",
                             na_values='?',
                             dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    # make_boolean(data_frame)
    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 1, 22

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)
    # pd.DataFrame(X_train).to_csv('resources/data/X_train2.csv', index=False)
    # pd.DataFrame(X_test).to_csv('resources/data/X_test2.csv', index=False)
    # pd.DataFrame(y_train).to_csv('resources/data/y_train2.csv', index=False)
    # pd.DataFrame(y_test).to_csv('resources/data/y_test2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # smote = BorderlineSMOTE()
    # with sampling_strategy='minority': hamming loss increases; accuracy,
    # jaccard, average f1 and macro avg decrease
    smote = RandomOverSampler()  # NOTE: plain random over-sampling, not SMOTE
    X_resampled2, y_resampled2 = smote.fit_resample(X_train, y_train)
    # pd.DataFrame(X_resampled2).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled2).to_csv('resources/data/y_resampled2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    df = pd.DataFrame(y_resampled2)
    print(df.groupby('class').size())

    # sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    # X_train_chi2 = sel_chi2.fit_transform(X_resampled2, y_resampled2)
    # print(sel_chi2.get_support())
    #
    # X_test_chi2 = sel_chi2.transform(X_test)
    # print(X_test.shape)
    # print(X_test_chi2.shape)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test = sc.transform(X_test)

    # Spot Check Algorithms
    # spot_check_algorithms(X_train_chi2, y_resampled2)

    # Make predictions on validation dataset using the selected model
    # Score notes: MLP 0.88; ExtraTreeClassifier 0.73, 0.97, 0.94, 0.91, 0.94, 0.94, 0.86;
    # RF 0.88, 0.88; GB 0.89, 0.89; LR 0.88, 0.88;
    # LogisticRegression(solver='liblinear', multi_class='ovr') 0.92;
    # kNN 0.87, 0.92, 0.84; DT 0.94, 0.94, 0.89, 0.94;
    # SVC(gamma='auto') 0.94; MultinomialNB() 0.88
    thoracic_model = DecisionTreeClassifier()
    # models2 =  VotingClassifier(
    #     estimators=[('rf', random_forest), ('knn', KNeighborsClassifier(n_neighbors=5)), ('NB', GaussianNB())],
    #     voting='hard') # 0.74

    # Train the final model
    thoracic_model = thoracic_model.fit(X_resampled2, y_resampled2)

    # Evaluate the final model on the training set
    predictions = thoracic_model.predict(X_resampled2)
    print_evaluation_results(y_resampled2, predictions)

    # Evaluate the final model on the test set
    predictions = thoracic_model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(thoracic_model,
                filename='../resources/models/sub_classifier_2.pkl')
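get_details is another assumed helper; a plausible sketch that prints a quick structural summary of the frame:

def get_details(data_frame):
    # Hypothetical sketch: shape, dtypes and missing-value counts
    print(data_frame.shape)
    print(data_frame.dtypes)
    print(data_frame.isnull().sum())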
Example no. 6
def classify_by_region(data_frame):
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET],
                        axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels

    # ['age', 'degree-of-diffe', 'sex_2', 'histologic-type_2', 'bone_2',
    #  'neck_2', 'mediastinum_2', 'abdominal_2']

    # data_frame.drop(['lung_2', 'pleura_2', 'peritoneum_2', 'liver_2', 'brain_2', 'skin_2', 'supraclavicular_2',
    #                  'axillar_2', 'bone-marrow_2'], axis=1, inplace=True)
    # get_feature_correlations(data_frame, plot=True, return_resulst=False)
    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)

    # 0.3 test size = 0.56 f1
    # 0.2 test size = 0.61 f1
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        test_size=0.2,
        random_state=RANDOM_STATE,
        shuffle=True)

    # reject_sampler = FunctionSampler(func=outlier_rejection)
    # X_train, y_train = reject_sampler.fit_resample(X_train, y_train)

    # Baseline

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    ##########   Handle Class Imbalance  #########
    sm = BorderlineSMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    print("After Oversampling By Region\n",
          (pd.DataFrame(y_resampled)).groupby('region').size())

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    sf = SelectKBest(f_classif, k='all')
    sf_fit = sf.fit(X_resampled, y_resampled)
    # print feature scores
    for i in range(len(sf_fit.scores_)):
        print(' %s: %f' % (X_resampled.columns[i], sf_fit.scores_[i]))

    # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # plt.show()

    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)

    model = RandomForestClassifier(random_state=RANDOM_STATE)  # keyword, not positional: a bare RANDOM_STATE would be read as n_estimators

    ########################################### Hyper-parameter Tuning ##########################################
    best_clf_rf = tune_random_forest(model, X_resampled, y_resampled)

    # NOTE: the commented-out grid below mixes XGBoost-style parameter names
    # (min_child_weight, gamma, colsample_bytree) into a GradientBoostingClassifier
    # g = GridSearchCV(
    #     estimator=GradientBoostingClassifier(),
    #     param_grid={
    #         "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
    #          "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
    #          "min_child_weight" : [ 1, 3, 5, 7 ],
    #          "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    #          "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
    #     },
    #     n_jobs=-1,
    #     scoring="f1_micro",
    #     cv=5,
    #     verbose=1
    # )
    # # Fit the grid search object to the training data and find the optimal parameters
    # grid_fit = grid_obj.fit(X_resampled, y_resampled)
    #
    # # Get the best estimator
    # best_clf_gb= grid_fit.best_estimator_
    # print(best_clf_gb)

    ########################################### Final Model ###########################################
    parent_model = best_clf_rf  # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)

    # https://towardsdatascience.com/feature-selection-using-random-forest-26d7b747597f
    sel = SelectFromModel(best_clf_rf)
    sel.fit(X_resampled, y_resampled)
    print(sel.get_support())
    selected_feat = X_resampled.columns[(sel.get_support())]
    print(len(selected_feat))
    print(selected_feat)
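tune_random_forest, used above and again in a later snippet, is not shown either. A minimal sketch, assuming it wraps a small grid search and returns the refit best estimator:

from sklearn.model_selection import GridSearchCV

def tune_random_forest(model, X, y):
    # Hypothetical sketch: small grid over the forest, refit on the best params
    grid = GridSearchCV(
        estimator=model,
        param_grid={'n_estimators': [50, 100, 200],
                    'max_depth': [10, 20, None]},
        scoring='f1_micro',
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X, y)
    print(grid.best_params_)
    return grid.best_estimator_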
Example no. 7
def ip_region_classifier():
    ip_train_set = pd.read_csv("../resources/datasets/ip_train_set.csv",
                               dtype='category')
    ip_test_set = pd.read_csv("../resources/datasets/ip_test_set.csv",
                              dtype='category')

    # print("ip missing ", ip_train_set.isnull().sum().sum())
    # get_feature_correlations(ip_train_set)

    # Separate training features & training labels
    X_train = ip_train_set.drop(['class'], axis=1)
    y_train = ip_train_set['class']

    # Separate testing features & testing labels
    X_test = ip_test_set.drop(['class'], axis=1)
    y_test = ip_test_set['class']

    get_baseline_performance(X_train, y_train, X_test, y_test)

    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model = model.fit(X_train, y_train)

    # https://towardsdatascience.com/machine-learning-kaggle-competition-part-two-improving-e5b4d61ab4b8

    # https://www.kaggle.com/residentmario/automated-feature-selection-with-sklearn
    # pd.Series(model.feature_importances_, index=X_train.columns[0:]).plot.bar(color='steelblue', figsize=(12, 6))
    # plt.show()

    # from sklearn.feature_selection import mutual_info_classif
    # kepler_mutual_information = mutual_info_classif(X_train, y_train)
    # plt.subplots(1, figsize=(26, 1))
    # sns.heatmap(kepler_mutual_information[:, np.newaxis].T, cmap='Blues', cbar=False, linewidths=1, annot=True)
    # plt.yticks([], [])
    # plt.gca().set_xticklabels(X_train.columns[0:], rotation=45, ha='right', fontsize=12)
    # plt.suptitle("Kepler Variable Importance (mutual_info_classif)", fontsize=18, y=1.2)
    # plt.gcf().subplots_adjust(wspace=0.2)
    # plt.show()
    #
    # trans = GenericUnivariateSelect(score_func=mutual_info_classif, mode='percentile', param=50)
    # kepler_X_trans = trans.fit_transform(X_train, y_train)
    # kepler_X_test_trans = trans.transform(X_test)
    # print("We started with {0} features but retained only {1} of them!".format(X_train.shape[1] - 1,
    #                                                                            kepler_X_trans.shape[1]))

    # https://www.kaggle.com/yaldazare/feature-selection-and-data-visualization
    # We not only find the best features but also how many of them are needed
    # for the best accuracy. The "accuracy" scoring is proportional to the
    # number of correct classifications.
    clf_rf_4 = RandomForestClassifier()
    # cv = KFold(n_repeats=3, n_splits=10, random_state=42)
    rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,
                  scoring='f1_micro')  # 5-fold cross-validation
    rfecv = rfecv.fit(X_train, y_train)

    print('Optimal number of features :', rfecv.n_features_)
    print('Best features :', X_train.columns[rfecv.support_])

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score of number of selected features")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    # (newer scikit-learn versions replace grid_scores_ with cv_results_)
    plt.show()

    clr_rf_5 = model.fit(X_train, y_train)
    importances = clr_rf_5.feature_importances_
    std = np.std([tree.feature_importances_ for tree in model.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print("%d. feature %d (%f)" %
              (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure(1, figsize=(14, 13))
    plt.title("Feature importances")
    plt.bar(range(X_train.shape[1]),
            importances[indices],
            color="g",
            yerr=std[indices],
            align="center")
    plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()

    predictions = model.predict(X_train)
    print_evaluation_results(y_train, predictions)

    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(model, filename='../resources/models/ip_classifier.pkl')
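Note that the RFECV above identifies a feature subset, but the final model is still trained and evaluated on all columns. A sketch of actually applying the mask, assuming the same in-function variables:

# a sketch: restrict train and test to the RFECV-selected columns
X_train_sel = X_train.loc[:, rfecv.support_]
X_test_sel = X_test.loc[:, rfecv.support_]
model_sel = RandomForestClassifier(random_state=RANDOM_STATE)
model_sel.fit(X_train_sel, y_train)
print_evaluation_results(y_test, model_sel.predict(X_test_sel), train=False)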
Example no. 8
def upper_region_classifier():
    ur_train_set = pd.read_csv("../resources/datasets/ur_train_set.csv",
                               na_values='?',
                               dtype='category')
    ur_test_set = pd.read_csv("../resources/datasets/ur_test_set.csv",
                              na_values='?',
                              dtype='category')

    # Separate training features & training labels
    X = ur_train_set.drop([SECOND_LEVEL_TARGET], axis=1)
    y = ur_train_set[SECOND_LEVEL_TARGET]

    # Separate testing features & testing labels
    X_test = ur_test_set.drop([SECOND_LEVEL_TARGET], axis=1)
    y_test = ur_test_set[SECOND_LEVEL_TARGET]

    # Dividing training set into sub-training & validation set
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.1,
                                                      stratify=y)

    # get_baseline_performance(X_train, y_train, X_test, y_test)

    # spot_check_algorithms(X_train, y_train)

    model = RandomForestClassifier()
    model = model.fit(X_train, y_train)

    predictions = model.predict(X_train)
    print_evaluation_results(y_train, predictions)
    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    svm_model = SVC(kernel='poly', gamma=0.1, C=1.0)

    ############################################# Hyper-parameter Tuning #############################################
    # cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)
    params = {
        'n_estimators': [5, 10, 20, 30],
        'max_depth': [4, 6, 10, 12],
        'random_state': [13]
    }

    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=model,
                               param_grid=params,
                               scoring="accuracy",
                               cv=5,
                               n_jobs=-1,
                               verbose=1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    model = grid_search.best_estimator_

    params = {
        "C": (0.1, 0.5, 1, 2, 5, 10, 20),
        "gamma": (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1),
        "kernel": ('linear', 'poly', 'rbf')
    }
    svm_grid = GridSearchCV(svm_model,
                            params,
                            n_jobs=-1,
                            cv=5,
                            verbose=1,
                            scoring="accuracy")
    svm_grid.fit(X_train, y_train)
    print(svm_grid.best_params_)
    print(svm_grid.best_score_)

    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]

    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }

    rand_forest = RandomForestClassifier(random_state=42)
    rf_random = RandomizedSearchCV(estimator=rand_forest,
                                   param_distributions=random_grid,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   random_state=42,
                                   n_jobs=-1)

    rf_random.fit(X_train, y_train)
    print(rf_random.best_params_)
    print(rf_random.best_score_)

    model = tune_random_forest(model, X_train, y_train)
    # model = rf_random.best_estimator_

    ##################################################  Model Evaluation  ############################################
    predictions = model.predict(X_train)
    print_evaluation_results(y_train, predictions)
    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)
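The validation split (X_val, y_val) created at the top of this function is never consulted. A sketch of using it to compare the three tuned candidates before touching the test set, assuming the same in-function variables:

from sklearn.metrics import accuracy_score

# a sketch: rank the tuned candidates on the held-out validation set
for name, candidate in [("rf grid", grid_search.best_estimator_),
                        ("svm grid", svm_grid.best_estimator_),
                        ("rf random", rf_random.best_estimator_)]:
    print(name, accuracy_score(y_val, candidate.predict(X_val)))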
Example no. 9
def classify_by_region(data_frame):
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET], axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels


    # get_feature_correlations(data_frame, plot=True, return_resulst=False)
    # mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    # print("mutual_info: ", mutual_info)

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42, shuffle=True)


    ##########   Handle Class Imbalance  #########
    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    print("After Oversampling By Region\n", (pd.DataFrame(y_resampled)).groupby('region').size())

    ###############################################################################
    #                               4. Scale data                                 #
    ###############################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)




    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()
    #
    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    # NOTE: k='all' keeps every feature, so these transformed copies are
    # effectively identical to the inputs (and are unused below)
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)






    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes = [100]*5, random_state=42))


    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)




    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)





    # pipeline = Pipeline(
    #         [
    #             # ('selector', SelectKBest(f_classif)),
    #             ('model',  RandomForestClassifier(n_jobs = -1) )
    #         ]
    # )
    #
    # # Perform grid search on the classifier using f1 score as the scoring method
    # grid_obj = GridSearchCV(
    #         estimator= GradientBoostingClassifier(),
    #         param_grid={
    #             # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
    #             'n_estimators': [10, 20, 30],
    #             'max_depth': [6, 10, 20, 30],
    #             # 'max_depth': [1, 10, 20, 30],
    #             'min_samples_split': [1, 10, 100]
    #             # 'model__n_estimators': np.arange(10, 200, 10)
    #             # 'C': [1, 10, 100]
    #         },
    #
    #         n_jobs=-1,
    #         scoring="f1_micro",
    #         cv=5,
    #         verbose=3
    # )
    #
    # # Fit the grid search object to the training data and find the optimal parameters
    # grid_fit =  grid_obj.fit(X_resampled, y_resampled)

    # # Get the best estimator
    # best_clf = grid_fit.best_estimator_
    # print(best_clf)


    # Get the final model
    # Score notes: KNN(n_neighbors=7) - 0.52; LR(multiclass-ovr) - 0.66, 0.67, 0.67, 0.69, 0.69, 0.68;
    # MLP wid fs - 0.65, 0.69, 0.70; GB - 0.67, without fs 0.62, 0.61; DT - 0.58;
    # RF - 0.67; multi_LR wid fs - 0.64; voting - 0.60
    parent_model = SVC(kernel='rbf', C=10)

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)

    # Plot normalized confusion matrix
    # fig = plt.figure()
    # fig.set_size_inches(8, 8, forward=True)
    # # fig.align_labels()
    # plot_confusion_matrix(cnf_matrix, classes=["1", "2", "3", "4"], normalize=False, title='Normalized confusion matrix')


    # probs = parent_model.predict_proba(X_test)
    # print("Prediction probabilities for Region\n", probs)
    # plotConfusionMatrix(X_test, y_test, ['1', '2', '3', '4'])

    joblib.dump(parent_model, filename='../resources/models/parent_classifier.pkl')
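A caveat on the commented-out predict_proba call: scikit-learn's SVC only exposes predict_proba when constructed with probability=True. A sketch, assuming the same in-function variables:

# a sketch: enable probability estimates before calling predict_proba
parent_model = SVC(kernel='rbf', C=10, probability=True)
parent_model.fit(X_resampled, y_resampled)
probs = parent_model.predict_proba(X_test)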
Example no. 10
def ip_region_classifier():
    ip_dataset = pd.read_csv("../resources/datasets/ip_dataset.csv", dtype='category')

    # Separate training features & training labels
    X = ip_dataset.drop([SECOND_LEVEL_TARGET], axis=1)
    y = ip_dataset[SECOND_LEVEL_TARGET]

    # Spot check
    # spot_check_algorithms(X, y)

    # Create a cross-validation strategy
    cv = RepeatedStratifiedKFold(n_splits=3, random_state=42)
    imba_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                                  SelectKBest(k=17),
                                  RandomForestClassifier(n_estimators=500, max_depth=2, random_state=42))
    scoring = ['accuracy', 'f1_micro', 'precision_micro', 'recall_micro']
    cv_results = cross_validate(imba_pipeline, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    print(sorted(cv_results.keys()))
    print(cv_results['fit_time'].mean())
    print(cv_results['score_time'].mean())
    print(cv_results['test_accuracy'].mean())
    print(cv_results['test_f1_micro'].mean())
    print(cv_results['test_precision_micro'].mean())
    print(cv_results['test_recall_micro'].mean())





    # Separate training features & training labels
    X = ip_dataset.drop([SECOND_LEVEL_TARGET], axis=1)
    y = ip_dataset[SECOND_LEVEL_TARGET]

    # Separate testing features & testing labels
    # NOTE: the "test" set here is the same data the model is trained on,
    # so the test-set results below overstate generalisation
    X_test = ip_dataset.drop([SECOND_LEVEL_TARGET], axis=1)
    y_test = ip_dataset[SECOND_LEVEL_TARGET]

    get_baseline_performance(X, y, X_test, y_test)

    model = RandomForestClassifier(random_state=42)
    model = model.fit(X, y)

    # https://towardsdatascience.com/machine-learning-kaggle-competition-part-two-improving-e5b4d61ab4b8

    # https://www.kaggle.com/residentmario/automated-feature-selection-with-sklearn
    # pd.Series(model.feature_importances_, index=X_train.columns[0:]).plot.bar(color='steelblue', figsize=(12, 6))
    # plt.show()

    # from sklearn.feature_selection import mutual_info_classif
    # kepler_mutual_information = mutual_info_classif(X_train, y_train)
    # plt.subplots(1, figsize=(26, 1))
    # sns.heatmap(kepler_mutual_information[:, np.newaxis].T, cmap='Blues', cbar=False, linewidths=1, annot=True)
    # plt.yticks([], [])
    # plt.gca().set_xticklabels(X_train.columns[0:], rotation=45, ha='right', fontsize=12)
    # plt.suptitle("Kepler Variable Importance (mutual_info_classif)", fontsize=18, y=1.2)
    # plt.gcf().subplots_adjust(wspace=0.2)
    # plt.show()
    #
    # trans = GenericUnivariateSelect(score_func=mutual_info_classif, mode='percentile', param=50)
    # kepler_X_trans = trans.fit_transform(X_train, y_train)
    # kepler_X_test_trans = trans.transform(X_test)
    # print("We started with {0} features but retained only {1} of them!".format(X_train.shape[1] - 1,
    #                                                                            kepler_X_trans.shape[1]))


    # https://www.kaggle.com/yaldazare/feature-selection-and-data-visualization
    # We not only find the best features but also how many of them are needed
    # for the best accuracy. The "accuracy" scoring is proportional to the
    # number of correct classifications.
    clf_rf_4 = RandomForestClassifier()
    # cv = KFold(n_repeats=3, n_splits=10, random_state=42)
    rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5, scoring='f1_micro')  # 5-fold cross-validation
    rfecv = rfecv.fit(X, y)

    print('Optimal number of features :', rfecv.n_features_)
    print('Best features :', X.columns[rfecv.support_])

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score of number of selected features")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    # (newer scikit-learn versions replace grid_scores_ with cv_results_)
    plt.show()


    clr_rf_5 = model.fit(X, y)
    importances = clr_rf_5.feature_importances_
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

    # Plot the feature importances of the forest
    plt.figure(1, figsize=(14, 13))
    plt.title("Feature importances")
    plt.bar(range(X.shape[1]), importances[indices], color="g", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
    plt.xlim([-1, X.shape[1]])
    plt.show()



    predictions = model.predict(X)
    print_evaluation_results(y, predictions)

    predictions = model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    # joblib.dump(model, filename='../resources/models/ip_classifier.pkl')

    print("Intraperitoneal region Columns ", list(X.columns))
    final_model = model.fit(X, y)
    joblib.dump(final_model, filename='../resources/models/IPRModel.pkl')
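Because imba_pipeline mixes an over-sampler with scikit-learn estimators, the make_pipeline used above must be imblearn's rather than scikit-learn's (an assumption, since the imports are not shown):

# assumed import: imblearn's pipeline accepts samplers and applies them
# only during fit, never at predict/score time
from imblearn.pipeline import make_pipeline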
Example no. 11
def extra_peritoneum_region_classifier():
    data_frame = pd.read_csv(
        "../resources/datasets/extra_peritoneum_region.csv",
        na_values='?',
        dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # make_boolean(data_frame)
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 8, 14, 15, 16, 17, 18, 19, 20, 21

    # pca = decomposition.PCA(n_components=9)
    # pca.fit(features)
    # features = pca.transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)
    # pd.DataFrame(X_train).to_csv('resources/data/X_train2.csv', index=False)
    # pd.DataFrame(X_test).to_csv('resources/data/X_test2.csv', index=False)
    # pd.DataFrame(y_train).to_csv('resources/data/y_train2.csv', index=False)
    # pd.DataFrame(y_test).to_csv('resources/data/y_test2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # smote = RandomOverSampler()  # minority - hamming loss increases, accuracy, jaccard, avg f1, macro avg decreases
    # X_resampled2, y_resampled2 = smote.fit_sample(X_train2, y_train2)
    # # X_resampled2, y_resampled2 = SMOTE().fit_resample(X_resampled2, y_resampled2)
    # pd.DataFrame(X_resampled2).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled2).to_csv('resources/data/y_resampled2.csv', index=False)

    smote = RandomOverSampler()  # NOTE: plain random over-sampling, not SMOTE

    # for i in range(4):
    #     X_resampled2, y_resampled2 = smote.fit_resample(X_train2, y_train2)
    #     X_train2 = X_resampled2
    #     y_train2 = y_resampled2
    #     print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    #     print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    # pd.DataFrame(X_resampled).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled).to_csv('resources/data/y_resampled2.csv', index=False)

    # print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    # print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    print(sel_chi2.get_support())

    X_test_chi2 = sel_chi2.transform(X_test)
    print(X_test.shape)
    print(X_test_chi2.shape)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test2 = sc.transform(X_test2)

    # NOTE: the second base estimator is a scaled kNN pipeline, despite the
    # original 'svr' step name
    estimators = [('rf', RandomForestClassifier(random_state=42)),
                  ('knn',
                   make_pipeline(StandardScaler(), KNeighborsClassifier()))]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(multi_class='ovr'))

    # Spot Check Algorithms
    spot_check_algorithms(X_train_chi2, y_resampled)

    # Make predictions on validation dataset using the selected model
    # model2 = OneVsRestClassifier(GaussianNB()) # 0.43
    # model2 = DecisionTreeClassifier() # 0.78, 0.86, 0.72
    # model2 = RandomForestClassifier() # 0.80, 0.74, 0.69, 0.65, 0.66
    # model2 = GradientBoostingClassifier()  #
    # model2 =  VotingClassifier(estimators=[('rf', RandomForestClassifier()), ('mlp', MLPClassifier()), (('NB', GaussianNB()))], voting='hard') # 0.51

    # Score notes: -0.48; wid 8 features - 0.48, 0.52, 0.53, 0.54, 0.57;
    # wid * f - 0.51; clf - 0.48; kNN - 0.49
    extra_peritoneum_model = OneVsRestClassifier(RandomForestClassifier())

    #  Train the final model
    extra_peritoneum_model = extra_peritoneum_model.fit(
        X_train_chi2, y_resampled)

    # Evaluate the final model on the training set
    predictions = extra_peritoneum_model.predict(X_train_chi2)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = extra_peritoneum_model.predict(X_test_chi2)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(extra_peritoneum_model,
                filename='../resources/models/sub_classifier_4.pkl')
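The StackingClassifier built above (clf) is constructed but never trained. A sketch of fitting and evaluating it on the same features, assuming the in-function variables:

# a sketch: wire the unused stacking ensemble into the same evaluation
clf.fit(X_train_chi2, y_resampled)
predictions = clf.predict(X_test_chi2)
print_evaluation_results(y_test, predictions, train=False)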
Example no. 12
def upper_region_classifier():
    # Read in data
    data_frame = pd.read_csv("../resources/datasets/upper_region.csv",
                             na_values='?',
                             dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 2, 4, 10

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # with sampling_strategy='minority': hamming loss increases; accuracy,
    # jaccard, average f1 and macro avg decrease
    ros = RandomOverSampler()
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    print("X_resampled shape ", pd.DataFrame(X_resampled).shape)
    print("y_resampled shape ", pd.DataFrame(y_resampled).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # plt.show()

    # sel_chi2 = SelectKBest(chi2, k=9)  # DT chi 9- 0.83*2, 10- 91,92
    #                                     # RF chi 9- 1,1,91,1,91
    #                                     # GB chi 9- 91,91,82,91
    # X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    # X_test_chi2 = sel_chi2.transform(X_test)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test2 = sc.transform(X_test2)

    # get_baseline_performance(X_resampled, y_resampled, X_test, y_test)

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    # Make predictions on validation dataset using the selected model
    # upper_region_model = MLP # DT()- 0.92*4,   RF()-0.91,   RF(n_estimators=200) - 0.91, 1.0, # kNN - 0.91, knn(neigh-3) - 0.78, knn(neigh-5) - 0.91, DT - 0.79, 0.91, 0.92,  SVC(gamma='auto') - 0.91, LogisticRegression(solver='liblinear', multi_class='ovr') - 0.85,

    upper_region_model = RandomForestClassifier(n_jobs=-1,
                                                max_depth=20,
                                                n_estimators=200)

    # define Boruta feature selection method
    # feat_selector = BorutaPy(upper_region_model, n_estimators='auto', verbose=2, random_state=1)
    # # find all relevant features - 5 features should be selected
    # feat_selector.fit(X_resampled, y_resampled)
    # # check selected features - first 5 features are selected
    # print(feat_selector.support_)
    # # check ranking of features
    # print(feat_selector.ranking_)
    # # call transform() on X to filter it down to selected features
    # X_filtered = feat_selector.transform(X_resampled)

    # Train the final model
    upper_region_model = upper_region_model.fit(X_resampled, y_resampled)

    # Evaluate the final model on the training set
    predictions = upper_region_model.predict(X_resampled)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = upper_region_model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)
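As an alternative to RandomOverSampler, the imbalance could also be handled inside the forest itself. A sketch, assuming the same split:

# a sketch: class_weight='balanced' reweights classes instead of duplicating rows
balanced_rf = RandomForestClassifier(n_jobs=-1, max_depth=20,
                                     n_estimators=200, class_weight='balanced')
balanced_rf.fit(X_train, y_train)
print_evaluation_results(y_test, balanced_rf.predict(X_test), train=False)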
Example no. 13
def intra_peritoneum_region_classifier():
    data_frame = pd.read_csv(
        "../resources/datasets/intra_peritoneum_region.csv",
        na_values='?',
        dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 3, 5, 6, 7, 11, 12, 13

    # pca = decomposition.PCA(n_components=9)
    # pca.fit(features)
    # features = pca.transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)
    # pd.DataFrame(X_train).to_csv('resources/data/X_train2.csv', index=False)
    # pd.DataFrame(X_test).to_csv('resources/data/X_test2.csv', index=False)
    # pd.DataFrame(y_train).to_csv('resources/data/y_train2.csv', index=False)
    # pd.DataFrame(y_test).to_csv('resources/data/y_test2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # with sampling_strategy='minority': hamming loss increases; accuracy,
    # jaccard, average f1 and macro avg decrease
    smote = RandomOverSampler(sampling_strategy='minority')
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    # X_resampled2, y_resampled2 = SMOTE().fit_resample(X_resampled2, y_resampled2)
    # pd.DataFrame(X_resampled).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled).to_csv('resources/data/y_resampled2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_resampled).shape)
    print("X_train2 ", pd.DataFrame(y_resampled).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    print(sel_chi2.get_support())

    X_test_chi2 = sel_chi2.transform(X_test)
    print(X_test.shape)
    print(X_test_chi2.shape)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test2 = sc.transform(X_test2)

    # Spot Check Algorithms
    # spot_check_algorithms(X_train_chi2, y_resampled)

    # Make predictions on validation dataset using the selected model
    # intra_peritoneum_model =  KNeighborsClassifier(n_neighbors=5)  # kNN()- 0.39, kNN(neig-5) - 0.44, 0.39, LogisticRegression(solver='liblinear', multi_class='ovr'), wid 8, 9 features - 0.42, wid 12 featues - 0.40,  SVC(gamma='auto') - 0.35, OneVsRestClassifier(GaussianNB()) - 0.05

    # NOTE: an outlier detector such as IsolationForest cannot be used here:
    # its predict() returns only -1/1, not the multi-class labels evaluated
    # below, so a supervised model (the kNN noted above) is used
    intra_peritoneum_model = KNeighborsClassifier(n_neighbors=5)
    #  Train the final model
    intra_peritoneum_model = intra_peritoneum_model.fit(
        X_train_chi2, y_resampled)

    # Evaluate the final model on the training set
    predictions = intra_peritoneum_model.predict(X_train_chi2)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = intra_peritoneum_model.predict(X_test_chi2)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(intra_peritoneum_model,
                filename='../resources/models/sub_classifier_3.pkl')
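Since this model was trained on chi2-selected features, inference needs the same transform. A sketch of persisting the fitted selector alongside the model (hypothetical filename):

# a sketch: save the fitted SelectKBest so serving code can apply the same
# 8-feature transform before calling the classifier (hypothetical path)
joblib.dump(sel_chi2,
            filename='../resources/models/sub_classifier_3_selector.pkl')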