Example #1
def main_function(data_frame):
    get_details(data_frame)
    print("Class count\n", data_frame.groupby(SECOND_LEVEL_TARGET).size())

    # Impute missing values
    data_frame = impute_missing_values(data_frame, "most_frequent")
    print(data_frame.head(20))
    print(data_frame.isnull().sum().sum())

    # Get the correlation matrix
    # get_feature_correlations(data_frame, plot=True, return_resulst=True)

    # Check if duplicate records exist
    is_duplicated = check_duplicates(data_frame)
    # Drop duplicate records if exist
    if is_duplicated:
        data_frame.drop_duplicates(inplace=True)
        print("Dropped duplicate records. Size after dropping duplicates: ",
              data_frame.shape)

    # One Hot Encoding
    columns_to_encode = [
        'sex', 'histologic-type', 'bone', 'bone-marrow', 'lung', 'pleura',
        'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular',
        'axillar', 'mediastinum', 'abdominal', 'small-intestine'
    ]
    data_frame = perform_one_hot_encoding(data_frame, columns_to_encode)

    # Pre-processed dataset
    pre_processed_data = data_frame

    # Top Level Classifier - classify by region
    classify_by_region(pre_processed_data)

    # Create balanced datasets for the second level
    # create_separate_datasets(pre_processed_data)
    # upper_region_classifier()
    # thoracic_region_classifier()
    ip_region_classifier()
    ep_region_classifier()
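The helpers `impute_missing_values` and `perform_one_hot_encoding` called in main_function are project-specific and not shown in these snippets. A minimal sketch of what they might look like, assuming a pandas DataFrame and scikit-learn's SimpleImputer (both the signatures and the bodies are assumptions):

import pandas as pd
from sklearn.impute import SimpleImputer

def impute_missing_values(data_frame, strategy="most_frequent"):
    # Fill missing values column-wise with the given strategy (hypothetical helper)
    imputer = SimpleImputer(strategy=strategy)
    imputed = imputer.fit_transform(data_frame)
    return pd.DataFrame(imputed, columns=data_frame.columns)

def perform_one_hot_encoding(data_frame, columns_to_encode):
    # Expand the listed categorical columns into 0/1 indicator columns (hypothetical helper)
    return pd.get_dummies(data_frame, columns=columns_to_encode)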
Example #2
print(data_frame2.shape)

# Check for equal distributions
plt.figure(figsize=(8, 8))
# Density plot comparing the original and the synthetic 'brain' feature distributions
sns.kdeplot(data_frame['brain'], label='Original')
sns.kdeplot(data_frame2['brain'], label='Synthetic')
# Label the plot
plt.xlabel('Primary Tumor Sites')
plt.ylabel('Density')
plt.title('Original vs. Synthetic Distribution')
plt.legend()
plt.show()

#########################################################  EDA  ########################################################
print("\n\n!!!!!!!!!!!!!!!!!!!!!!!  EDA  !!!!!!!!!!!!!!!!!!!!!!!!\n")
get_details(data_frame)
# visualize_class_distribution(data_frame, "class")
# visualise_feature_distribution(data_frame)
is_duplicated = check_duplicates(data_frame)

###################################################  Data Preprocessing  ###############################################
print(
    "\n\n!!!!!!!!!!!!!!!!!!!!!!!  DATA PREPROCESSING  !!!!!!!!!!!!!!!!!!!!!!!!\n"
)

# Impute missing values
data_frame = impute_missing_values(data_frame, "most_frequent")

# Drop duplicate records if exist
if is_duplicated:
    data_frame.drop_duplicates(inplace=True)
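`get_details` and `check_duplicates` are likewise project-specific helpers; plausible minimal versions, assuming plain pandas (names and behaviour are assumptions), could look like:

def get_details(data_frame):
    # Print basic shape, dtype and missing-value information (hypothetical helper)
    print(data_frame.shape)
    print(data_frame.dtypes)
    print(data_frame.isnull().sum())

def check_duplicates(data_frame):
    # Return True if any fully duplicated rows exist (hypothetical helper)
    duplicate_count = data_frame.duplicated().sum()
    print("Duplicate records: ", duplicate_count)
    return duplicate_count > 0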
Example #3
def classify_by_region(data_frame):
    get_details(data_frame)
    print("Before Oversampling By Region\n", data_frame.groupby('region').size())
    # sns.countplot(data_frame['region'], label="Count")
    # plt.show()

    # sns.heatmap(data_frame.drop('region', axis=1), cmap='cool', annot=True)
    # plt.show()

    # get_feature_correlations(data_frame, plot=True, return_resulst=False)


    X = data_frame.drop(['region', 'class'], axis=1)  # Features - drop 'region' and 'class' from the feature set
    y = data_frame['region']  # Labels


    mutual_info = mutual_info_classif(X, y, discrete_features='auto')
    print("mutual_info: ", mutual_info)


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
    # X_validation, X_test, y_validation, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True)


    sm = BorderlineSMOTE()
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    print("After Oversampling By Region\n", (pd.DataFrame(y_resampled)).groupby('region').size())
    # X_resampled.to_csv('resources/data/X_resampled.csv', index=False)
    # y_resampled.to_csv('resources/data/y_resampled.csv', header=['region'], index=False)



    ###############################################################################
    #                               4. Scale data                                 #
    ###############################################################################
    # sc = StandardScaler()
    # X_resampled = sc.fit_transform(X_resampled)
    # X_test = sc.transform(X_test)



    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # # plt.show()
    #
    sel_chi2 = SelectKBest(chi2, k='all')  # chi 10 - 0.64, 0.63, 0.60
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    X_test_chi2 = sel_chi2.transform(X_test)



    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)



    # models = [SVC(kernel='poly'), RandomForestClassifier(),  GradientBoostingClassifier()]
    # for i in range(len(models)):
    #     # Get the final model
    #     parent_model = models[i] # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60
    #
    #     # Train the final model
    #     parent_model.fit(X_resampled, y_resampled)
    #
    #     # Evaluate the final model on the training set
    #     predictions = parent_model.predict(X_resampled)
    #     print_evaluation_results(y_resampled, predictions)
    #
    #     # Evaluate the final model on the test set
    #     predictions = parent_model.predict(X_test)
    #     print_evaluation_results(y_test, predictions, train=False)



    # mlp = OneVsRestClassifier(MLPClassifier(hidden_layer_sizes = [100]*5, random_state=42))

    pipeline = Pipeline(
            [
                # ('selector', SelectKBest(f_classif)),
                ('model',  RandomForestClassifier(n_jobs = -1) )
            ]
    )

    # Perform grid search on the classifier using f1 score as the scoring method
    grid_obj = GridSearchCV(
            estimator= GradientBoostingClassifier(),
            param_grid={
                # 'selector__k': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                'n_estimators': [10, 20, 30],
                'max_depth': [6, 10, 20, 30],
                # 'max_depth': [1, 10, 20, 30],
                'min_samples_split': [2, 10, 100]  # scikit-learn requires values >= 2
                # 'model__n_estimators': np.arange(10, 200, 10)
                # 'C': [1, 10, 100]
            },

            n_jobs=-1,
            scoring="f1_micro",
            cv=5,
            verbose=3
    )

    # Fit the grid search object to the training data and find the optimal parameters
    grid_fit =  grid_obj.fit(X_resampled, y_resampled)

    # Get the best estimator
    best_clf = grid_fit.best_estimator_
    print(best_clf)




    # Get the final model
    parent_model = best_clf # LR(multiclass-ovr) -0.66, 0.67, 0.67, 0.69, 0.69, 0.68  MLP wid fs - 0.65, 0.69, 0.70,   GB - 0.67, without fs 0.62, 0.61,    DT - 0.58,   RF - 0.67,  multi_LR - wid fs 0.64 , voting - 0.60

    t0 = time()
    # Train the final model
    parent_model.fit(X_resampled, y_resampled)
    print("training time:", round(time() - t0, 3), "s")

    # Evaluate the final model on the training set
    train_predictions = parent_model.predict(X_resampled)
    print_evaluation_results(y_resampled, train_predictions)

    t0 = time()
    # Evaluate the final model on the test set
    test_predictions = parent_model.predict(X_test)
    print("predicting time:", round(time() - t0, 3), "s")

    print_evaluation_results(y_test, test_predictions, train=False)
    confusion_matrix(parent_model, X_resampled, y_resampled, X_test, y_test)
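`print_evaluation_results` and the `confusion_matrix` call above refer to custom reporting helpers. A rough sketch of the former, assuming scikit-learn's metrics module (the exact output format is an assumption), might be:

from sklearn.metrics import accuracy_score, classification_report

def print_evaluation_results(y_true, y_pred, train=True):
    # Report accuracy and per-class precision/recall/F1 for the given split (hypothetical helper)
    split = "Training" if train else "Test"
    print(split, "accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))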
Example #4
def thoracic_region_classifier():
    data_frame = pd.read_csv("../resources/datasets/thoracic_region.csv",
                             na_values='?',
                             dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    # make_boolean(data_frame)
    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 1, 22

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)
    # pd.DataFrame(X_train).to_csv('resources/data/X_train2.csv', index=False)
    # pd.DataFrame(X_test).to_csv('resources/data/X_test2.csv', index=False)
    # pd.DataFrame(y_train).to_csv('resources/data/y_train2.csv', index=False)
    # pd.DataFrame(y_test).to_csv('resources/data/y_test2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # smote = BorderlineSMOTE()
    smote = RandomOverSampler(
    )  # minority - hamming loss increases, accuracy, jaccard, avg f1, macro avg decreases
    X_resampled2, y_resampled2 = smote.fit_resample(X_train, y_train)
    # pd.DataFrame(X_resampled2).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled2).to_csv('resources/data/y_resampled2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    df = pd.DataFrame(y_resampled2)
    print(df.groupby('class').size())

    # sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    # X_train_chi2 = sel_chi2.fit_transform(X_resampled2, y_resampled2)
    # print(sel_chi2.get_support())
    #
    # X_test_chi2 = sel_chi2.transform(X_test)
    # print(X_test.shape)
    # print(X_test_chi2.shape)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test = sc.transform(X_test)

    # Spot Check Algorithms
    # spot_check_algorithms(X_train_chi2, y_resampled2)

    # Make predictions on validation dataset using the selected model
    thoracic_model = DecisionTreeClassifier(
    )  # MLP- 0.88, ExtraTreeClassifier-0.73,0.97,0.94,0.91,0.94, 0.94, 0.86   RF- 0.88, 0.88  GB- 0.89, 0.89  LR()- 0.88, 0.88  LogisticRegression(solver='liblinear', multi_class='ovr') - 0.92, kNN- 0.87, 0.92, 0.84   DT- 0.94, 0.94, 0.89, 0.94  SVC(gamma='auto') - 0.94, MultinomialNB() - 0.88
    # models2 =  VotingClassifier(
    #     estimators=[('rf', random_forest), ('knn', KNeighborsClassifier(n_neighbors=5)), ('NB', GaussianNB())],
    #     voting='hard') # 0.74

    # Train the final model
    thoracic_model = thoracic_model.fit(X_resampled2, y_resampled2)

    # Evaluate the final model on the training set
    predictions = thoracic_model.predict(X_resampled2)
    print_evaluation_results(y_resampled2, predictions)

    # Evaluate the final model on the test set
    predictions = thoracic_model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(thoracic_model,
                filename='../resources/models/sub_classifier_2.pkl')
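The persisted sub-classifier can later be reloaded for inference with joblib; a short usage sketch (`new_samples` is a placeholder for data encoded with the same feature columns as the training set):

import joblib

# Reload the persisted thoracic sub-classifier and predict on identically encoded samples
thoracic_model = joblib.load('../resources/models/sub_classifier_2.pkl')
predictions = thoracic_model.predict(new_samples)  # new_samples: DataFrame with the training feature columns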
Example #5
def extra_peritoneum_region_classifier():
    data_frame = pd.read_csv(
        "../resources/datasets/extra_peritoneum_region.csv",
        na_values='?',
        dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # make_boolean(data_frame)
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 8, 14, 15, 16, 17, 18, 19, 20, 21

    # pca = decomposition.PCA(n_components=9)
    # pca.fit(features)
    # features = pca.transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)
    # pd.DataFrame(X_train).to_csv('resources/data/X_train2.csv', index=False)
    # pd.DataFrame(X_test).to_csv('resources/data/X_test2.csv', index=False)
    # pd.DataFrame(y_train).to_csv('resources/data/y_train2.csv', index=False)
    # pd.DataFrame(y_test).to_csv('resources/data/y_test2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    # smote = RandomOverSampler()  # minority - hamming loss increases, accuracy, jaccard, avg f1, macro avg decreases
    # X_resampled2, y_resampled2 = smote.fit_sample(X_train2, y_train2)
    # # X_resampled2, y_resampled2 = SMOTE().fit_resample(X_resampled2, y_resampled2)
    # pd.DataFrame(X_resampled2).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled2).to_csv('resources/data/y_resampled2.csv', index=False)

    X_resampled2 = None
    y_resampled2 = None
    smote = RandomOverSampler()

    # for i in range(4):
    #     X_resampled2, y_resampled2 = smote.fit_resample(X_train2, y_train2)
    #     X_train2 = X_resampled2
    #     y_train2 = y_resampled2
    #     print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    #     print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    # pd.DataFrame(X_resampled).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled).to_csv('resources/data/y_resampled2.csv', index=False)

    # print("X_train2 ", pd.DataFrame(X_resampled2).shape)
    # print("X_train2 ", pd.DataFrame(y_resampled2).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    print(sel_chi2.get_support())

    X_test_chi2 = sel_chi2.transform(X_test)
    print(X_test.shape)
    print(X_test_chi2.shape)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test2 = sc.transform(X_test2)

    estimators = [('rf', RandomForestClassifier(random_state=42)),
                  ('knn',
                   make_pipeline(StandardScaler(), KNeighborsClassifier()))]
    # clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(multi_class='ovr'))
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(multi_class='ovr'))

    # Spot Check Algorithms
    spot_check_algorithms(X_train_chi2, y_resampled)

    # Make predictions on validation dataset using the selected model
    # model2 = OneVsRestClassifier(GaussianNB()) # 0.43
    # model2 = DecisionTreeClassifier() # 0.78, 0.86, 0.72
    # model2 = RandomForestClassifier() # 0.80, 0.74, 0.69, 0.65, 0.66
    # model2 = GradientBoostingClassifier()  #
    # model2 =  VotingClassifier(estimators=[('rf', RandomForestClassifier()), ('mlp', MLPClassifier()), (('NB', GaussianNB()))], voting='hard') # 0.51

    extra_peritoneum_model = OneVsRestClassifier(
        RandomForestClassifier()
    )  # -0.48, wid 8 features - 0.48, 0.52, 0.53, 0.54, 0.57, wid * f - 0.51,  clf - 0.48, kNN - 0.49

    #  Train the final model
    extra_peritoneum_model = extra_peritoneum_model.fit(
        X_train_chi2, y_resampled)

    # Evaluate the final model on the training set
    predictions = extra_peritoneum_model.predict(X_train_chi2)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = extra_peritoneum_model.predict(X_test_chi2)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(extra_peritoneum_model,
                filename='../resources/models/sub_classifier_4.pkl')
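`spot_check_algorithms` (called above on the chi2-selected features) is another project helper; a typical implementation compares several candidate models with cross-validation, roughly along these lines (the model list and scoring metric are assumptions):

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

def spot_check_algorithms(X, y):
    # Compare candidate classifiers with 5-fold cross-validated micro F1 (hypothetical helper)
    models = [
        ('DT', DecisionTreeClassifier()),
        ('RF', RandomForestClassifier()),
        ('GB', GradientBoostingClassifier()),
        ('kNN', KNeighborsClassifier()),
    ]
    for name, model in models:
        scores = cross_val_score(model, X, y, cv=5, scoring='f1_micro')
        print("%s: %.3f (+/- %.3f)" % (name, scores.mean(), scores.std()))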
Example #6
def upper_region_classifier():
    # Read in data
    data_frame = pd.read_csv("../resources/datasets/upper_region.csv",
                             na_values='?',
                             dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 2, 4, 10

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    ros = RandomOverSampler(
    )  # minority - hamming loss increases, accuracy, jaccard, avg f1, macro avg decreases
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    print("X_train2 ", pd.DataFrame(X_resampled).shape)
    print("X_train2 ", pd.DataFrame(y_resampled).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    # https://datascienceplus.com/selecting-categorical-features-in-customer-attrition-prediction-using-python/
    # categorical feature selection
    # sf = SelectKBest(chi2, k='all')
    # sf_fit = sf.fit(X_train, y_train)
    # # print feature scores
    # for i in range(len(sf_fit.scores_)):
    #     print(' %s: %f' % (X_train.columns[i], sf_fit.scores_[i]))
    #
    # # plot the scores
    # datset = pd.DataFrame()
    # datset['feature'] = X_train.columns[range(len(sf_fit.scores_))]
    # datset['scores'] = sf_fit.scores_
    # datset = datset.sort_values(by='scores', ascending=True)
    # sns.barplot(datset['scores'], datset['feature'], color='blue')
    # sns.set_style('whitegrid')
    # plt.ylabel('Categorical Feature', fontsize=18)
    # plt.xlabel('Score', fontsize=18)
    # plt.show()

    # sel_chi2 = SelectKBest(chi2, k=9)  # DT chi 9- 0.83*2, 10- 91,92
    #                                     # RF chi 9- 1,1,91,1,91
    #                                     # GB chi 9- 91,91,82,91
    # X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    # X_test_chi2 = sel_chi2.transform(X_test)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test2 = sc.transform(X_test2)

    # get_baseline_performance(X_resampled, y_resampled, X_test, y_test)

    # Spot Check Algorithms
    # spot_check_algorithms(X_resampled, y_resampled)

    # Make predictions on validation dataset using the selected model
    # upper_region_model = MLP # DT()- 0.92*4,   RF()-0.91,   RF(n_estimators=200) - 0.91, 1.0, # kNN - 0.91, knn(neigh-3) - 0.78, knn(neigh-5) - 0.91, DT - 0.79, 0.91, 0.92,  SVC(gamma='auto') - 0.91, LogisticRegression(solver='liblinear', multi_class='ovr') - 0.85,

    upper_region_model = RandomForestClassifier(n_jobs=-1,
                                                max_depth=20,
                                                n_estimators=200)

    # define Boruta feature selection method
    # feat_selector = BorutaPy(upper_region_model, n_estimators='auto', verbose=2, random_state=1)
    # # find all relevant features - 5 features should be selected
    # feat_selector.fit(X_resampled, y_resampled)
    # # check selected features - first 5 features are selected
    # print(feat_selector.support_)
    # # check ranking of features
    # print(feat_selector.ranking_)
    # # call transform() on X to filter it down to selected features
    # X_filtered = feat_selector.transform(X_resampled)

    # Train the final model
    upper_region_model = upper_region_model.fit(X_resampled, y_resampled)

    # Evaluate the final model on the training set
    predictions = upper_region_model.predict(X_resampled)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = upper_region_model.predict(X_test)
    print_evaluation_results(y_test, predictions, train=False)
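`get_baseline_performance` (commented out above) presumably establishes a naive reference score before the real models are trained; one plausible sketch, assuming scikit-learn's DummyClassifier (the strategy and metric are assumptions):

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def get_baseline_performance(X_train, y_train, X_test, y_test):
    # Score a majority-class baseline so the tuned models have a reference point (hypothetical helper)
    baseline = DummyClassifier(strategy='most_frequent')
    baseline.fit(X_train, y_train)
    print("Baseline accuracy:", accuracy_score(y_test, baseline.predict(X_test)))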
Example #7
def intra_peritoneum_region_classifier():
    data_frame = pd.read_csv(
        "../resources/datasets/intra_peritoneum_region.csv",
        na_values='?',
        dtype='category')
    data_frame.drop('region', axis=1, inplace=True)

    get_details(data_frame)

    print("Before Oversampling By Class\n", data_frame.groupby('class').size())
    # sns.countplot(data_frame['class'], label="Count")
    # plt.show()

    features = data_frame.drop(['class'], axis=1)
    labels = data_frame['class']  # Labels - 3, 5, 6, 7, 11, 12, 13

    # pca = decomposition.PCA(n_components=9)
    # pca.fit(features)
    # features = pca.transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42,
                                                        shuffle=True)
    # pd.DataFrame(X_train).to_csv('resources/data/X_train2.csv', index=False)
    # pd.DataFrame(X_test).to_csv('resources/data/X_test2.csv', index=False)
    # pd.DataFrame(y_train).to_csv('resources/data/y_train2.csv', index=False)
    # pd.DataFrame(y_test).to_csv('resources/data/y_test2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_train).shape)
    print("X_train2 ", pd.DataFrame(y_train).shape)

    smote = RandomOverSampler(
        sampling_strategy='minority'
    )  # minority - hamming loss increases, accuracy, jaccard, avg f1, macro avg decreases
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    # X_resampled2, y_resampled2 = SMOTE().fit_resample(X_resampled2, y_resampled2)
    # pd.DataFrame(X_resampled).to_csv('resources/data/X_resampled2.csv', index=False)
    # pd.DataFrame(y_resampled).to_csv('resources/data/y_resampled2.csv', index=False)

    print("X_train2 ", pd.DataFrame(X_resampled).shape)
    print("X_train2 ", pd.DataFrame(y_resampled).shape)

    df = pd.DataFrame(y_resampled)
    print(df.groupby('class').size())

    sel_chi2 = SelectKBest(chi2, k=8)  # select 8 features
    X_train_chi2 = sel_chi2.fit_transform(X_resampled, y_resampled)
    print(sel_chi2.get_support())

    X_test_chi2 = sel_chi2.transform(X_test)
    print(X_test.shape)
    print(X_test_chi2.shape)

    # # ###############################################################################
    # # #                               4. Scale data                                 #
    # # ###############################################################################
    # sc = StandardScaler()
    # X_resampled2 = sc.fit_transform(X_resampled2)
    # X_test2 = sc.transform(X_test2)

    # Spot Check Algorithms
    # spot_check_algorithms(X_train_chi2, y_resampled)

    # Make predictions on validation dataset using the selected model
    # intra_peritoneum_model =  KNeighborsClassifier(n_neighbors=5)  # kNN()- 0.39, kNN(neig-5) - 0.44, 0.39, LogisticRegression(solver='liblinear', multi_class='ovr'), wid 8, 9 features - 0.42, wid 12 featues - 0.40,  SVC(gamma='auto') - 0.35, OneVsRestClassifier(GaussianNB()) - 0.05

    # Note: IsolationForest is an unsupervised anomaly detector whose predict() returns -1/1,
    # so the multi-class evaluation below gives only limited insight.
    intra_peritoneum_model = IsolationForest(n_estimators=100)
    #  Train the final model
    intra_peritoneum_model = intra_peritoneum_model.fit(
        X_train_chi2, y_resampled)

    # Evaluate the final model on the training set
    predictions = intra_peritoneum_model.predict(X_train_chi2)
    print_evaluation_results(y_resampled, predictions)

    # Evaluate the final model on the test set
    predictions = intra_peritoneum_model.predict(X_test_chi2)
    print_evaluation_results(y_test, predictions, train=False)

    joblib.dump(intra_peritoneum_model,
                filename='../resources/models/sub_classifier_3.pkl')
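Taken together, the snippets implement a two-level scheme: the region classifier from Example #3 routes a record to one of the region-specific sub-classifiers. A hypothetical inference dispatch (the region label strings, the mapping to model files, and the shape of `sample` are assumptions based on the snippets above) might look like:

import joblib

# Hypothetical mapping from predicted region label to the persisted sub-classifier
sub_classifiers = {
    'thoracic': joblib.load('../resources/models/sub_classifier_2.pkl'),
    'intra_peritoneum': joblib.load('../resources/models/sub_classifier_3.pkl'),
    'extra_peritoneum': joblib.load('../resources/models/sub_classifier_4.pkl'),
}

def predict_primary_site(region_model, sample):
    # First predict the region, then delegate to the matching second-level classifier
    # sample: single-row DataFrame with the same encoded feature columns used in training
    region = region_model.predict(sample)[0]
    return sub_classifiers[region].predict(sample)[0]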