def main_function(data_frame):
    """Pre-process the dataset and run the top-level (region) classifier.

    The steps run in this order:

    1. Print dataset details and the number of records per
       ``SECOND_LEVEL_TARGET`` class.
    2. Impute missing values with the ``"most_frequent"`` strategy and print
       the first 20 rows plus the total count of remaining null cells.
    3. Drop duplicate records when ``check_duplicates`` reports any.
    4. One-hot encode the categorical columns listed below.
    5. Hand the pre-processed frame to ``classify_by_region``.

    Args:
        data_frame: The raw dataset. Presumably a pandas ``DataFrame`` (the
            code relies on ``groupby``, ``head``, ``isnull``,
            ``drop_duplicates`` and ``shape``) -- confirm against the caller.

    Returns:
        None. Results are only printed or passed on to
        ``classify_by_region``.
    """
    get_details(data_frame)

    class_sizes = data_frame.groupby(SECOND_LEVEL_TARGET).size()
    print("Class count\n", class_sizes)

    # Fill in missing values before any further processing.
    data_frame = impute_missing_values(data_frame, "most_frequent")
    print(data_frame.head(20))
    remaining_nulls = data_frame.isnull().sum().sum()
    print(remaining_nulls)

    # Correlation-matrix step, currently disabled:
    # get_feature_correlations(data_frame, plot=True, return_resulst=True)

    # Remove duplicate records, but only when the check reports some.
    # NOTE: the drop happens in place on the frame returned by the
    # imputation step.
    if check_duplicates(data_frame):
        data_frame.drop_duplicates(inplace=True)
        print(
            "Dropped duplicate records. Size after dropping duplicates: ",
            data_frame.shape,
        )

    # Categorical columns that get one-hot encoded.
    categorical_columns = [
        'sex',
        'histologic-type',
        'bone',
        'bone-marrow',
        'lung',
        'pleura',
        'peritoneum',
        'liver',
        'brain',
        'skin',
        'neck',
        'supraclavicular',
        'axillar',
        'mediastinum',
        'abdominal',
        'small-intestine',
    ]
    encoded_frame = perform_one_hot_encoding(data_frame, categorical_columns)

    # Top-level classifier: classify the pre-processed dataset by region.
    classify_by_region(encoded_frame)

    # Second-level steps, currently disabled:
    # create_separate_datasets(pre_processed_data)
    # upper_region_classifier()
    # thoracic_region_classifier()
    # ip_region_classifier()
    # ep_region_classifier()
plt.show() ######################################################### EDA ######################################################## print("\n\n!!!!!!!!!!!!!!!!!!!!!!! EDA !!!!!!!!!!!!!!!!!!!!!!!!\n") get_details(data_frame) # visualize_class_distribution(data_frame, "class") # visualise_feature_distribution(data_frame) is_duplicated = check_duplicates(data_frame) ################################################### Data Preprocessing ############################################### print( "\n\n!!!!!!!!!!!!!!!!!!!!!!! DATA PREPROCESSING !!!!!!!!!!!!!!!!!!!!!!!!\n" ) # Impute missing values data_frame = impute_missing_values(data_frame, "most_frequent") # Drop duplicate records if exist if is_duplicated: data_frame.drop_duplicates(inplace=True) print("Dropped duplicate records. Size after dropping duplicates: ", data_frame.shape) print("Combining original dataset with synthetic samples") data_frame = pd.concat([data_frame, data_frame2]) get_details(data_frame) # One Hot Encoding columns_to_encode = [ 'sex', 'histologic-type', 'bone', 'bone-marrow', 'lung', 'pleura', 'peritoneum', 'liver', 'brain', 'skin', 'neck', 'supraclavicular',