def _score_to_label(score, pass_mark=40):
    """Map a raw score to the binary target: 0 ('pass') or 1 ('fail').

    A score greater than or equal to ``pass_mark`` is a pass; anything else,
    including a NaN score (which fails the ``>=`` comparison), is a fail.
    """
    return 0 if score >= pass_mark else 1


# Cast score attribute as binary 'pass' (0) / 'fail' (1)
y_train = train['score'].apply(_score_to_label)
y_test = test['score'].apply(_score_to_label)

# Remember the feature names so they can be restored after resampling
columns = list(X_train)

# Apply SMOTE to the training data (the test labels above are not resampled)
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)
# Re-wrap the resampled output as pandas objects with the original column
# names. NOTE(review): presumably fit_resample may return plain arrays,
# depending on the installed SMOTE implementation -- confirm.
X_train = pd.DataFrame(data=X_train, columns=columns)
y_train = pd.Series(y_train)

# Handle duplicate rows by reducing to majority class
if X_train.duplicated().any():
    # Group training data by matching all attributes (duplicates). This must
    # happen before X_train is de-duplicated: the groups hold the index
    # labels of every copy of a row, which are used to look up its labels.
    train_groups = X_train.groupby(by=list(X_train))

    # Drop duplicate rows in the training data
    X_train = X_train.drop_duplicates().copy(deep=True)

    mode_labels = {}
    # Loop through each unique (non-duplicate) row together with the index
    # labels of all of its occurrences
    for unique_row, member_index in train_groups.groups.items():
        # Retrieve all labels for a given row and its duplicates
        group_labels = y_train.loc[member_index]
        # Record the majority class for each unique row; on a tie, the label
        # that value_counts() lists first wins
        mode_labels[unique_row] = group_labels.value_counts().idxmax()