Ejemplo n.º 1
0
        # Cast score attribute as binary 'pass' (0) / 'fail' (1)
        y_train = train['score'].apply(lambda x: 0 if x >= 40 else 1)
        y_test = test['score'].apply(lambda x: 0 if x >= 40 else 1)

        columns = list(X_train)

        # Apply SMOTE to the training data
        X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)
        X_train = pd.DataFrame(data=X_train, columns=columns)
        y_train = pd.Series(y_train)

        # Handle duplicate rows by reducing to majority class
        if len(X_train) != len(X_train.drop_duplicates()):

            # Group training data by matching all attributes (duplicates)
            train_groups = X_train.groupby(by=list(X_train))

            # Drop duplicate rows in the training data
            X_train = X_train.drop_duplicates().copy(deep=True)

            mode_labels = {}

            # Loop through each unique (non-duplicate) row
            for unique_row in train_groups.groups.keys():

                # Retrieve all labels for a given row and its duplicates
                group_labels = y_train.loc[train_groups.groups[unique_row]]

                # Record the majority class for each unique row
                mode_labels[unique_row] = group_labels.value_counts().idxmax()