# Majority label for every distinct feature row.
# `train_groups.groups` maps each group key (one distinct row) to the index
# labels of that row and all of its duplicates; those labels select the
# matching entries of `y_train`.
# NOTE(review): presumably `train_groups` is a pandas GroupBy over all feature
# columns of the training frame -- confirm against the code that builds it.
mode_labels = {
    unique_row: y_train.loc[member_labels].value_counts().idxmax()
    for unique_row, member_labels in train_groups.groups.items()
}

# Reduce training labels to match training data indices
y_train = y_train.loc[X_train.index]

# Overwrite each remaining label with the majority class of its group. The
# row's values, packed into a tuple, are the lookup key, so a row that is not
# among the group keys raises KeyError here.
for index, row in X_train.iterrows():
    y_train[index] = mode_labels[tuple(row.tolist())]

# Choose the cluster count. With 8 or fewer rows, use floor(sqrt(num_rows))
# clusters instead of the library default, which could exceed the number of
# available samples.
# `n_jobs` is intentionally not passed: scikit-learn deprecated it for KMeans
# in 0.23 and removed it in 1.0, where it raises TypeError. The fitted result
# does not depend on it because `random_state` is fixed.
num_rows = len(X_train)
if num_rows <= 8:
    km = KMeans(n_clusters=int(num_rows ** 0.5), random_state=0)
else:
    km = KMeans(random_state=0)

# Fit the K-Means object to the training data