Ejemplo n.º 1
0
            mode_labels = {}

            # Loop through each unique (non-duplicate) row
            for unique_row in train_groups.groups.keys():

                # Retrieve all labels for a given row and its duplicates
                group_labels = y_train.loc[train_groups.groups[unique_row]]

                # Record the majority class for each unique row
                mode_labels[unique_row] = group_labels.value_counts().idxmax()

            # Reduce training labels to match training data indices
            y_train = y_train.loc[X_train.index]

            # Set the appropriate training labels to the majority class
            for index, row in X_train.iterrows():
                y_train[index] = mode_labels[tuple(row.tolist())]

        km = None

        # Initialize K-Means object and set number of clusters for edge cases
        num_rows = len(X_train)

        if num_rows <= 8:
            km = KMeans(n_clusters=(int(num_rows**0.5)),
                        random_state=0,
                        n_jobs=-1)
        else:
            km = KMeans(random_state=0, n_jobs=-1)

        # Fit the K-Means object to the training data