def get_unique_matrix(X, y):
    """Collapse duplicate rows of ``X`` and aggregate the matching rows of ``y``.

    ``X.unique_rows(return_index=True)`` supplies the de-duplicated matrix and
    the indexes of the rows that were kept. The rows of ``y`` at those indexes
    form the initial ``y_unique``. For every row of ``X`` that was dropped as a
    duplicate, the corresponding row of ``y`` is grouped with the kept row
    that has identical values, and the kept row of ``y_unique`` is replaced by
    the column-wise median of the group.

    Both returned matrices get new row labels: each label is a tuple of
    ``(column_label, value)`` pairs describing that row of ``X_unique``.

    Args:
        X: object exposing ``unique_rows(return_index=True)``, ``data``,
            ``rowlabels`` and ``columnlabels`` (presumably a ``Matrix`` --
            confirm against callers).
        y: object exposing ``data``, ``rowlabels`` and ``columnlabels``,
            indexed row-for-row like ``X``.

    Returns:
        Tuple ``(X_unique, y_unique)``.

    Raises:
        AssertionError: if the column labels change during de-duplication,
            if two kept rows produce the same label, or if a dropped row has
            no matching kept row.

    Side effects:
        Prints a ``DIFF(...)`` diagnostic line to stdout when at least one
        duplicate row was removed.
    """
    X_unique, unique_indexes = X.unique_rows(return_index=True)
    assert np.array_equal(X_unique.columnlabels, X.columnlabels)
    y_unique = Matrix(y.data[unique_indexes],
                      y.rowlabels[unique_indexes],
                      y.columnlabels)

    columnlabels = X_unique.columnlabels
    rowlabels = np.empty_like(X_unique.rowlabels, dtype=object)
    # Maps each row label to its position in X_unique. It doubles as the
    # uniqueness check and as an O(1) lookup when resolving duplicates below.
    label_to_index = {}
    for i, row in enumerate(X_unique.data):
        exp_label = tuple(zip(columnlabels, row))
        assert exp_label not in label_to_index
        rowlabels[i] = exp_label
        label_to_index[exp_label] = i
    y_unique.rowlabels = rowlabels
    X_unique.rowlabels = rowlabels

    if X_unique.data.shape != X.data.shape:
        # Single parenthesised argument: prints identically on Python 2 and 3.
        print("\n\nDIFF(num_knobs={}): X_unique: {}, X: {}\n\n".format(
            X_unique.columnlabels.shape[0],
            X_unique.data.shape, X.data.shape))

        # Plain-int set so membership tests are O(1) instead of an array scan.
        kept_indexes = set(np.asarray(unique_indexes).ravel().tolist())
        dup_map = {}
        for dup_idx in range(X.data.shape[0]):
            if dup_idx in kept_indexes:
                continue
            # Built exactly like the primary labels above so that hashing and
            # equality are guaranteed to agree.
            dup_label = tuple(zip(columnlabels, X.data[dup_idx]))
            primary_idx = label_to_index.get(dup_label)
            assert primary_idx is not None
            if primary_idx not in dup_map:
                # Seed the group with the kept row's own target values.
                dup_map[primary_idx] = [y_unique.data[primary_idx]]
            dup_map[primary_idx].append(y.data[dup_idx])

        for idx, yvals in dup_map.items():
            y_unique.data[idx] = np.median(np.vstack(yvals), axis=0)
    return X_unique, y_unique