Esempio n. 1
0
def get_unique_matrix(X, y):
    """Collapse duplicate rows of ``X`` and aggregate the matching rows of ``y``.

    ``X`` is deduplicated with ``X.unique_rows(return_index=True)`` and ``y``
    is subset to the same row indexes. Every retained row is then relabelled,
    in both outputs, with a tuple of ``(column label, value)`` pairs built
    from its ``X`` values. When ``X`` contained duplicate rows, each affected
    row of the returned ``y`` is replaced by the column-wise median over the
    ``y`` rows of every ``X`` row sharing that label (the retained row
    included).

    Args:
        X: Object exposing ``data``, ``rowlabels``, ``columnlabels`` and
            ``unique_rows(return_index=True)``.
        y: Object exposing ``data``, ``rowlabels`` and ``columnlabels``.
            NOTE(review): its rows are indexed with the same row indexes as
            ``X``, so the two are presumably row-aligned -- confirm at the
            call site.

    Returns:
        A ``(X_unique, y_unique)`` tuple sharing the same ``rowlabels`` array.

    Raises:
        AssertionError: If deduplication changed the column labels, if two
            retained rows produce the same label, or if a duplicate row has
            no matching retained row.
    """
    X_unique, unique_indexes = X.unique_rows(return_index=True)
    assert np.array_equal(X_unique.columnlabels, X.columnlabels)
    y_unique = Matrix(y.data[unique_indexes], y.rowlabels[unique_indexes],
                      y.columnlabels)

    columnlabels = X_unique.columnlabels
    rowlabels = np.empty_like(X_unique.rowlabels, dtype=object)
    # Maps each row label to its position in X_unique; it doubles as the
    # uniqueness check and as an O(1) lookup when folding duplicates below.
    label_to_index = {}
    for i, row in enumerate(X_unique.data):
        exp_label = tuple(zip(columnlabels, row))
        assert exp_label not in label_to_index
        rowlabels[i] = exp_label
        label_to_index[exp_label] = i
    y_unique.rowlabels = rowlabels
    X_unique.rowlabels = rowlabels

    if X_unique.data.shape == X.data.shape:
        # Nothing was dropped, so there is nothing to aggregate.
        return X_unique, y_unique

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("\n\nDIFF(num_knobs={}): X_unique: {}, X: {}\n\n".format(
        X_unique.columnlabels.shape[0], X_unique.data.shape, X.data.shape))

    # Build the membership set once instead of scanning the index array for
    # every row of X.
    kept_indexes = set(np.asarray(unique_indexes).tolist())
    dup_map = {}
    for dup_idx in range(X.data.shape[0]):
        if dup_idx in kept_indexes:
            continue
        # Built exactly like the retained labels so the lookup matches on
        # equality regardless of the label string type.
        dup_label = tuple(zip(columnlabels, X.data[dup_idx]))
        assert dup_label in label_to_index
        primary_idx = label_to_index[dup_label]
        if primary_idx not in dup_map:
            # Seed with the retained row so it takes part in the median.
            dup_map[primary_idx] = [y_unique.data[primary_idx]]
        dup_map[primary_idx].append(y.data[dup_idx])

    for idx, yvals in dup_map.items():
        y_unique.data[idx] = np.median(np.vstack(yvals), axis=0)
    return X_unique, y_unique