Example #1
0
def get_state_probs(v, allstates=None, weights=None, normalized=True):
    """
    Get probability of unique states. There is an option to weight each sample
    when counting.

    Params:
    -------
    v (ndarray)
        nsamples x ndim array with one observed state per row. A 1d vector is
        reshaped to nsamples x 1.
    allstates (ndarray=None)
        States to count, one per row; the result follows this order. If None,
        the unique rows of v (as selected by unique_rows) are counted and
        returned together with the frequencies.
    weights (vector=None)
        One weight per row of v. If None, every sample counts as 1.
    normalized (bool=True)
        Return probability distribution instead of frequency count

    Returns:
    --------
    freq (ndarray) : vector of the probabilities (or counts) of each state
    allstates (ndarray) : only returned when allstates was passed as None
    """
    import warnings

    if v.ndim == 1:
        v = v[:, None]
    n = v.shape[1]
    return_all_states = allstates is None

    if return_all_states:
        # NOTE(review): relies on unique_rows yielding row indices and inverse
        # labels in the same order, so freq[k] lines up with allstates[k].
        allstates = v[unique_rows(v)]
        uniqIx = unique_rows(v, return_inverse=True)
        # Forward the weights so they are honoured on this path too;
        # weights=None keeps the plain integer count.
        freq = np.bincount(uniqIx, weights=weights)
    else:
        if weights is None:
            weights = np.ones((v.shape[0]))

        freq = np.zeros(allstates.shape[0])
        for j, vote in enumerate(allstates):
            # A sample matches only if all n entries agree with the state.
            ix = (vote == v).sum(1) == n
            freq[j] = (ix * weights).sum()
        # Any sample whose state is missing from allstates goes uncounted.
        if not np.isclose(np.sum(freq), np.sum(weights)):
            warnings.warn("States not found in given list of all states.")

    if normalized:
        freq = freq.astype(float) / np.sum(freq)

    if return_all_states:
        return freq, allstates
    return freq
Example #2
0
def fill_in_vote(votes):
    """
        Complete a data set that has missing (NaN) votes by spreading each
        incomplete vote over its possible completions.

        Votes are processed in order of how many entries they are missing
        (one, then two, ...). For every incomplete vote, the rows in which all
        of its missing voters did vote are collected, the weighted distribution
        of those sub-votes is measured with get_state_probs, and one completed
        copy of the vote is appended per observed sub-vote with that
        probability as its fractional count. The incomplete votes of the
        current size are then dropped. Rows missing every entry are never
        processed and are left in the output as they are.

        Args:
            votes : array of votes, a vote on each row, NaN marking a missing
                entry
        Value:
            filledVotes : array of votes with the processed incomplete rows
                replaced by their completions
            counts : vector with the (fractional) count of each row of
                filledVotes
        Raises:
            Exception if no row has all of the missing voters of some vote
            present.
    """
    from misc_fcns import unique_rows

    filledVotes = votes.copy()
    counts = np.ones((votes.shape[0]))

    for n_missing in range(1, votes.shape[1]):
        # Snapshot the rows with exactly n_missing gaps before anything is
        # appended to filledVotes inside the loop.
        incomplete = np.isnan(filledVotes).sum(1) == n_missing
        for record in filledVotes[incomplete, :]:
            missing = np.argwhere(np.isnan(record)).flatten()

            # Rows where every one of the missing voters has a recorded vote.
            observed = ~np.isnan(filledVotes[:, missing]).any(1)
            subVotes = filledVotes[:, missing][observed, :]
            if subVotes.shape[0] == 0:
                raise Exception(
                    "There is insufficient data to measure subset.")
            uSubVotes = subVotes[unique_rows(subVotes), :]

            p = get_state_probs(subVotes,
                                uSubVotes,
                                weights=counts[observed])

            # One completed copy of the record per observed sub-vote, each
            # carrying that sub-vote's probability as its count.
            completions = np.tile(record, (p.size, 1))
            completions[:, missing] = uSubVotes
            filledVotes = np.vstack((filledVotes, completions))
            counts = np.hstack((counts, p))

        # Drop the incomplete votes of this size now that they are filled in.
        keep = np.isnan(filledVotes).sum(1) != n_missing
        counts = counts[keep]
        filledVotes = filledVotes[keep, :]

    return filledVotes, counts
Example #3
0
def extract_unique_n_groups(n, X):
    """Extract unique nth tuples from data assuming 0's are missing data.

    Parameters
    ----------
    n : int
        Size of subgroups.
    X : ndarray
        n_samples,n_dim

    Returns
    -------
    ndarray
        Each row holds n column indices that are all nonzero in at least one
        row of X, deduplicated with unique_rows. Has shape (0, n) when no row
        of X has at least n nonzero entries.
    """
    from itertools import combinations

    # Rows of X picked by unique_rows on the nonzero mask; presumably one
    # representative per distinct pattern of present entries.
    patterns = X[unique_rows(X != 0)]

    # For every pattern with at least n present columns, list all of its
    # size-n subsets (exactly one subset when it has n columns).
    subsets = []
    for row in patterns:
        present = np.where(row != 0)[0]
        if len(present) >= n:
            subsets.extend(combinations(present, n))

    # np.vstack raises on an empty list, so report "no groups" explicitly.
    if not subsets:
        return np.zeros((0, n), dtype=np.intp)

    ix = np.vstack(subsets)
    return ix[unique_rows(ix)]
Example #4
0
    def disc_state(self, X):
        """
        Return a mapping of the discretized states to integers
        2016-12-09

        Each column of X is binned against its own set of edges, and the
        resulting rows of bin labels are passed to unique_rows.

        Params:
        -------
        X (ndarray)
            n_samples x n_dim

        Returns:
        --------
        Result of unique_rows(..., return_inverse=True) on the array of bin
        labels; presumably one integer label per row of X.
        """
        from bisect import bisect

        # Called before self.edges is read; presumably this is what sets it.
        self._estimate_pdf(X)
        disclabel = np.zeros_like(X, dtype=int)
        for j in range(self.ndim):
            # Bin column j with the edges of dimension j. Previously every
            # pass rewrote the whole row, so only the last dimension's edges
            # had any effect.
            edges = self.edges[j]
            # NOTE(review): hi caps the label at len(edges) - 2; confirm this
            # is the intended top bin index.
            hi = len(edges) - 2
            disclabel[:, j] = [bisect(edges, x_, hi=hi) for x_ in X[:, j]]
        return unique_rows(disclabel, return_inverse=True)
Example #5
0
def enumerate_states(x):
    """
    Label the rows of data array x with integers.

    Delegates to unique_rows with return_inverse=True and hands back its
    result unchanged, so that each row gets an integer label.
    2016-03-25
    """
    labels = unique_rows(x, return_inverse=True)
    return labels