def get_state_probs(v, allstates=None, weights=None, normalized=True):
    """Get the probability (or weighted count) of each state.

    Parameters
    ----------
    v : ndarray
        Observed states, one per row (n_samples x n_dim). A 1-D array is
        treated as n_samples one-dimensional states.
    allstates : ndarray, optional
        States to tally, one per row. When omitted, the unique rows of `v`
        (as selected by `unique_rows`) are tallied and also returned.
    weights : array_like, optional
        Weight of each sample in `v`. Defaults to a weight of one per sample.
        Weights are honored whether or not `allstates` is given.
    normalized : bool, optional
        If True (default), divide the tallies by their sum so that they add
        up to one over the tallied states. If False, return the raw
        (weighted) counts.

    Returns
    -------
    freq : ndarray
        Probability or (weighted) count of each state, in the order of
        `allstates`.
    allstates : ndarray
        Only returned when `allstates` was not supplied by the caller.

    Warns
    -----
    UserWarning
        When `allstates` is supplied and the tallied weight does not add up
        to the total weight, i.e. some samples match none of the listed
        states.
    """
    if v.ndim == 1:
        v = v[:, None]
    n = v.shape[1]

    if allstates is None:
        allstates = v[unique_rows(v)]
        labels = unique_rows(v, return_inverse=True)
        # bincount sums the weights per label; with weights=None it simply
        # counts occurrences.
        freq = np.bincount(labels, weights=weights)
        return_all_states = True
    else:
        return_all_states = False
        if weights is None:
            weights = np.ones(v.shape[0])

        freq = np.zeros(allstates.shape[0])
        for j, state in enumerate(allstates):
            # A sample matches only if all n of its components agree.
            matches = (state == v).sum(1) == n
            freq[j] = (matches * weights).sum()

        if not np.isclose(np.sum(freq), np.sum(weights)):
            import warnings
            warnings.warn("States not found in given list of all states.")

    if normalized:
        freq = freq.astype(float) / np.sum(freq)

    if return_all_states:
        return freq, allstates
    return freq
def fill_in_vote(votes):
    """Complete a data set of votes in which missing entries are NaN.

    Votes are processed in order of increasing number of missing entries
    (one missing, then two, ...), which is meant to maintain the original
    means. For every incomplete vote, the rows that are fully observed on
    the missing positions define a (count-weighted) distribution over the
    possible completions. One completed copy of the vote is appended per
    possible completion, carrying that completion's probability as its
    fractional count. Once all votes with a given number of missing entries
    have been handled, those incomplete rows are dropped.

    Rows in which every entry is missing are never filled in.

    Parameters
    ----------
    votes : ndarray
        Array of votes, one vote per row, NaN marking a missing entry.

    Returns
    -------
    filledVotes : ndarray
        Votes after filling in, one per row.
    counts : ndarray
        Fractional count of each row of `filledVotes`.

    Raises
    ------
    Exception
        If no row is fully observed on the positions a vote is missing.
    """
    from misc_fcns import unique_rows

    filled = votes.copy()
    counts = np.ones(votes.shape[0])

    for n_missing in range(1, votes.shape[1]):
        # Snapshot of the votes with exactly n_missing gaps; rows appended
        # below are not revisited by this loop.
        incomplete = filled[np.isnan(filled).sum(axis=1) == n_missing, :]

        for vote in incomplete:
            missing = np.argwhere(np.isnan(vote)).flatten()

            # Rows (including previously appended completions) that are fully
            # observed on the positions this vote is missing.
            on_missing = filled[:, missing]
            observed = (~np.isnan(on_missing)).sum(axis=1) == n_missing
            sub_votes = on_missing[observed, :]
            if sub_votes.shape[0] == 0:
                raise Exception(
                    "There is insufficient data to measure subset.")

            uniq_sub = sub_votes[unique_rows(sub_votes), :]
            p = get_state_probs(sub_votes, uniq_sub, weights=counts[observed])

            # One completed copy of the vote per possible completion, each
            # weighted by the probability of that completion.
            completions = np.tile(vote, (p.size, 1))
            completions[:, missing] = uniq_sub
            filled = np.concatenate((filled, completions), axis=0)
            counts = np.concatenate((counts, p))

        # Drop the incomplete votes that have just been filled in.
        keep = np.isnan(filled).sum(axis=1) != n_missing
        counts = counts[keep]
        filled = filled[keep, :]

    return filled, counts
def extract_unique_n_groups(n, X):
    """Extract the unique index groups of size n from data where 0 means missing.

    For each distinct pattern of nonzero entries among the rows of X, the
    indices of the nonzero columns are collected. Patterns with exactly n
    nonzero columns contribute that group; patterns with more than n
    contribute every size-n combination of their columns. Patterns with
    fewer than n nonzero columns contribute nothing.

    Parameters
    ----------
    n : int
        Size of subgroups.
    X : ndarray
        n_samples, n_dim

    Returns
    -------
    ndarray
        One row per unique group of n column indices, as selected by
        `unique_rows`.
    """
    from itertools import combinations

    def _groups_of_size_n(cols):
        # Yield every size-n group contained in one set of column indices.
        if len(cols) > n:
            yield from combinations(cols, n)
        elif len(cols) == n:
            yield cols

    patterns = X[unique_rows(X != 0)]
    groups = []
    for row in patterns:
        groups.extend(_groups_of_size_n(np.nonzero(row != 0)[0]))

    stacked = np.vstack(groups)
    return stacked[unique_rows(stacked)]
def disc_state(self, X):
    """Return a mapping of the discretized states to integers.

    Each coordinate of each sample is binned with the edges of its own
    dimension, and every distinct combination of bin indices is then mapped
    to an integer label via `unique_rows`.

    NOTE(review): assumes `self.edges[j]` is the sorted sequence of bin edges
    for dimension j and that it is populated by `self._estimate_pdf(X)` --
    confirm against that method.

    Parameters
    ----------
    X : ndarray
        n_samples x n_dim

    Returns
    -------
    Whatever `unique_rows(..., return_inverse=True)` returns for the array of
    per-dimension bin indices (presumably one integer label per sample).
    """
    from bisect import bisect

    self._estimate_pdf(X)

    disclabel = np.zeros_like(X, dtype=int)
    for j in range(self.ndim):
        edges = self.edges[j]
        # Cap the search so the returned bin index never exceeds len(edges) - 2.
        hi = len(edges) - 2
        for i, x in enumerate(X):
            # Bin coordinate j with dimension j's own edges. Previously the
            # whole row was overwritten on every pass over j, so only the
            # last dimension's edges were ever applied.
            disclabel[i, j] = bisect(edges, x[j], hi=hi)

    return unique_rows(disclabel, return_inverse=True)
def enumerate_states(x):
    """Label the rows of a data array so that identical rows share a label.

    Parameters
    ----------
    x : ndarray
        Data array with one state per row.

    Returns
    -------
    The result of `unique_rows(x, return_inverse=True)`.
    """
    labels = unique_rows(x, return_inverse=True)
    return labels