def find_non_unique_ids(ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Find the repeated entries of a one-dimensional ID array.

    The first occurrence of every ID is treated as the original; each later
    occurrence of the same ID is reported as a duplicate.

    Parameters
    ----------
    ids : np.ndarray
        One-dimensional array of IDs. May be empty.

    Returns
    -------
    duplicate_ids : np.ndarray
        The IDs found at ``duplicate_positions``.
    duplicate_positions : np.ndarray
        Ascending indices into ``ids`` of every entry that repeats an
        earlier one.
    """
    # A stable sort keeps equal IDs in their original relative order, so the
    # entry flagged below is never the first occurrence of an ID.
    args = ids.argsort(kind="stable")
    sorted_ids = ids[args]

    # Zero-initialised: the first sorted element can never repeat a previous
    # one, and this also keeps the function valid for an empty input.
    mask = np.zeros(args.shape, dtype=bool)
    # The actual duplicate check; neighbours are equal only for repeated IDs.
    mask[1:] = sorted_ids[1:] == sorted_ids[:-1]

    # sorted_ids may be very large and is no longer needed.
    del sorted_ids

    # Scatter the flags back to the positions of the unsorted array.
    position_corrected_mask = np.empty_like(mask)
    position_corrected_mask[args] = mask

    # Compress the boolean mask into an integer index array.
    duplicate_positions = np.where(position_corrected_mask)[0]
    duplicate_ids = ids[duplicate_positions]

    return duplicate_ids, duplicate_positions
def __get_ranks(weights: np.array) -> np.array:
    """
    Rank the entries of ``weights`` from 1 (smallest) to ``len(weights)``.

    The weights are divided by their L1 norm before sorting. The division is
    done on a temporary, so the caller's array is left untouched and integer
    input is accepted.

    :param weights: array of weights
    :return: integer array where entry ``i`` is the 1-based rank of
        ``weights[i]``
    """
    # Out-of-place on purpose: ``weights /= ...`` would overwrite the caller's
    # data and raises a casting error for integer arrays.
    normalised = weights / np.linalg.norm(weights, ord=1)
    order = normalised.argsort()
    ranks = np.empty_like(order)
    # order[r] is the index of the r-th smallest weight, so invert it.
    ranks[order] = np.arange(len(weights)) + 1
    return ranks
def _compute_ranks(values: np.array):
    """
    Return the zero-based rank of every entry of a 1-d array.

    The smallest value gets rank 0 and the largest rank ``len(values) - 1``.

    Note: scipy.stats.rankdata differs from this; its ranks start at 1 and
    run up to len(x).
    """
    assert values.ndim == 1
    size = len(values)
    # ascending[r] is the index of the r-th smallest value.
    ascending = values.argsort()
    ranks = np.empty(size, dtype=int)
    # Invert the permutation: the element at ascending[r] has rank r.
    ranks[ascending] = np.arange(size)
    return ranks
def rank_array(array: np.array) -> np.array:
    """
    Compute the zero-based rank of each element of a 1d input.

    :param array: array-like of values to rank; it is copied into a new
        NumPy array first
    :return: integer array where entry ``i`` is the position ``array[i]``
        takes in ascending sorted order
    """
    values = np.array(array)
    # The argsort of an argsort is the inverse permutation, i.e. the ranks.
    sort_order = np.argsort(values)
    return np.argsort(sort_order)
def retrieveStacks(patches: list, dissimilarityMatrix: np.array, n2: int) -> list:
    '''
    Description
    ----------
    Walks through the patches in order. Every patch that is not yet part of
    a stack starts a new one, made of the n2 - 1 patches with the smallest
    dissimilarity in its column of dissimilarityMatrix, followed by the
    patch itself.

    Parameters
    ----------
    list : patches
        Two-dimensional patches that all share one shape.
    np.array : dissimilarityMatrix
        Square matrix; column j holds the dissimilarity of every patch to
        patch j. NOTE(review): the neighbours are taken straight from the
        argsort, so a patch is its own neighbour unless its diagonal entry
        is large - confirm how callers build the matrix.
    int : n2
        Number of patches per stack (n2 - 1 neighbours plus the patch).

    Returns
    -------
    list of the stacks, each an array of shape (height, width, n2)

    Raises
    ------
    ValueError
        If the patches cannot be stacked and transposed as 2-d images.
        Such errors used to be swallowed and produced a truncated result.
    '''
    # -- Local change of patches to ease computations
    patches = np.array(patches)

    # -- Row r, column j: index of the r-th least dissimilar patch to patch j
    similarPatchesIndexes = dissimilarityMatrix.argsort(axis=0)[:n2 - 1]

    # -- Marks the patches that already belong to at least one stack
    alreadyStacked = np.zeros(len(patches), dtype=bool)

    stacks = []
    # -- A patch is never un-marked, so the "first patch not stacked yet"
    # -- only moves forward and one pass over the indexes is enough
    for patchIndex in range(len(patches)):
        if alreadyStacked[patchIndex]:
            continue

        neighbourIndexes = similarPatchesIndexes[:, patchIndex]
        stackMembers = np.array([*patches[neighbourIndexes], patches[patchIndex]])
        # -- (n2, height, width) -> (height, width, n2)
        stacks.append(np.transpose(stackMembers, (1, 2, 0)))

        alreadyStacked[patchIndex] = True
        alreadyStacked[neighbourIndexes] = True

    return stacks
def generate_arc_mask(marginals: np.array, max_heads: int, threshold: float = None):
    """
    Decode the scores generated by a pruner to generate an arc mask.

    The input array is not modified.

    :param marginals: array (h, m) with the marginal probability of each arc
    :param max_heads: maximum allowed head candidates for modifier; enforced
        whether or not a threshold is given
    :param threshold: prune arcs (h, m) with a score lower than this value
        multiplied by the highest scoring arc (h', m) for each word m.
        With None, no score-based pruning is applied.
    :return: boolean 2d array masking arcs. It has one row per row of
        `marginals` and one more column than `marginals`: an all-False
        column is prepended. Position (h, m + 1) is True if arc (h, m) is
        valid, False otherwise. Words for which every head has zero
        probability keep all their heads.
    """
    n = len(marginals)

    # max_marginals contains the highest head probability for each word
    max_marginals = marginals.max(0)

    # in some edge cases, the probabilities for all heads to a word are 0
    invalid = max_marginals <= 0
    if np.any(invalid):
        num_tokens = invalid.sum()
        logger.info('All heads have zero probability for %d token(s)',
                    num_tokens)
        # avoid a zero threshold for these words; they are re-enabled below
        max_marginals[invalid] = 1

    abs_threshold = 0 if threshold is None else threshold * max_marginals
    mask = marginals >= abs_threshold

    if n > max_heads:
        # Keep only the max_heads best heads per word. A separate boolean
        # mask is used instead of zeroing the scores: zeroed scores would
        # still pass a ">= 0" test when there is no threshold, and zeroing
        # would overwrite the caller's array.
        sorted_inds = marginals.argsort(0)
        top_k = np.zeros(marginals.shape, dtype=bool)
        # n - max_heads (rather than -max_heads) is also right for 0 heads
        np.put_along_axis(top_k, sorted_inds[n - max_heads:], True, 0)
        mask &= top_k

    # allow all heads where all had 0 probability (anything goes!)
    mask[:, invalid] = True

    # prepend an all-False column (np.bool no longer exists; use bool)
    mask = np.concatenate([np.zeros([n, 1], dtype=bool), mask], 1)

    return mask
def probability2label(arProbas:np.array, oClasses:"VideoClasses", nTop:int = 3) -> (int, str, float):
    """ Return 3-tuple for the most probable class: nLabel (index into
    arProbas), the matching oClasses.dfClass.sDetail entry, fProbability.

    In addition print the nTop most probable labels, or all of them when
    there are fewer than nTop classes. Nothing is printed for nTop <= 0.
    """

    # all labels, most probable first
    arOrder = arProbas.argsort()[::-1]
    dfClass = oClasses.dfClass

    # slicing (instead of indexing up to nTop) cannot run past the last class
    for nRank, nLabel in enumerate(arOrder[:max(nTop, 0)], start=1):
        sClass = dfClass.sClass[nLabel] + " " + dfClass.sDetail[nLabel]
        print("Top %d: [%3d] %s (confidence %.1f%%)" % \
            (nRank, nLabel, sClass, arProbas[nLabel]*100.))

    nBest = arOrder[0]
    return nBest, dfClass.sDetail[nBest], arProbas[nBest]
def nonlinearity_of_linear_regressor(X: numpy.array, y: numpy.array, model: LinearRegression = None, random_state: int = None, metric: str = 'mse') -> float:
    """
    Calculate the non-linearity of a linear regressor

    Synthetic samples are drawn between every pair of rows that are
    consecutive when sorted by response value, and the model's error on
    those samples is summarised by `compute_metric`.

    Parameters
    ----------
    X : numpy.array
        2d-array with features columns
    y : numpy.array
        Array of response values
    model : LinearRegression
        Already trained model. When None, a LinearRegression is fit on X
        with the columns reported by `check_cat` removed.
    random_state : int, RandomState instance or None, optional (default=None)
        Passed to `check_random_state`; the result seeds the random draws.
    metric: str, optional (default='mse')
        Error calculation metric, forwarded to `compute_metric`.

    Return
    ------
    float:
        Value of `compute_metric` for the prediction errors on the
        synthetic samples.
    """
    # accept pandas containers by unwrapping them
    if isinstance(X, pandas.DataFrame):
        X = X.values
    if isinstance(y, (pandas.DataFrame, pandas.Series)):
        y = y.values

    _, cat_idx = check_cat(X)
    X_numeric = numpy.delete(X, cat_idx, axis=1)
    # NOTE(review): a model fitted here sees X without the check_cat columns,
    # while the synthetic samples below keep all m columns - confirm that
    # predict() receives the feature count it was trained with.
    if model is None:
        model = LinearRegression().fit(X_numeric, y)

    # NOTE(review): presumably `seed` accepts what check_random_state
    # returns - verify against the imports of this module.
    seed_ = check_random_state(random_state)
    seed(seed_)

    n, m = X.shape
    y = y.flatten()
    idx_sorted_y = y.argsort()
    X_sorted = X[idx_sorted_y, :]
    y_sorted = y[idx_sorted_y]

    # Loop-invariant, so computed once per column instead of once per
    # (row, column) pair: columns with at most two distinct values get a
    # random 0/1 draw instead of an interpolated value.
    is_two_valued = [len(numpy.unique(X_sorted[:, j])) <= 2 for j in range(m)]

    X_list = []
    y_list = []
    # one synthetic sample between each pair of consecutive sorted rows; the
    # order of the random draws (features left to right, then y) is kept
    for i in range(1, n):
        x_i = numpy.array([
            randint(0, 1) if is_two_valued[j]
            else uniform(X_sorted[i, j], X_sorted[i - 1, j])
            for j in range(m)
        ])
        y_i = numpy.array([uniform(y_sorted[i], y_sorted[i - 1])])
        X_list.append(x_i)
        y_list.append(y_i)

    X_synth = numpy.array(X_list)
    y_synth = numpy.array(y_list)

    error = model.predict(X_synth).reshape((n - 1, )) - y_synth.reshape(
        (n - 1, ))
    return compute_metric(error, metric)
def nonlinearity_of_nn_regressor(X: numpy.array, y: numpy.array, random_state: int = None, metric: str = 'mae') -> float:
    """
    Non-linearity of nearest neighbor regressor.

    Synthetic samples are drawn between every pair of rows that are
    consecutive when sorted by response value. Each sample is given the
    response of its nearest row of X, and the resulting errors are
    summarised by `compute_metric`.

    Parameters
    ----------
    X : numpy.array
        2d-array with features columns.
    y : numpy.array
        Array of response values.
    random_state : int, RandomState instance or None, optional (default=None)
        Passed to `check_random_state`; the result seeds the random draws.
    metric: str, optional (default='mae')
        Error calculation metric, forwarded to `compute_metric`.

    Return
    ------
    float:
        Value of `compute_metric` for the 1-NN errors on the synthetic
        samples.
    """
    # accept pandas containers by unwrapping them
    if isinstance(X, pandas.DataFrame):
        X = X.values
    if isinstance(y, (pandas.DataFrame, pandas.Series)):
        y = y.values

    # NOTE(review): presumably `seed` accepts what check_random_state
    # returns - verify against the imports of this module.
    seed_ = check_random_state(random_state)
    seed(seed_)

    tree = KDTree(X)
    n, m = X.shape
    y = y.flatten()
    idx_sorted_y = y.argsort()
    X_sorted = X[idx_sorted_y, :]
    y_sorted = y[idx_sorted_y]

    # Loop-invariant, so computed once per column instead of once per
    # (row, column) pair: columns with at most two distinct values get a
    # random 0/1 draw instead of an interpolated value.
    is_two_valued = [len(numpy.unique(X_sorted[:, j])) <= 2 for j in range(m)]

    X_list = []
    y_list = []
    # one synthetic sample between each pair of consecutive sorted rows; the
    # order of the random draws (features left to right, then y) is kept
    for i in range(1, n):
        x_i = numpy.array([
            randint(0, 1) if is_two_valued[j]
            else uniform(X_sorted[i, j], X_sorted[i - 1, j])
            for j in range(m)
        ])
        y_i = numpy.array([uniform(y_sorted[i], y_sorted[i - 1])])
        X_list.append(x_i)
        y_list.append(y_i)

    X_synth = numpy.array(X_list)
    y_synth = numpy.array(y_list)

    _, nearest_ind = tree.query(X_synth, k=1)
    # Flatten the neighbour indexes so the lookup works whether query()
    # returns them as (n - 1, 1) or (n - 1,), and without calling int() on
    # a one-element array (deprecated by NumPy).
    nearest_ind = numpy.asarray(nearest_ind, dtype=int).reshape(-1)
    error = y[nearest_ind].reshape(y_synth.shape) - y_synth
    return compute_metric(error, metric)
def best(population: np.array, evaluation: np.array, n: int = 1) -> np.array:
    """
    Select the columns of ``population`` with the lowest evaluation.

    :param population: 2d array with one individual per column
    :param evaluation: 1d array with one value per column of ``population``
    :param n: number of columns to select
    :return: the selected columns, ordered by ascending evaluation
    """
    ranking = evaluation.argsort()
    winners = ranking[:n]
    return population[:, winners]
def get_performance_vs_uncertainty(y_true: np.array, y_pred: np.array, y_unc: np.array, y_axis_label: str, performance_fn: callable = cross_entropy, performance_fn_args: dict = None, min_samples: int = 64):
    """Compute how model performance evolves as less certain samples are added.

    Samples are sorted by ascending uncertainty and the performance function
    is evaluated on growing prefixes of that ordering.

    Parameters
    ----------
    y_true: np.array
        True labels, one-dimensional
    y_pred: np.array
        Predictions
    y_unc: np.array
        Uncertainties. A trailing axis of length one is dropped; a remaining
        second axis is averaged.
    y_axis_label: str
        plot Y-axis label (not used by this function)
    performance_fn: callable
        Performance function used; called as
        ``performance_fn(labels, predictions, **performance_fn_args)``
    performance_fn_args: dict
        Arguments passed to performance function
    min_samples: int
        Smallest prefix size considered (default 64)

    Returns
    -------
    percentages: list
        Percentage of the samples included in each evaluated prefix
    performances: list
        Value of the performance function for each evaluated prefix

    Raises
    ------
    ValueError
        If the uncertainties have more than two axes after dropping a
        trailing axis of length one, or if y_true is not one-dimensional.
    """
    # Drop a trailing singleton axis. The result has to be assigned back:
    # squeeze() returns a new array and leaves y_unc itself unchanged.
    if y_unc.ndim > 1 and y_unc.shape[-1] == 1:
        y_unc = y_unc.squeeze(-1)

    if y_unc.ndim == 2:
        y_unc = y_unc.mean(-1)
    elif y_unc.ndim > 2:
        raise ValueError(f"Invalid uncertainty shape: {y_unc.shape}")

    if y_true.ndim != 1:
        raise ValueError("Y-true not one-dimensional")

    if performance_fn_args is None:
        performance_fn_args = {}

    order = y_unc.argsort()
    sorted_labels = y_true[order]
    sorted_predictions = y_pred[order]
    n_samples = len(order)

    percentages = []
    performances = []
    if n_samples == 0:
        return percentages, performances

    # First index where a second label value has occurred, but never less
    # than min_samples. If every label is identical there is no such index,
    # so only the minimum prefix size applies.
    label_changes = np.argwhere(sorted_labels != sorted_labels[0])
    if len(label_changes) > 0:
        first_index = max(min_samples, label_changes[0][0])
    else:
        first_index = min_samples

    # NOTE(review): the range stops before n_samples, so the prefix holding
    # every sample (100 %) is never evaluated - confirm this is intended.
    for i in range(first_index + 1, n_samples):
        percentages.append(100 * i / len(y_pred))
        performances.append(
            performance_fn(sorted_labels[:i], sorted_predictions[:i],
                           **performance_fn_args))

    return percentages, performances
def sort_eigenstuff(e: array, v: array) -> tuple:
    """
    Utility func

    Sort eigenvalues in descending order and reorder the eigenvectors
    (the columns of v) to match.

    The order is the reverse of e.argsort(), i.e. it follows the values
    themselves rather than their absolute values.
    """
    ascending = e.argsort()
    descending = ascending[::-1]
    sorted_values = e[descending]
    sorted_vectors = v[:, descending]
    return sorted_values, sorted_vectors
def __top(model: np.array, n: int = 1):
    """
    Return the indices of the ``n`` largest entries of ``model``, largest
    first, leaving out entries that are exactly zero.

    :param model: 1d array of values
    :param n: number of top entries to consider; nothing is returned for
        ``n <= 0``
    :return: list of at most ``n`` indices
    """
    if n <= 0:
        # ``[-0:]`` would select the whole array instead of nothing.
        return []
    descending = model.argsort()[::-1][:n]
    return [i for i in descending if model[i] != 0.0]
def get_max_k_entropies_index(entropies: np.array, k: int):
    """
    Return the indices of the ``k`` largest entropies, largest first.

    :param entropies: 1d array of entropy values
    :param k: number of indices to return; an empty array is returned for
        ``k <= 0`` and all indices when ``k`` exceeds the array length
    :return: integer index array of length ``min(max(k, 0), len(entropies))``
    """
    # Reverse first and slice from the front: ``[-k:]`` would select the
    # whole array for k == 0.
    descending = entropies.argsort()[::-1]
    return descending[:max(k, 0)]