Example #1
def find_non_unique_ids(ids: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Takes the ID array ids, and returns two new arrays.
    
    These arrays contain the non-unique IDs, and their
    positions in the original ids array, respectively.
    """

    args = ids.argsort()
    mask = np.empty(args.shape, dtype=bool)
    sorted_ids = ids[args]

    # By definition, the first element should not be a repeat of itself.
    mask[0] = False
    # The actual duplicate check; works because sorted_ids is, well, sorted.
    mask[1:] = sorted_ids[1:] == sorted_ids[:-1]

    # sorted_ids may be very large and we no longer need it
    del sorted_ids

    # Now we need to put everything back where it belongs.
    position_corrected_mask = np.empty_like(mask)
    position_corrected_mask[args] = mask

    # Now we can compress everything to a nice integer array
    duplicate_positions = np.where(position_corrected_mask)[0]
    duplicate_ids = ids[duplicate_positions]

    return duplicate_ids, duplicate_positions
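A quick check of the behavior (a minimal sketch; note that the first occurrence of each repeated value is not reported):

import numpy as np

ids = np.array([3, 1, 3, 2, 1])
dup_ids, dup_positions = find_non_unique_ids(ids)
print(dup_ids)        # [3 1]
print(dup_positions)  # [2 4]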
Example #2
def __get_ranks(weights: np.ndarray) -> np.ndarray:
    # Normalize in place (note: this mutates the caller's array); the
    # normalization does not change the ranking, which is scale-invariant.
    weights /= np.linalg.norm(weights, ord=1)
    temp = weights.argsort()
    ranks = np.empty_like(temp)
    # Scatter 1-based ranks back onto the original positions.
    ranks[temp] = np.arange(len(weights)) + 1
    return ranks
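A quick check (rank 1 goes to the smallest weight; the input array is modified in place by the normalization):

import numpy as np

w = np.array([0.2, 0.5, 0.3])
print(__get_ranks(w))  # [1 3 2]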
Example #3
def _compute_ranks(values: np.ndarray) -> np.ndarray:
    """
    Returns ranks in [0, len(values)).

    Note: this differs from scipy.stats.rankdata, which returns ranks in
    [1, len(values)] and averages the ranks of tied values.
    """
    assert values.ndim == 1
    ranks = np.empty(len(values), dtype=int)
    # Scatter 0-based positions back through the sort permutation.
    ranks[values.argsort()] = np.arange(len(values))
    return ranks
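A side-by-side check (assuming SciPy is available):

import numpy as np
from scipy.stats import rankdata

x = np.array([3.0, 1.0, 2.0])
print(_compute_ranks(x))        # [2 0 1]
print(rankdata(x).astype(int))  # [3 1 2]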
Example #4
def rank_array(array: np.ndarray) -> np.ndarray:
    """
    Rank the values of a 1d input array.

    :param array: 1d array-like of comparable values
    :return: integer array of 0-based ranks; the smallest value gets rank 0
    """
    array = np.array(array)
    order = array.argsort()
    # argsort of the argsort yields each element's rank (the "double
    # argsort" idiom); equivalent to the scatter approach above, at the
    # cost of a second O(n log n) sort.
    ranks = order.argsort()
    return ranks
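For example (a quick sanity check):

import numpy as np

print(rank_array([30, 10, 20]))  # [2 0 1]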
Example #5
def retrieveStacks(patches: list, dissimilarityMatrix: np.ndarray,
                   n2: int) -> list:
    '''
    Description
    ----------

        Looks for patches that have not been put into a stack yet, retrieves
        their n2 - 1 most similar patches, and puts them together into a new
        stack.

    Parameters
    ----------
    patches : list
        List of 2d patch arrays, all of the same shape.
    dissimilarityMatrix : np.ndarray
        Pairwise dissimilarity between patches; column j is sorted to find
        the patches most similar to patch j.
    n2 : int
        Number of patches per stack, the reference patch included.

    Returns
    -------
    list of the stacks, each an array of shape (height, width, n2)

    '''
    # -- Convert patches to an array locally to ease indexing
    patches = np.array(patches)

    # -- Retrieve indices of the n2 - 1 most similar patches for every patch
    # (assumes the diagonal keeps a patch from matching itself)
    similarPatchesIndexes = dissimilarityMatrix.argsort(axis=0)[:n2 - 1]

    # -- stackCount gives how many times a given patch has been put into a stack
    stackCount = [0] * len(patches)
    stacks = []

    while True:
        try:
            # -- Update of the stack; this part could be optimized
            patchIndex = stackCount.index(0)
            patch = patches[patchIndex]
            similarPatches = list(patches[similarPatchesIndexes[:,
                                                                patchIndex]])
            similarPatches.append(patch)
            similarPatches = np.transpose(np.array(similarPatches), (1, 2, 0))
            stacks.append(similarPatches)

            # -- Update of stackCount; this part could be optimized too
            stackCount = np.array(stackCount)
            stackCount[patchIndex] += 1
            stackCount[similarPatchesIndexes[:, patchIndex]] += 1
            stackCount = list(stackCount)

        except ValueError:  # no more zeros in stackCount
            break

    return stacks
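A minimal usage sketch with synthetic 8x8 patches and a random symmetric dissimilarity matrix (all names below are illustrative, not from the source project):

import numpy as np

rng = np.random.default_rng(0)
patches = [rng.random((8, 8)) for _ in range(10)]
d = rng.random((10, 10))
dissimilarity = (d + d.T) / 2
# keep each patch out of its own list of most similar patches
np.fill_diagonal(dissimilarity, np.inf)

stacks = retrieveStacks(patches, dissimilarity, n2=4)
print(len(stacks), stacks[0].shape)  # each stack has shape (8, 8, 4)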
Example #6
def generate_arc_mask(marginals: np.ndarray, max_heads: int,
                      threshold: float = None):
    """
    Decode the scores generated by a pruner to generate an arc mask.

    :param marginals: array (h, m) with the marginal probability of each arc
    :param max_heads: maximum allowed head candidates for each modifier
    :param threshold: prune arcs (h, m) with a score lower than this value
        multiplied by the highest scoring arc (h', m) for each word m.
    :return: arc_mask, a boolean 2d array masking arcs. Position (h, m) is
        True if the arc is valid, False otherwise. A masked-out column is
        prepended for the root position, so input of shape (n + 1, n)
        yields output of shape (n + 1, n + 1).
    """
    n = len(marginals)

    # max_marginals contains each word's highest head probability
    max_marginals = marginals.max(0)

    # in some edge cases, the probabilities for all heads to a word are 0
    invalid = max_marginals <= 0
    if np.any(invalid):
        num_tokens = invalid.sum()
        msg = 'All heads have zero probability for %d token(s)' % num_tokens
        logger.info(msg)
        max_marginals[invalid] = 1

    abs_threshold = 0 if threshold is None else threshold * max_marginals

    if n > max_heads:
        # clip values below the top k
        sorted_inds = marginals.argsort(0)
        np.put_along_axis(marginals, sorted_inds[:-max_heads], 0, 0)

    # marginals → (n + 1, n)
    # max_marginals → (n)
    mask = marginals >= abs_threshold

    # allow all heads where all had 0 probability (anything goes!)
    mask[:, invalid] = True

    # prepend a masked-out column for the root position: (n + 1, n + 1)
    mask = np.concatenate([np.zeros([n, 1], dtype=bool), mask], 1)

    return mask
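A minimal usage sketch (assumed setup: a module-level logger, and marginals for a 5-word sentence with one row per candidate head, root included):

import logging
import numpy as np

logger = logging.getLogger(__name__)

marginals = np.random.default_rng(0).random((6, 5))
marginals /= marginals.sum(0)  # each modifier's head probabilities sum to 1
arc_mask = generate_arc_mask(marginals, max_heads=3, threshold=0.5)
print(arc_mask.shape)  # (6, 6)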
Example #7

def probability2label(arProbas: np.ndarray, oClasses: VideoClasses,
                      nTop: int = 3) -> Tuple[int, str, float]:
    """
    # Return
        3-tuple: predicted nLabel, sLabel, fProbability
        In addition, print the nTop most probable labels.
    """

    arTopLabels = arProbas.argsort()[-nTop:][::-1]
    arTopProbas = arProbas[arTopLabels]

    for i in range(nTop):
        sClass = oClasses.dfClass.sClass[arTopLabels[i]] + " " + oClasses.dfClass.sDetail[arTopLabels[i]]
        print("Top %d: [%3d] %s (confidence %.1f%%)" % \
            (i+1, arTopLabels[i], sClass, arTopProbas[i]*100.))
        
    return arTopLabels[0], oClasses.dfClass.sDetail[arTopLabels[0]], arTopProbas[0]
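A minimal sketch; VideoClasses is stubbed here with a pandas DataFrame exposing the sClass and sDetail columns the function expects (an assumption about the real class):

import numpy as np
import pandas as pd

class FakeClasses:
    dfClass = pd.DataFrame({"sClass": ["c001", "c002", "c003"],
                            "sDetail": ["wave", "jump", "run"]})

nLabel, sLabel, fProb = probability2label(np.array([0.1, 0.7, 0.2]),
                                          FakeClasses(), nTop=2)
print(nLabel, sLabel, fProb)  # 1 jump 0.7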
Example #8
def nonlinearity_of_linear_regressor(X: numpy.ndarray,
                                     y: numpy.ndarray,
                                     model: LinearRegression = None,
                                     random_state: int = None,
                                     metric: str = 'mse') -> float:
    """
    Calculate the non-linearity of a linear regressor.

    Parameters
    ----------
    X : numpy.ndarray
        2d-array with feature columns.
    y : numpy.ndarray
        Array of response values.
    model : LinearRegression
        Ordinary least squares model between X and y, in case a trained
        model is already available.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `numpy.random`.
    metric : str, optional (default='mse')
        Error calculation metric.

    Return
    ------
    float:
        Normalized mean error.
    """
    # check if is dataframe
    if isinstance(X, pandas.DataFrame):
        X = X.values

    # check if y is dataframe or series
    if isinstance(y, pandas.DataFrame) or isinstance(y, pandas.Series):
        y = y.values

    _, cat_idx = check_cat(X)
    X_ = numpy.delete(X, cat_idx, axis=1)
    model = LinearRegression().fit(X_, y) if model is None else model

    seed_ = check_random_state(random_state)
    seed(seed_)
    n, m = X.shape
    y = y.flatten()
    idx_sorted_y = y.argsort()
    X_sorted = X[idx_sorted_y, :]
    y_sorted = y[idx_sorted_y]
    i = 1
    X_list = list()
    y_list = list()

    while i < n:
        x_i_list = list()
        for j in range(m):
            uniques_values = numpy.unique(X_sorted[:, j])
            if len(uniques_values) <= 2:
                x_i_list.append(randint(0, 1))
            else:
                x_i_list.append(uniform(X_sorted[i, j], X_sorted[i - 1, j]))
        x_i = numpy.array(x_i_list)
        y_i = numpy.array([uniform(y_sorted[i], y_sorted[i - 1])])

        X_list.append(x_i)
        y_list.append(y_i)
        i = i + 1

    X_ = numpy.array(X_list)
    y_ = numpy.array(y_list)
    # The model was fit without the categorical columns, so project the
    # interpolated points onto those same columns before predicting.
    X_ = numpy.delete(X_, cat_idx, axis=1)
    error = model.predict(X_).reshape((n - 1, )) - y_.reshape((n - 1, ))

    return compute_metric(error, metric)
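A minimal usage sketch on synthetic data (check_cat, compute_metric and the random helpers seed/randint/uniform are module-level imports assumed to be available alongside this function):

import numpy
from sklearn.linear_model import LinearRegression

rng = numpy.random.RandomState(0)
X = rng.rand(50, 3)
y = 2.0 * X[:, 0] - X[:, 1] + 0.1 * rng.rand(50)

print(nonlinearity_of_linear_regressor(X, y, random_state=0))  # should be small for (nearly) linear data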
Example #9
def nonlinearity_of_nn_regressor(X: numpy.ndarray,
                                 y: numpy.ndarray,
                                 random_state: int = None,
                                 metric: str = 'mae') -> float:
    """
    Non-linearity of a nearest neighbor regressor.

    Parameters
    ----------
    X : numpy.ndarray
        2d-array with feature columns.
    y : numpy.ndarray
        Array of response values.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `numpy.random`.
    metric : str, optional (default='mae')
        Error calculation metric.

    Return
    ------
    float:
        Normalized 1-NN error.
    """
    # check if is dataframe
    if isinstance(X, pandas.DataFrame):
        X = X.values

    # check if y is dataframe or series
    if isinstance(y, pandas.DataFrame) or isinstance(y, pandas.Series):
        y = y.values

    seed_ = check_random_state(random_state)
    seed(seed_)
    tree = KDTree(X)
    n, m = X.shape
    y = y.flatten()
    idx_sorted_y = y.argsort()
    X_sorted = X[idx_sorted_y, :]
    y_sorted = y[idx_sorted_y]
    i = 1
    X_list = list()
    y_list = list()

    while i < n:
        x_i_list = list()
        for j in range(m):
            uniques_values = numpy.unique(X_sorted[:, j])
            if len(uniques_values) <= 2:
                x_i_list.append(randint(0, 1))
            else:
                x_i_list.append(uniform(X_sorted[i, j], X_sorted[i - 1, j]))
        x_i = numpy.array(x_i_list)
        y_i = numpy.array([uniform(y_sorted[i], y_sorted[i - 1])])

        X_list.append(x_i)
        y_list.append(y_i)
        i = i + 1

    X_ = numpy.array(X_list)
    y_ = numpy.array(y_list)

    # For each interpolated point, compare the response of its nearest
    # original neighbor with the interpolated response.
    _, nearest_ind = tree.query(X_, k=1)
    error = numpy.array(
        [y[int(nearest_ind[i])] - y_[i] for i in range(y_.shape[0])])

    return compute_metric(error, metric)
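Usage mirrors the linear variant (a minimal sketch; KDTree and the module helpers are assumed to be imported at module level):

import numpy

rng = numpy.random.RandomState(1)
X = rng.rand(40, 2)
y = numpy.sin(3 * X[:, 0]) + X[:, 1] ** 2

print(nonlinearity_of_nn_regressor(X, y, random_state=1))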
Example #10
File: common.py Project: gbrunow/de
def best(population: np.ndarray, evaluation: np.ndarray, n: int = 1) -> np.ndarray:
    # Individuals are stored as columns; select the n columns with the
    # lowest evaluation values.
    selected = population[:, evaluation.argsort()[:n]]

    return selected
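For example (a quick check; lower evaluation is better here):

import numpy as np

population = np.arange(12).reshape(3, 4)     # 4 individuals as columns
evaluation = np.array([0.9, 0.1, 0.5, 0.3])
print(best(population, evaluation, n=2))     # columns 1 and 3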
Example #11
def get_performance_vs_uncertainty(y_true: np.ndarray,
                                   y_pred: np.ndarray,
                                   y_unc: np.ndarray,
                                   y_axis_label: str,
                                   performance_fn: callable = cross_entropy,
                                   performance_fn_args: dict = None):
    """Compute how the uncertainty relates to model performance.

    Predictions are sorted by increasing uncertainty, and the performance
    function is evaluated on growing prefixes of that ordering.

    Parameters
    ----------
    y_true: np.ndarray
        True labels
    y_pred: np.ndarray
        Predictions
    y_unc: np.ndarray
        Uncertainties
    y_axis_label: str
        Plot Y-axis label
    performance_fn: callable
        Performance function used
    performance_fn_args: dict
        Arguments passed to the performance function

    Returns
    -------
    tuple of (percentages, performances)
        Percentage of data retained at each step and the corresponding
        performance values.
    """
    try:
        # Drop a trailing singleton axis if present.
        y_unc = y_unc.squeeze(-1)

    except ValueError:
        pass

    if y_unc.ndim == 2:
        y_unc = y_unc.mean(-1)

    elif y_unc.ndim > 2:
        raise ValueError(f"Invalid uncertainty shape: {y_unc.shape}")

    if y_true.ndim != 1:
        raise ValueError("Y-true not one-dimensional")

    # Placeholder
    if performance_fn_args is None:
        performance_fn_args = {}

    order = y_unc.argsort()

    sorted_uncertainties = y_unc[order]
    sorted_labels = y_true[order]
    sorted_predictions = y_pred[order]

    # Start at the first index where both classes have occurred, but use at
    # least 64 samples.
    first_index = max(64, np.argwhere(sorted_labels != sorted_labels[0])[0][0])
    performances = []
    percentages = []

    for i in range(first_index + 1, len(sorted_uncertainties)):
        selected_labels = sorted_labels[:i]
        selected_predictions = sorted_predictions[:i]

        percentages.append(100 * len(selected_predictions) / len(y_pred))

        performances.append(
            performance_fn(selected_labels, selected_predictions,
                           **performance_fn_args))

    return percentages, performances
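A minimal sketch with synthetic binary labels; a simple accuracy function stands in for the module's default cross_entropy:

import numpy as np

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, 200)
y_pred = rng.random(200)
y_unc = rng.random(200)

accuracy = lambda t, p: float(np.mean((p > 0.5) == t))
percentages, performances = get_performance_vs_uncertainty(
    y_true, y_pred, y_unc, "accuracy", performance_fn=accuracy)
print(percentages[-1])  # 99.5: nearly all data retained at the last step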
Example #12
def sort_eigenstuff(e: np.ndarray, v: np.ndarray) -> tuple:
    """Utility func: sort eigenvalues and eigenvectors by descending eigenvalue."""
    argsort = e.argsort()[::-1]
    return e[argsort], v[:, argsort]
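For example, reordering the ascending output of np.linalg.eigh:

import numpy as np

e, v = np.linalg.eigh(np.array([[2.0, 1.0], [1.0, 2.0]]))
e, v = sort_eigenstuff(e, v)
print(e)  # [3. 1.]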
Example #13
def __top(model: np.ndarray, n: int = 1):
    # Indices of the n largest scores in descending order, skipping zeros.
    return [i for i in model.argsort()[-n:][::-1] if model[i] != 0.0]
Example #14
def get_max_k_entropies_index(entropies: np.ndarray, k: int) -> np.ndarray:
    # Indices of the k largest entropies, in descending order.
    return entropies.argsort()[-k:][::-1]
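For example:

import numpy as np

entropies = np.array([0.1, 0.9, 0.4, 0.7])
print(get_max_k_entropies_index(entropies, k=2))  # [1 3]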