Python merge_sorted Beispiele, tlz.merge_sorted Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: partitionquantiles.py Projekt: LvdKnaap/BinPacking

def merge_and_compress_summaries(vals_and_weights):
    """Merge and sort percentile summaries that are already sorted.

    Each item is a tuple like ``(vals, weights)`` where vals and weights
    are lists.  We sort both by vals.

    Equal values will be combined, their weights summed together.
    """
    vals_and_weights = [x for x in vals_and_weights if x]
    if not vals_and_weights:
        return ()
    it = merge_sorted(*[zip(x, y) for x, y in vals_and_weights])
    vals = []
    weights = []
    vals_append = vals.append
    weights_append = weights.append
    val, weight = prev_val, prev_weight = next(it)
    for val, weight in it:
        if val == prev_val:
            prev_weight += weight
        else:
            vals_append(prev_val)
            weights_append(prev_weight)
            prev_val, prev_weight = val, weight
    if val == prev_val:
        vals_append(prev_val)
        weights_append(prev_weight)
    return vals, weights

Beispiel #2

0

Datei anzeigen

def align_partitions(*dfs):
    """ Mutually partition and align DataFrame blocks

    This serves as precursor to multi-dataframe operations like join, concat,
    or merge.

    Parameters
    ----------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        Sequence of dataframes to be aligned on their index

    Returns
    -------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        These must have consistent divisions with each other
    divisions: tuple
        Full divisions sequence of the entire result
    result: list
        A list of lists of keys that show which data exist on which
        divisions
    """
    _is_broadcastable = partial(is_broadcastable, dfs)
    dfs1 = [
        df for df in dfs
        if isinstance(df, _Frame) and not _is_broadcastable(df)
    ]
    if len(dfs) == 0:
        raise ValueError("dfs contains no DataFrame and Series")
    if not all(df.known_divisions for df in dfs1):
        raise ValueError("Not all divisions are known, can't align "
                         "partitions. Please use `set_index` "
                         "to set the index.")

    divisions = list(unique(merge_sorted(*[df.divisions for df in dfs1])))
    if len(divisions) == 1:  # single value for index
        divisions = (divisions[0], divisions[0])
    dfs2 = [
        df.repartition(divisions, force=True) if isinstance(df, _Frame) else df
        for df in dfs
    ]

    result = list()
    inds = [0 for df in dfs]
    for d in divisions[:-1]:
        L = list()
        for i, df in enumerate(dfs2):
            if isinstance(df, _Frame):
                j = inds[i]
                divs = df.divisions
                if j < len(divs) - 1 and divs[j] == d:
                    L.append((df._name, inds[i]))
                    inds[i] += 1
                else:
                    L.append(None)
            else:  # Scalar has no divisions
                L.append(None)
        result.append(L)
    return dfs2, tuple(divisions), result

Beispiel #3

0

Datei anzeigen

def merge_percentiles(finalq, qs, vals, interpolation="lower", Ns=None):
    """Combine several percentile calculations of different data.

    Parameters
    ----------

    finalq : numpy.array
        Percentiles to compute (must use same scale as ``qs``).
    qs : sequence of :class:`numpy.array`s
        Percentiles calculated on different sets of data.
    vals : sequence of :class:`numpy.array`s
        Resulting values associated with percentiles ``qs``.
    Ns : sequence of integers
        The number of data elements associated with each data set.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Specify the type of interpolation to use to calculate final
        percentiles.  For more information, see :func:`numpy.percentile`.

    Examples
    --------

    >>> finalq = [10, 20, 30, 40, 50, 60, 70, 80]
    >>> qs = [[20, 40, 60, 80], [20, 40, 60, 80]]
    >>> vals = [np.array([1, 2, 3, 4]), np.array([10, 11, 12, 13])]
    >>> Ns = [100, 100]  # Both original arrays had 100 elements

    >>> merge_percentiles(finalq, qs, vals, Ns=Ns)
    array([ 1,  2,  3,  4, 10, 11, 12, 13])
    """
    if isinstance(finalq, Iterator):
        finalq = list(finalq)
    finalq = np.array(finalq)
    qs = list(map(list, qs))
    vals = list(vals)
    if Ns is None:
        vals, Ns = zip(*vals)
    Ns = list(Ns)

    L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
    if not L:
        raise ValueError("No non-trivial arrays found")
    qs, vals, Ns = L

    # TODO: Perform this check above in percentile once dtype checking is easy
    #       Here we silently change meaning
    if vals[0].dtype.name == "category":
        result = merge_percentiles(finalq, qs, [v.codes for v in vals],
                                   interpolation, Ns)
        import pandas as pd

        return pd.Categorical.from_codes(result, vals[0].categories,
                                         vals[0].ordered)
    if not np.issubdtype(vals[0].dtype, np.number):
        interpolation = "nearest"

    if len(vals) != len(qs) or len(Ns) != len(qs):
        raise ValueError("qs, vals, and Ns parameters must be the same length")

    # transform qs and Ns into number of observations between percentiles
    counts = []
    for q, N in zip(qs, Ns):
        count = np.empty(len(q))
        count[1:] = np.diff(q)
        count[0] = q[0]
        count *= N
        counts.append(count)

    # Sort by calculated percentile values, then number of observations.
    # >95% of the time in this function is spent in `merge_sorted` below.
    # An alternative that uses numpy sort is shown.  It is sometimes
    # comparable to, but typically slower than, `merge_sorted`.
    #
    # >>> A = np.concatenate(map(np.array, map(zip, vals, counts)))
    # >>> A.sort(0, kind='mergesort')

    combined_vals_counts = merge_sorted(*map(zip, vals, counts))
    combined_vals, combined_counts = zip(*combined_vals_counts)

    combined_vals = np.array(combined_vals)
    combined_counts = np.array(combined_counts)

    # percentile-like, but scaled by total number of observations
    combined_q = np.cumsum(combined_counts)

    # rescale finalq percentiles to match combined_q
    desired_q = finalq * sum(Ns)

    # the behavior of different interpolation methods should be
    # investigated further.
    if interpolation == "linear":
        rv = np.interp(desired_q, combined_q, combined_vals)
    else:
        left = np.searchsorted(combined_q, desired_q, side="left")
        right = np.searchsorted(combined_q, desired_q, side="right") - 1
        np.minimum(left,
                   len(combined_vals) - 1, left)  # don't exceed max index
        lower = np.minimum(left, right)
        upper = np.maximum(left, right)
        if interpolation == "lower":
            rv = combined_vals[lower]
        elif interpolation == "higher":
            rv = combined_vals[upper]
        elif interpolation == "midpoint":
            rv = 0.5 * (combined_vals[lower] + combined_vals[upper])
        elif interpolation == "nearest":
            lower_residual = np.abs(combined_q[lower] - desired_q)
            upper_residual = np.abs(combined_q[upper] - desired_q)
            mask = lower_residual > upper_residual
            index = lower  # alias; we no longer need lower
            index[mask] = upper[mask]
            rv = combined_vals[index]
        else:
            raise ValueError("interpolation can only be 'linear', 'lower', "
                             "'higher', 'midpoint', or 'nearest'")
    return rv