Example #1
    def get_token_matches(self):
        """
        Retrieve all matched spans, verify that each paired span
        contains the same text, and decompose the spans into paired
        tokens, dropping duplicates in the process.

        :return: List[Tuple[int, int]] -- token positions in the
            respective docs, i.e. (hyp[hyp_token].i, ref[ref_token].i)
        """

        # merge_sorted accepts any number of sorted iterables, so the
        # four match lists can be merged in a single call.
        matched_spans = list(
            merge_sorted(*(self.matched_dict[i] for i in range(4)))
        )

        token_matches = []
        for hyp_match, ref_match in matched_spans:
            assert hyp_match.text == ref_match.text
            m1_tokens = self.get_token_idxs(hyp_match)
            m2_tokens = self.get_token_idxs(ref_match)
            for hyp_tok, ref_tok in zip(m1_tokens, m2_tokens):
                matched = (hyp_tok, ref_tok)
                if matched not in token_matches and is_sorted(token_matches +
                                                              [matched]):
                    token_matches.append(matched)
        return token_matches
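For context, ``merge_sorted`` in these examples is the toolz/cytoolz utility that lazily merges already-sorted iterables into a single sorted iterator. A minimal sketch of its behavior (the inputs are illustrative):

from toolz import merge_sorted  # cytoolz.merge_sorted is a drop-in equivalent

# Merging several pre-sorted sequences yields one sorted stream
# without re-sorting the combined data.
list(merge_sorted([1, 4, 7], [2, 5, 8], [3, 6, 9]))
# [1, 2, 3, 4, 5, 6, 7, 8, 9]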
Example #2
def merge_and_compress_summaries(vals_and_weights):
    """Merge and sort percentile summaries that are already sorted.

    Each item is a tuple like ``(vals, weights)`` where vals and weights
    are lists.  We sort both by vals.

    Equal values will be combined, their weights summed together.
    """
    vals_and_weights = [x for x in vals_and_weights if x]
    if not vals_and_weights:
        return ()
    it = merge_sorted(*[zip(x, y) for x, y in vals_and_weights])
    vals = []
    weights = []
    vals_append = vals.append
    weights_append = weights.append
    # Seed both the current and the previous (val, weight) pair from
    # the first merged item.
    val, weight = prev_val, prev_weight = next(it)
    for val, weight in it:
        if val == prev_val:
            prev_weight += weight
        else:
            vals_append(prev_val)
            weights_append(prev_weight)
            prev_val, prev_weight = val, weight
    # Flush the final pending pair.
    if val == prev_val:
        vals_append(prev_val)
        weights_append(prev_weight)
    return vals, weights
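A hypothetical call, assuming the (vals, weights) input shape the docstring describes (the numbers are illustrative):

# Two already-sorted summaries; the duplicate value 3 has its
# weights (0.5 and 1.0) summed in the merged result.
summaries = [([1, 3, 5], [1.0, 0.5, 2.0]),
             ([2, 3], [1.5, 1.0])]
vals, weights = merge_and_compress_summaries(summaries)
# vals    == [1, 2, 3, 5]
# weights == [1.0, 1.5, 1.5, 2.0]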
Example #3
def align_partitions(*dfs):
    """ Mutually partition and align DataFrame blocks

    This serves as precursor to multi-dataframe operations like join, concat,
    or merge.

    Parameters
    ----------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        Sequence of dataframes to be aligned on their index

    Returns
    -------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        These must have consistent divisions with each other
    divisions: tuple
        Full divisions sequence of the entire result
    result: list
        A list of lists of keys that show which data exist on which
        divisions
    """
    _is_broadcastable = partial(is_broadcastable, dfs)
    dfs1 = [
        df for df in dfs
        if isinstance(df, _Frame) and not _is_broadcastable(df)
    ]
    if not dfs1:
        raise ValueError("dfs contains no DataFrame or Series")
    if not all(df.known_divisions for df in dfs1):
        raise ValueError("Not all divisions are known, can't align "
                         "partitions. Please use `set_index` "
                         "to set the index.")

    divisions = list(unique(merge_sorted(*[df.divisions for df in dfs1])))
    if len(divisions) == 1:  # single value for index
        divisions = (divisions[0], divisions[0])
    dfs2 = [
        df.repartition(divisions, force=True) if isinstance(df, _Frame) else df
        for df in dfs
    ]

    result = list()
    inds = [0 for df in dfs]
    for d in divisions[:-1]:
        L = list()
        for i, df in enumerate(dfs2):
            if isinstance(df, _Frame):
                j = inds[i]
                divs = df.divisions
                if j < len(divs) - 1 and divs[j] == d:
                    L.append((df._name, inds[i]))
                    inds[i] += 1
                else:
                    L.append(None)
            else:  # Scalar has no divisions
                L.append(None)
        result.append(L)
    return dfs2, tuple(divisions), result
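A hedged usage sketch, assuming a dask.dataframe environment (the frames, column names, and partition counts are illustrative):

import pandas as pd
import dask.dataframe as dd

a = dd.from_pandas(pd.DataFrame({'x': range(10)}), npartitions=2)
b = dd.from_pandas(pd.DataFrame({'y': range(10)}), npartitions=3)

# Both frames are repartitioned onto the union of their divisions,
# so blocks at the same position cover the same index ranges.
dfs2, divisions, parts = align_partitions(a, b)
a2, b2 = dfs2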
Example #4
def align_partitions(*dfs):
    """ Mutually partition and align DataFrame blocks

    This serves as precursor to multi-dataframe operations like join, concat,
    or merge.

    Parameters
    ----------
    dfs: sequence of dd.DataFrames
        Sequence of dataframes to be aligned on their index

    Returns
    -------
    dfs: sequence of dd.DataFrames
        These DataFrames have consistent divisions with each other
    divisions: tuple
        Full divisions sequence of the entire result
    result: list
        A list of lists of keys that show which dataframes exist on which
        divisions
    """
    divisions = list(unique(merge_sorted(*[df.divisions for df in dfs])))
    divisions_tuple = tuple(divisions)
    dfs2 = [df.repartition(divisions_tuple, force=True) for df in dfs]

    result = list()
    inds = [0 for df in dfs]
    for d in divisions[:-1]:
        L = list()
        for i in range(len(dfs)):
            j = inds[i]
            divs = dfs2[i].divisions
            if j < len(divs) - 1 and divs[j] == d:
                L.append((dfs2[i]._name, inds[i]))
                inds[i] += 1
            else:
                L.append(None)
        result.append(L)
    return dfs2, tuple(divisions), result
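The divisions line above is where merge_sorted does its work: the sorted division tuples of all frames are merged into one sorted sequence and de-duplicated. A minimal sketch with illustrative boundaries:

from toolz import merge_sorted, unique

divs_a = (0, 5, 10)
divs_b = (0, 3, 7, 10)
# Union of the two sorted boundary sequences, duplicates dropped
# while preserving sorted order.
combined = list(unique(merge_sorted(divs_a, divs_b)))
# [0, 3, 5, 7, 10]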
Example #5
def merge_percentiles(finalq, qs, vals, Ns, interpolation='lower'):
    """ Combine several percentile calculations of different data.

    Parameters
    ----------

    finalq : numpy.array
        Percentiles to compute (must use same scale as ``qs``).
    qs : sequence of numpy.arrays
        Percentiles calculated on different sets of data.
    vals : sequence of numpy.arrays
        Resulting values associated with percentiles ``qs``.
    Ns : sequence of integers
        The number of data elements associated with each data set.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Specify the type of interpolation to use to calculate final
        percentiles.  For more information, see numpy.percentile.

    Examples
    --------

    >>> finalq = [10, 20, 30, 40, 50, 60, 70, 80]
    >>> qs = [[20, 40, 60, 80], [20, 40, 60, 80]]
    >>> vals = [np.array([1, 2, 3, 4]), np.array([10, 11, 12, 13])]
    >>> Ns = [100, 100]  # Both original arrays had 100 elements

    >>> merge_percentiles(finalq, qs, vals, Ns)
    array([ 1,  2,  3,  4, 10, 11, 12, 13])
    """
    if isinstance(finalq, Iterator):
        finalq = list(finalq)
    finalq = np.array(finalq)
    qs = list(map(list, qs))
    vals = list(vals)
    Ns = list(Ns)

    # Drop datasets that contributed no elements (N == 0).
    L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
    if not L:
        raise ValueError("No non-trivial arrays found")
    qs, vals, Ns = L

    # TODO: Perform this check above in percentile once dtype checking is easy
    #       Here we silently change meaning
    if str(vals[0].dtype) == 'category':
        result = merge_percentiles(finalq, qs, [v.codes for v in vals], Ns,
                                   interpolation)
        import pandas as pd
        return pd.Categorical.from_codes(result, vals[0].categories,
                                         vals[0].ordered)
    if not np.issubdtype(vals[0].dtype, np.number):
        interpolation = 'nearest'

    if len(vals) != len(qs) or len(Ns) != len(qs):
        raise ValueError('qs, vals, and Ns parameters must be the same length')

    # transform qs and Ns into number of observations between percentiles
    counts = []
    for q, N in zip(qs, Ns):
        count = np.empty(len(q))
        count[1:] = np.diff(q)
        count[0] = q[0]
        count *= N
        counts.append(count)

    # Sort by calculated percentile values, then number of observations.
    # >95% of the time in this function is spent in `merge_sorted` below.
    # An alternative that uses numpy sort is shown.  It is sometimes
    # comparable to, but typically slower than, `merge_sorted`.
    #
    # >>> A = np.concatenate(map(np.array, map(zip, vals, counts)))
    # >>> A.sort(0, kind='mergesort')

    combined_vals_counts = merge_sorted(*map(zip, vals, counts))
    combined_vals, combined_counts = zip(*combined_vals_counts)

    combined_vals = np.array(combined_vals)
    combined_counts = np.array(combined_counts)

    # percentile-like, but scaled by total number of observations
    combined_q = np.cumsum(combined_counts)

    # rescale finalq percentiles to match combined_q
    desired_q = finalq * sum(Ns)

    # the behavior of different interpolation methods should be
    # investigated further.
    if interpolation == 'linear':
        rv = np.interp(desired_q, combined_q, combined_vals)
    else:
        left = np.searchsorted(combined_q, desired_q, side='left')
        right = np.searchsorted(combined_q, desired_q, side='right') - 1
        # Clip in place so the index never exceeds the last valid position.
        np.minimum(left, len(combined_vals) - 1, out=left)
        lower = np.minimum(left, right)
        upper = np.maximum(left, right)
        if interpolation == 'lower':
            rv = combined_vals[lower]
        elif interpolation == 'higher':
            rv = combined_vals[upper]
        elif interpolation == 'midpoint':
            rv = 0.5 * (combined_vals[lower] + combined_vals[upper])
        elif interpolation == 'nearest':
            lower_residual = np.abs(combined_q[lower] - desired_q)
            upper_residual = np.abs(combined_q[upper] - desired_q)
            mask = lower_residual > upper_residual
            index = lower  # alias; we no longer need lower
            index[mask] = upper[mask]
            rv = combined_vals[index]
        else:
            raise ValueError("interpolation can only be 'linear', 'lower', "
                             "'higher', 'midpoint', or 'nearest'")
    return rv
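To make the counts transformation above concrete (illustrative numbers; the snippet assumes numpy is imported as np, as in the function body):

import numpy as np

q, N = [20, 40, 60, 80], 100
count = np.empty(len(q))
count[1:] = np.diff(q)  # percentile mass between consecutive cut points
count[0] = q[0]         # mass below the first cut point
count *= N              # scale by dataset size, same scale as finalq
# count == array([2000., 2000., 2000., 2000.])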