def get_token_matches(self):
    """
    retrieves all matched intervals; verifies each paired span contains
    the same text, then decomposes the spans into paired tokens,
    removing duplicates in the process
    :return: List[Tuple[int, int]]
        ints -> token position in the respective docs:
        hyp[hyp_token].i, ref[ref_token].i
    """
    # merge the four sorted interval lists pairwise into one sorted list
    x = list(merge_sorted(self.matched_dict[0], self.matched_dict[1]))
    y = list(merge_sorted(x, self.matched_dict[2]))
    z = list(merge_sorted(y, self.matched_dict[3]))
    matched_spans = z

    token_matches = []
    for hyp_match, ref_match in matched_spans:
        assert hyp_match.text == ref_match.text
        m1_tokens = self.get_token_idxs(hyp_match)
        m2_tokens = self.get_token_idxs(ref_match)
        for hyp_tok, ref_tok in zip(m1_tokens, m2_tokens):
            matched = (hyp_tok, ref_tok)
            # keep only new pairs that preserve monotonic token order
            if matched not in token_matches and is_sorted(token_matches + [matched]):
                token_matches.append(matched)
    return token_matches
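# Hedged sketch exercising get_token_matches with minimal stand-ins.  Span,
# ToyAligner, and this is_sorted helper are invented for illustration; only
# merge_sorted (assumed to come from toolz) matches the code above.
from collections import namedtuple
from toolz import merge_sorted

Span = namedtuple("Span", ["text", "tokens"])  # tokens: token indices covered

def is_sorted(pairs):
    # hypothetical helper: True if the pair list is in nondecreasing order
    return all(a <= b for a, b in zip(pairs, pairs[1:]))

class ToyAligner:
    def __init__(self, matched_dict):
        self.matched_dict = matched_dict

    def get_token_idxs(self, span):
        return span.tokens

ToyAligner.get_token_matches = get_token_matches  # reuse the method above

pair = (Span("the cat", [0, 1]), Span("the cat", [3, 4]))
aligner = ToyAligner({0: [pair], 1: [], 2: [], 3: []})
print(aligner.get_token_matches())  # -> [(0, 3), (1, 4)]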
def merge_and_compress_summaries(vals_and_weights):
    """Merge and sort percentile summaries that are already sorted.

    Each item is a tuple like ``(vals, weights)`` where vals and weights
    are lists.  We sort both by vals.

    Equal values will be combined, their weights summed together.
    """
    vals_and_weights = [x for x in vals_and_weights if x]
    if not vals_and_weights:
        return ()
    it = merge_sorted(*[zip(x, y) for x, y in vals_and_weights])
    vals = []
    weights = []
    vals_append = vals.append
    weights_append = weights.append
    val, weight = prev_val, prev_weight = next(it)
    for val, weight in it:
        if val == prev_val:
            prev_weight += weight
        else:
            vals_append(prev_val)
            weights_append(prev_weight)
            prev_val, prev_weight = val, weight
    if val == prev_val:
        vals_append(prev_val)
        weights_append(prev_weight)
    return vals, weights
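# Hedged usage sketch: the two input summaries below are invented.
# merge_sorted is assumed to come from toolz, as in dask's own imports.
from toolz import merge_sorted

summaries = [([1, 3, 5], [2, 1, 1]),   # (vals, weights), both sorted by vals
             ([1, 2, 5], [1, 4, 2])]
vals, weights = merge_and_compress_summaries(summaries)
# vals    -> [1, 2, 3, 5]
# weights -> [3, 4, 1, 3]   (weights of equal values are summed)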
def align_partitions(*dfs):
    """ Mutually partition and align DataFrame blocks

    This serves as precursor to multi-dataframe operations like join,
    concat, or merge.

    Parameters
    ----------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        Sequence of dataframes to be aligned on their index

    Returns
    -------
    dfs: sequence of dd.DataFrame, dd.Series and dd.base.Scalar
        These must have consistent divisions with each other
    divisions: tuple
        Full divisions sequence of the entire result
    result: list
        A list of lists of keys that show which data exist on which
        divisions
    """
    _is_broadcastable = partial(is_broadcastable, dfs)
    dfs1 = [df for df in dfs
            if isinstance(df, _Frame) and not _is_broadcastable(df)]
    if len(dfs) == 0:
        raise ValueError("dfs contains no DataFrame or Series")
    if not all(df.known_divisions for df in dfs1):
        raise ValueError("Not all divisions are known, can't align "
                         "partitions. Please use `set_index` "
                         "to set the index.")

    divisions = list(unique(merge_sorted(*[df.divisions for df in dfs1])))
    if len(divisions) == 1:  # single value for index
        divisions = (divisions[0], divisions[0])

    dfs2 = [df.repartition(divisions, force=True)
            if isinstance(df, _Frame) else df for df in dfs]

    result = list()
    inds = [0 for df in dfs]
    for d in divisions[:-1]:
        L = list()
        for i, df in enumerate(dfs2):
            if isinstance(df, _Frame):
                j = inds[i]
                divs = df.divisions
                if j < len(divs) - 1 and divs[j] == d:
                    L.append((df._name, inds[i]))
                    inds[i] += 1
                else:
                    L.append(None)
            else:  # Scalar has no divisions
                L.append(None)
        result.append(L)
    return dfs2, tuple(divisions), result
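# Hedged usage sketch: align_partitions and its helpers (_Frame,
# is_broadcastable) live inside dask.dataframe's internals, so this assumes a
# dask environment; the frame sizes and partition counts are illustrative.
import pandas as pd
import dask.dataframe as dd

a = dd.from_pandas(pd.DataFrame({"x": range(8)}), npartitions=2)
b = dd.from_pandas(pd.DataFrame({"y": range(8)}), npartitions=4)
(a2, b2), divisions, parts = align_partitions(a, b)
# a2 and b2 are repartitioned onto the union of both division sequences, so
# partition i of a2 covers the same index range as partition i of b2;
# `parts` lists, per division, the (name, partition index) each input holds.
assert a2.divisions == b2.divisions == divisions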
def align_partitions(*dfs):
    """ Mutually partition and align DataFrame blocks

    This serves as precursor to multi-dataframe operations like join,
    concat, or merge.

    Parameters
    ----------
    dfs: sequence of dd.DataFrames
        Sequence of dataframes to be aligned on their index

    Returns
    -------
    dfs: sequence of dd.DataFrames
        These DataFrames have consistent divisions with each other
    divisions: tuple
        Full divisions sequence of the entire result
    result: list
        A list of lists of keys that show which dataframes exist on which
        divisions
    """
    divisions = list(unique(merge_sorted(*[df.divisions for df in dfs])))
    divisionss = [tuple(divisions) for df in dfs]
    dfs2 = [df.repartition(div, force=True)
            for df, div in zip(dfs, divisionss)]

    result = list()
    inds = [0 for df in dfs]
    for d in divisions[:-1]:
        L = list()
        for i in range(len(dfs)):
            j = inds[i]
            divs = dfs2[i].divisions
            if j < len(divs) - 1 and divs[j] == d:
                L.append((dfs2[i]._name, inds[i]))
                inds[i] += 1
            else:
                L.append(None)
        result.append(L)
    return dfs2, tuple(divisions), result
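# A minimal illustration of the core step shared by both align_partitions
# versions above: merging per-frame division tuples into one sorted,
# de-duplicated sequence.  The division tuples are invented; merge_sorted
# and unique are assumed to come from toolz.
from toolz import merge_sorted, unique

divisions_a = (0, 4, 8)
divisions_b = (0, 2, 8)
print(list(unique(merge_sorted(divisions_a, divisions_b))))  # [0, 2, 4, 8]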
def merge_percentiles(finalq, qs, vals, Ns, interpolation='lower'):
    """ Combine several percentile calculations of different data.

    Parameters
    ----------
    finalq : numpy.array
        Percentiles to compute (must use same scale as ``qs``).
    qs : sequence of numpy.arrays
        Percentiles calculated on different sets of data.
    vals : sequence of numpy.arrays
        Resulting values associated with percentiles ``qs``.
    Ns : sequence of integers
        The number of data elements associated with each data set.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
        Specify the type of interpolation to use to calculate final
        percentiles.  For more information, see numpy.percentile.

    Examples
    --------
    >>> finalq = [10, 20, 30, 40, 50, 60, 70, 80]
    >>> qs = [[20, 40, 60, 80], [20, 40, 60, 80]]
    >>> vals = [np.array([1, 2, 3, 4]), np.array([10, 11, 12, 13])]
    >>> Ns = [100, 100]  # Both original arrays had 100 elements
    >>> merge_percentiles(finalq, qs, vals, Ns)
    array([ 1,  2,  3,  4, 10, 11, 12, 13])
    """
    if isinstance(finalq, Iterator):
        finalq = list(finalq)
    finalq = np.array(finalq)
    qs = list(map(list, qs))
    vals = list(vals)
    Ns = list(Ns)

    L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
    if not L:
        raise ValueError("No non-trivial arrays found")
    qs, vals, Ns = L

    # TODO: Perform this check above in percentile once dtype checking is easy
    #       Here we silently change meaning
    if str(vals[0].dtype) == 'category':
        result = merge_percentiles(finalq, qs, [v.codes for v in vals], Ns,
                                   interpolation)
        import pandas as pd
        return pd.Categorical.from_codes(result, vals[0].categories,
                                         vals[0].ordered)
    if not np.issubdtype(vals[0].dtype, np.number):
        interpolation = 'nearest'

    if len(vals) != len(qs) or len(Ns) != len(qs):
        raise ValueError('qs, vals, and Ns parameters must be the same length')

    # transform qs and Ns into number of observations between percentiles
    counts = []
    for q, N in zip(qs, Ns):
        count = np.empty(len(q))
        count[1:] = np.diff(q)
        count[0] = q[0]
        count *= N
        counts.append(count)

    # Sort by calculated percentile values, then number of observations.
    # >95% of the time in this function is spent in `merge_sorted` below.
    # An alternative that uses numpy sort is shown.  It is sometimes
    # comparable to, but typically slower than, `merge_sorted`.
    #
    # >>> A = np.concatenate(map(np.array, map(zip, vals, counts)))
    # >>> A.sort(0, kind='mergesort')
    combined_vals_counts = merge_sorted(*map(zip, vals, counts))
    combined_vals, combined_counts = zip(*combined_vals_counts)

    combined_vals = np.array(combined_vals)
    combined_counts = np.array(combined_counts)

    # percentile-like, but scaled by total number of observations
    combined_q = np.cumsum(combined_counts)

    # rescale finalq percentiles to match combined_q
    desired_q = finalq * sum(Ns)

    # the behavior of different interpolation methods should be
    # investigated further.
    if interpolation == 'linear':
        rv = np.interp(desired_q, combined_q, combined_vals)
    else:
        left = np.searchsorted(combined_q, desired_q, side='left')
        right = np.searchsorted(combined_q, desired_q, side='right') - 1
        np.minimum(left, len(combined_vals) - 1, left)  # don't exceed max index
        lower = np.minimum(left, right)
        upper = np.maximum(left, right)
        if interpolation == 'lower':
            rv = combined_vals[lower]
        elif interpolation == 'higher':
            rv = combined_vals[upper]
        elif interpolation == 'midpoint':
            rv = 0.5 * (combined_vals[lower] + combined_vals[upper])
        elif interpolation == 'nearest':
            lower_residual = np.abs(combined_q[lower] - desired_q)
            upper_residual = np.abs(combined_q[upper] - desired_q)
            mask = lower_residual > upper_residual
            index = lower  # alias; we no longer need lower
            index[mask] = upper[mask]
            rv = combined_vals[index]
        else:
            raise ValueError("interpolation can only be 'linear', 'lower', "
                             "'higher', 'midpoint', or 'nearest'")
    return rv
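# Hedged usage sketch (all inputs invented): a summary backed by three times
# as many observations pulls the merged percentiles toward its values.
import numpy as np

finalq = np.array([25, 50, 75])
qs = [[25, 50, 75], [25, 50, 75]]
vals = [np.array([10., 20., 30.]), np.array([40., 50., 60.])]
Ns = [100, 300]
print(merge_percentiles(finalq, qs, vals, Ns, interpolation='linear'))
# -> approximately [33.33, 46.67, 60.]; with the default
#    interpolation='lower' the result snaps to values present in `vals`.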