Python max_number_of_pairs Examples

Programming Language: Python

Namespace/Package Name: recordlinkage.utils

Method/Function: max_number_of_pairs

Examples at hotexamples.com: 3

Python max_number_of_pairs - 3 examples found. These are the top-rated real-world Python examples of recordlinkage.utils.max_number_of_pairs, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: indexing.py Project: luyang1210/recordL

    def maximum_number_of_pairs(self):
        """Return the maximum number of record pairs.

        Delegates to ``max_number_of_pairs``: with both ``self.df_a`` and
        ``self.df_b`` when linking two frames, or with ``self.df_a`` alone
        when ``self.deduplication`` is truthy.
        """
        # Linking case: both data sets take part in the count.
        if not self.deduplication:
            return max_number_of_pairs(self.df_a, self.df_b)

        # Deduplication case: only the single data set is passed on.
        return max_number_of_pairs(self.df_a)

Example #2

Show file

File: indexing.py Project: luyang1210/recordL

def _random_small_link(df_a, df_b, n):
    """Draw ``n`` distinct random record pairs between ``df_a`` and ``df_b``.

    Row positions are sampled (with replacement) from both inputs, combined
    into a ``pandas.MultiIndex`` and de-duplicated. Sampling repeats until at
    least ``n`` distinct pairs have been collected.

    Parameters
    ----------
    df_a, df_b
        Presumably pandas DataFrames (TODO confirm); only ``len()``,
        ``.index.values`` and ``.index.name`` are used here.
    n : int
        Number of pairs to return. Must satisfy ``0 < n <= n_max`` where
        ``n_max = max_number_of_pairs(df_a, df_b)``.

    Returns
    -------
    pandas.MultiIndex
        The first ``n`` distinct pairs that were sampled.

    Raises
    ------
    ValueError
        If ``n`` is not an ``int`` or lies outside ``0 < n <= n_max``.
    """

    n_max = max_number_of_pairs(df_a, df_b)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    levels = [df_a.index.values, df_b.index.values]
    names = [df_a.index.name, df_b.index.name]

    # Initialize an empty pandas MultiIndex.
    # NOTE: the ``labels`` keyword was renamed to ``codes`` in pandas 0.24
    # and removed in pandas 1.0, so ``codes`` must be used here.
    pairs = pandas.MultiIndex(levels=levels, codes=[[], []], names=names)

    # Run as long as the number of pairs is less than the requested number
    # of pairs n.
    while len(pairs) < n:

        # The number of pairs to sample (sample twice as many record pairs
        # as are still missing, because the duplicates are dropped).
        n_sample = (n - len(pairs)) * 2
        sample_a = numpy.random.randint(len(df_a), size=n_sample)
        sample_b = numpy.random.randint(len(df_b), size=n_sample)

        # Make a pandas MultiIndex of the sample above; the sampled
        # positions are used directly as codes into ``levels``.
        pairs_sample = pandas.MultiIndex(
            levels=levels, codes=[sample_a, sample_b], names=names
        )

        pairs = pairs.append(pairs_sample).drop_duplicates()

    # More than n distinct pairs may have been collected; keep the first n.
    return pairs[0:n]

Example #3

Show file

File: indexing.py Project: luyang1210/recordL

def _random_large_link(df_a, df_b, n):
    """Pick ``n`` random record pairs out of the full index.

    The index returned by ``_fullindex_link(df_a, df_b)`` is built first and
    ``n`` of its positions are then drawn without replacement.

    Raises
    ------
    ValueError
        If ``n`` is not an ``int`` or does not satisfy ``0 < n <= n_max``,
        with ``n_max = max_number_of_pairs(df_a, df_b)``.
    """

    n_max = max_number_of_pairs(df_a, df_b)

    # Reject anything that is not an int within the allowed range.
    if not (isinstance(n, int) and 0 < n <= n_max):
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    full_index = _fullindex_link(df_a, df_b)

    # Draw n distinct positions into the full index.
    positions = numpy.arange(len(full_index))
    chosen = numpy.random.choice(positions, n, replace=False)

    return full_index[chosen]