Esempio n. 1
0
    def maximum_number_of_pairs(self):
        """Return the full index size for the frame(s) held on this object.

        When ``self.deduplication`` is truthy only ``self.df_a`` is handed
        to ``full_index_size``; otherwise both ``self.df_a`` and
        ``self.df_b`` are.
        """
        # Linking case first: two frames are involved.
        if not self.deduplication:
            return full_index_size(self.df_a, self.df_b)

        # Deduplication case: a single frame compared against itself.
        return full_index_size(self.df_a)
Esempio n. 2
0
    def _dedup_index(self, df_a):
        """Sample ``self.n`` random record pairs within a single frame.

        Parameters
        ----------
        df_a : object exposing ``len()`` and ``.index``
            Presumably a pandas.DataFrame (only its length and index
            values are used here).

        Returns
        -------
        pandas.MultiIndex
            Two-level index whose levels are both ``df_a.index.values``
            and whose codes are the sampled pair positions.

        Raises
        ------
        ValueError
            When sampling without replacement and ``self.n`` is not an
            int satisfying ``0 < n <= n_max``.
        """
        shape = (len(df_a),)

        if self.replace:
            # with replacement
            pairs = random_pairs_with_replacement(
                self.n, shape, self.random_state)
        else:
            # without replacement
            n_max = full_index_size(shape)

            if not isinstance(self.n, int) or self.n <= 0 or self.n > n_max:
                raise ValueError(
                    "n must be a integer satisfying 0<n<=%s" % n_max)

            if n_max < 1e6:
                # fewer than 1e6 candidate pairs: small-frames sampler
                pairs = random_pairs_without_replacement_small_frames(
                    self.n, shape, self.random_state)
            else:
                # 1e6 or more candidate pairs: large-frames sampler
                pairs = random_pairs_without_replacement_large_frames(
                    self.n, shape, self.random_state)

        index_values = df_a.index.values

        # ``codes`` is the current name of the former ``labels`` keyword
        # (renamed in pandas 0.24, ``labels`` removed in pandas 1.0).
        return pandas.MultiIndex(
            levels=[index_values, index_values],
            codes=pairs,
            verify_integrity=False
        )
Esempio n. 3
0
def _random_small_link(df_a, df_b, n):
    """Draw ``n`` distinct random record pairs between two frames.

    Pairs are drawn with ``numpy.random.randint`` (the global NumPy
    generator), duplicates are dropped, and drawing repeats until at least
    ``n`` distinct pairs exist; the first ``n`` are returned.

    Parameters
    ----------
    df_a, df_b : objects exposing ``len()`` and ``.index``
        Presumably pandas DataFrames.
    n : int
        Number of pairs requested; must satisfy ``0 < n <= n_max`` where
        ``n_max = full_index_size(df_a, df_b)``.

    Returns
    -------
    pandas.MultiIndex
        ``n`` distinct pairs, named after the two frames' index names.

    Raises
    ------
    ValueError
        If ``n`` is not an int in the allowed range.
    """
    n_max = full_index_size(df_a, df_b)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    levels = [df_a.index.values, df_b.index.values]
    names = [df_a.index.name, df_b.index.name]

    # Start from an empty MultiIndex. ``codes`` is the current name of the
    # former ``labels`` keyword (renamed in pandas 0.24, removed in 1.0).
    pairs = pandas.MultiIndex(levels=levels, codes=[[], []], names=names)

    # Keep drawing until enough distinct pairs have been collected.
    while len(pairs) < n:

        # Oversample by a factor of two because duplicates get dropped.
        n_sample = (n - len(pairs)) * 2
        sample_a = numpy.random.randint(len(df_a), size=n_sample)
        sample_b = numpy.random.randint(len(df_b), size=n_sample)

        pairs_sample = pandas.MultiIndex(levels=levels,
                                         codes=[sample_a, sample_b],
                                         names=names)

        pairs = pairs.append(pairs_sample).drop_duplicates()

    return pairs[0:n]
Esempio n. 4
0
    def _link_index(self, df_a, df_b):
        """Build the full cartesian-product index between two frames.

        Parameters
        ----------
        df_a, df_b : objects exposing ``.index``
            Presumably pandas DataFrames.

        Returns
        -------
        pandas.MultiIndex
            Product of ``df_a.index.values`` and ``df_b.index.values``,
            with level names taken from the two index names.

        A warning is logged when the full index size exceeds 1e7.
        """
        n_max = full_index_size((df_a, df_b))

        if n_max > 1e7:
            # ``logging.warn`` is a deprecated alias (removed in
            # Python 3.13); ``logging.warning`` is the supported call.
            logging.warning(
                "The number of record pairs is large. Consider a different "
                "indexation algorithm for better performance. ")

        return pandas.MultiIndex.from_product(
            [df_a.index.values, df_b.index.values],
            names=[df_a.index.name, df_b.index.name])
Esempio n. 5
0
def _random_large_link(df_a, df_b, n):
    """Pick ``n`` distinct entries from the full link index of two frames.

    The full index is obtained from ``_fullindex_link`` and ``n`` positions
    are chosen without replacement via ``numpy.random.choice`` (global
    NumPy generator).

    Raises ``ValueError`` when ``n`` is not an int satisfying
    ``0 < n <= full_index_size(df_a, df_b)``.
    """
    upper = full_index_size(df_a, df_b)

    if not isinstance(n, int) or n <= 0 or n > upper:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % upper)

    candidates = _fullindex_link(df_a, df_b)

    # Choose positions into the full index, then select those entries.
    positions = numpy.arange(len(candidates))
    chosen = numpy.random.choice(positions, n, replace=False)

    return candidates[chosen]
Esempio n. 6
0
    def _dedup_index(self, df_a):
        """Build the full index of record pairs within a single frame.

        Parameters
        ----------
        df_a : object exposing ``.index``
            Presumably a pandas.DataFrame.

        Returns
        -------
        pandas.MultiIndex
            Both levels are ``df_a.index.values``; the codes are the
            positions strictly above the diagonal, so each pair (i, j)
            has i < j.

        A warning is logged when the full index size exceeds 1e7.
        """
        n_max = full_index_size((df_a))

        if n_max > 1e7:
            # ``logging.warn`` is a deprecated alias (removed in
            # Python 3.13); ``logging.warning`` is the supported call.
            logging.warning(
                "The number of record pairs is large. Consider a different "
                "indexation algorithm for better performance. ")

        levels = [df_a.index.values, df_a.index.values]

        # k=1 skips the diagonal, so a record is never paired with itself.
        codes = numpy.triu_indices(len(df_a.index), k=1)

        # ``codes`` is the current name of the former ``labels`` keyword
        # (renamed in pandas 0.24, ``labels`` removed in pandas 1.0).
        return pandas.MultiIndex(levels=levels,
                                 codes=codes,
                                 verify_integrity=False)
Esempio n. 7
0
def _random_large_dedup(df_a, n, random_state=None):
    """Pick ``n`` distinct entries from the full dedup index of ``df_a``.

    NumPy's global random generator is seeded with ``random_state`` before
    anything else happens. The full index comes from ``_fullindex_dedup``
    and ``n`` positions are chosen without replacement.

    Raises ``ValueError`` when ``n`` is not an int satisfying
    ``0 < n <= full_index_size(df_a)``.
    """
    # Seeding comes first, ahead of the argument validation.
    numpy.random.seed(random_state)

    upper = full_index_size(df_a)
    if not isinstance(n, int) or n <= 0 or n > upper:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % upper)

    candidates = _fullindex_dedup(df_a)
    positions = numpy.arange(len(candidates))
    chosen = numpy.random.choice(positions, n, replace=False)

    return candidates[chosen]
Esempio n. 8
0
def random_pairs_with_replacement(n, shape, random_state=None):
    """Draw ``n`` random record pairs, allowing repeats.

    ``n`` flat positions are drawn from ``[0, n_max)`` with
    ``RandomState.randint`` and then converted to 2d positions: through
    ``_map_tril_1d_on_2d`` when ``shape`` has one entry, through
    ``np.unravel_index`` otherwise.

    ``random_state`` is used as-is when it is a ``np.random.RandomState``;
    anything else is passed to the ``RandomState`` constructor.

    Raises ``ValueError`` when ``full_index_size(shape)`` is not positive.
    """
    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    n_max = full_index_size(shape)
    if n_max <= 0:
        raise ValueError('n_max must be larger than 0')

    # Flat positions into the space of all candidate pairs.
    flat_positions = rng.randint(0, n_max, n)

    if len(shape) == 1:
        return _map_tril_1d_on_2d(flat_positions, shape[0])

    return np.array(np.unravel_index(flat_positions, shape))
Esempio n. 9
0
    def _link_index(self, df_a, df_b):
        """Sample ``self.n`` random record pairs between two frames.

        Parameters
        ----------
        df_a, df_b : objects exposing ``len()`` and ``.index``
            Presumably pandas DataFrames.

        Returns
        -------
        pandas.MultiIndex
            Levels are the two frames' index values, codes are the sampled
            pair positions, names are the two index names.

        Raises
        ------
        ValueError
            If ``self.n`` is not an int; if sampling with replacement and
            the full index size is 0; or if sampling without replacement
            and ``self.n`` does not satisfy ``0 < n <= n_max``.
        """
        shape = (len(df_a), len(df_b))
        n_max = full_index_size(shape)

        if not isinstance(self.n, int):
            raise ValueError('n must be an integer')

        if self.replace:
            # with replacement

            if n_max == 0:
                raise ValueError(
                    "one of the dataframes is empty")

            pairs = random_pairs_with_replacement(
                self.n, shape, self.random_state)

        else:
            # without replacement

            if self.n <= 0 or self.n > n_max:
                raise ValueError(
                    "n must be a integer satisfying 0<n<=%s" % n_max)

            if n_max < 1e6:
                # fewer than 1e6 candidate pairs: small-frames sampler
                pairs = random_pairs_without_replacement_small_frames(
                    self.n, shape, self.random_state)
            else:
                # 1e6 or more candidate pairs: large-frames sampler
                pairs = random_pairs_without_replacement_large_frames(
                    self.n, shape, self.random_state)

        levels = [df_a.index.values, df_b.index.values]
        names = [df_a.index.name, df_b.index.name]

        # ``codes`` is the current name of the former ``labels`` keyword
        # (renamed in pandas 0.24, ``labels`` removed in pandas 1.0).
        return pandas.MultiIndex(
            levels=levels,
            codes=pairs,
            names=names,
            verify_integrity=False
        )
Esempio n. 10
0
    def _link_index(self, df_a, df_b):
        """Sample ``self.n`` random record pairs between two frames.

        With ``self.replace`` set, pairs come from
        ``random_pairs_with_replacement``. Otherwise one of two samplers
        is chosen: ``random_pairs_without_replacement`` when the full
        index holds fewer than 1e6 pairs or more than half of it is
        requested, ``random_pairs_without_replacement_low_memory`` in the
        remaining case.

        Returns a two-level ``pandas.MultiIndex`` over the index values of
        ``df_a`` and ``df_b``.

        Raises ``ValueError`` if ``self.n`` is not an int, if sampling with
        replacement from an empty full index, or if sampling without
        replacement and ``self.n`` does not satisfy ``0 < n <= n_max``.
        """
        shape = (len(df_a), len(df_b))
        n_max = full_index_size(shape)

        if not isinstance(self.n, int):
            raise ValueError('n must be an integer')

        if self.replace:
            # Sampling with replacement only needs a non-empty pair space.
            if n_max == 0:
                raise ValueError("one of the dataframes is empty")

            pairs = random_pairs_with_replacement(self.n, shape,
                                                  self.random_state)
        else:
            # Sampling without replacement: n must fit in the pair space.
            if self.n <= 0 or self.n > n_max:
                raise ValueError("n must be a integer satisfying 0<n<=%s" %
                                 n_max)

            # Share of all candidate pairs that is being requested.
            frac = self.n / n_max

            # Small pair space or dense sample -> direct sampler;
            # large pair space with sparse sample -> low-memory sampler.
            sampler = (random_pairs_without_replacement
                       if n_max < 1e6 or frac > 0.5
                       else random_pairs_without_replacement_low_memory)
            pairs = sampler(self.n, shape, self.random_state)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=pairs,
            verify_integrity=False)
Esempio n. 11
0
def random_pairs_without_replacement_low_memory(n, shape, random_state=None):
    """Make a sample of random pairs without replacement.

    Sample random record pairs without replacement bounded by the
    maximum number of record pairs (based on shape). Flat positions are
    drawn in batches, duplicates are discarded, and drawing repeats until
    ``n`` distinct positions are available. This algorithm consumes low
    memory and is fast for relatively small samples.

    Parameters
    ----------
    n : int
        Number of pairs; must satisfy ``0 < n <= full_index_size(shape)``.
    shape : sequence
        One entry selects the ``_map_tril_1d_on_2d`` mapping, otherwise
        ``np.unravel_index`` is used to obtain 2d positions.
    random_state : np.random.RandomState or seed, optional
        Used as-is when it is a ``RandomState``; anything else is passed
        to the ``RandomState`` constructor.

    Returns
    -------
    numpy.ndarray
        The 2d positions of the ``n`` sampled pairs, in draw order.

    Raises
    ------
    ValueError
        If ``n`` is not an int in the allowed range.
    """

    n_max = full_index_size(shape)

    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    sample = np.array([], dtype=np.int64)

    # Run as long as the number of pairs is less than the requested number
    # of pairs n.
    while len(sample) < n:

        # The number of pairs to sample (sample twice as much record pairs
        # because the duplicates are dropped).
        n_sample_size = (n - len(sample)) * 2
        sample_sub = random_state.randint(n_max, size=n_sample_size)

        # Concatenate and deduplicate while preserving draw order.
        # np.unique returns sorted values; truncating a sorted array to
        # its first n entries would keep only the smallest positions and
        # bias the sample towards low indices. Keeping first-occurrence
        # order makes the truncation below an unbiased "first n distinct
        # draws" selection.
        pairs_non_unique = np.append(sample, sample_sub)
        _, first_seen = np.unique(pairs_non_unique, return_index=True)
        sample = pairs_non_unique[np.sort(first_seen)]

    # return 2d indices
    if len(shape) == 1:
        return _map_tril_1d_on_2d(sample[0:n], shape[0])
    else:
        return np.array(np.unravel_index(sample[0:n], shape))
Esempio n. 12
0
def random_pairs_without_replacement(n, shape, random_state=None):
    """Sample ``n`` distinct random record pairs in a single draw.

    All ``n_max = full_index_size(shape)`` flat positions are materialised
    with ``np.arange`` and ``n`` of them are picked without replacement
    via ``RandomState.choice``. The flat positions are then converted to
    2d positions: through ``_map_tril_1d_on_2d`` when ``shape`` has one
    entry, through ``np.unravel_index`` otherwise.

    ``random_state`` is used as-is when it is a ``np.random.RandomState``;
    anything else is passed to the ``RandomState`` constructor.

    Raises ``ValueError`` when ``n`` is not an int satisfying
    ``0 < n <= n_max``.
    """
    n_max = full_index_size(shape)

    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    # Draw n distinct flat positions out of all candidate positions.
    all_positions = np.arange(n_max)
    picked = rng.choice(all_positions, n, replace=False)

    # Convert flat positions to 2d positions.
    if len(shape) == 1:
        return _map_tril_1d_on_2d(picked, shape[0])

    return np.array(np.unravel_index(picked, shape))