def maximum_number_of_pairs(self):
    """Return the maximum number of record pairs.

    Delegates to ``max_number_of_pairs``: when ``self.deduplication`` is
    truthy only ``self.df_a`` is passed, otherwise both ``self.df_a`` and
    ``self.df_b`` are passed.
    """
    # Linking case first: two frames are involved.
    if not self.deduplication:
        return max_number_of_pairs(self.df_a, self.df_b)

    # Deduplication case: a single frame is involved.
    return max_number_of_pairs(self.df_a)
def _random_small_link(df_a, df_b, n):
    """Draw ``n`` unique random record pairs between ``df_a`` and ``df_b``.

    Random positions are drawn in both frames, combined into candidate
    pairs, and de-duplicated; this is repeated until at least ``n`` unique
    pairs have been collected.

    Parameters
    ----------
    df_a, df_b
        Objects exposing a pandas ``index`` and supporting ``len()``
        (presumably DataFrames -- confirm against callers). Their index
        values and index names become the levels and names of the result.
    n : int
        Number of pairs to return. Must satisfy ``0 < n <= n_max``, where
        ``n_max`` is ``max_number_of_pairs(df_a, df_b)``.

    Returns
    -------
    pandas.MultiIndex
        The first ``n`` unique pairs that were sampled.

    Raises
    ------
    ValueError
        If ``n`` is not an ``int`` or lies outside ``(0, n_max]``.
    """
    n_max = max_number_of_pairs(df_a, df_b)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    levels = [df_a.index.values, df_b.index.values]
    names = [df_a.index.name, df_b.index.name]

    # The integer codes are passed positionally on purpose: the keyword for
    # this second argument was renamed from ``labels`` to ``codes`` in
    # pandas 0.24 and ``labels`` was removed in pandas 1.0. The positional
    # form works on both old and new pandas versions.
    pairs = pandas.MultiIndex(levels, [[], []], names=names)

    # Keep sampling until enough unique pairs have been collected.
    while len(pairs) < n:
        # Sample twice as many pairs as are still missing, because
        # duplicates are dropped afterwards.
        n_sample = (n - len(pairs)) * 2
        sample_a = numpy.random.randint(len(df_a), size=n_sample)
        sample_b = numpy.random.randint(len(df_b), size=n_sample)

        # Turn the sampled positions into a MultiIndex of candidate pairs.
        pairs_sample = pandas.MultiIndex(
            levels, [sample_a, sample_b], names=names
        )

        pairs = pairs.append(pairs_sample).drop_duplicates()

    # The loop may overshoot; return exactly n pairs.
    return pairs[0:n]
def _random_large_link(df_a, df_b, n):
    """Pick ``n`` random entries from the full index of ``df_a`` x ``df_b``.

    The full index is built with ``_fullindex_link`` and ``n`` positions
    into it are drawn without replacement.

    Parameters
    ----------
    df_a, df_b
        Inputs forwarded to ``max_number_of_pairs`` and ``_fullindex_link``.
    n : int
        Number of entries to draw. Must satisfy ``0 < n <= n_max``, where
        ``n_max`` is ``max_number_of_pairs(df_a, df_b)``.

    Returns
    -------
    The full index restricted to the ``n`` sampled positions.

    Raises
    ------
    ValueError
        If ``n`` is not an ``int`` or lies outside ``(0, n_max]``.
    """
    n_max = max_number_of_pairs(df_a, df_b)

    n_is_valid = isinstance(n, int) and 0 < n <= n_max
    if not n_is_valid:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    candidates = _fullindex_link(df_a, df_b)

    # Draw n distinct positions into the full index.
    positions = numpy.arange(len(candidates))
    chosen = numpy.random.choice(positions, n, replace=False)

    return candidates[chosen]