def maximum_number_of_pairs(self):
    """Return the maximum number of record pairs.

    When ``self.deduplication`` is truthy only ``self.df_a`` is handed to
    ``full_index_size``; otherwise both ``self.df_a`` and ``self.df_b``
    are passed.
    """
    frames = (self.df_a,) if self.deduplication else (self.df_a, self.df_b)
    return full_index_size(*frames)
def _dedup_index(self, df_a):
    """Draw ``self.n`` random record pairs from within a single frame.

    Parameters
    ----------
    df_a : object with ``__len__`` and an ``index`` attribute
        The frame whose records are paired with each other.

    Returns
    -------
    pandas.MultiIndex
        Two-level index; both levels are ``df_a.index.values`` and the
        codes are the sampled pair positions.

    Raises
    ------
    ValueError
        When sampling without replacement and ``self.n`` is not an int
        in the range ``0 < n <= n_max``.
    """
    shape = (len(df_a),)

    if self.replace:
        # With replacement: the sampler receives n, shape and random_state
        # unchanged; n is not validated here.
        pairs = random_pairs_with_replacement(
            self.n, shape, self.random_state)
    else:
        # Without replacement: n may not exceed the number of candidates.
        n_max = full_index_size(shape)

        if not isinstance(self.n, int) or self.n <= 0 or self.n > n_max:
            raise ValueError(
                "n must be a integer satisfying 0<n<=%s" % n_max)

        if n_max < 1e6:
            # Fewer than 1e6 candidate pairs: small-frames sampler.
            pairs = random_pairs_without_replacement_small_frames(
                self.n, shape, self.random_state)
        else:
            # 1e6 or more candidate pairs: large-frames sampler.
            pairs = random_pairs_without_replacement_large_frames(
                self.n, shape, self.random_state)

    levels = [df_a.index.values, df_a.index.values]

    # ``codes`` is the current name of the keyword formerly called
    # ``labels`` in the pandas MultiIndex constructor.
    return pandas.MultiIndex(
        levels=levels,
        codes=pairs,
        verify_integrity=False
    )
def _random_small_link(df_a, df_b, n):
    """Sample ``n`` distinct record pairs between two frames.

    Pairs are drawn with the global ``numpy.random`` generator in batches,
    duplicates are dropped, and drawing repeats until at least ``n``
    distinct pairs have been collected.

    Parameters
    ----------
    df_a, df_b : objects with ``__len__`` and an ``index`` attribute
        The two frames to link.
    n : int
        Number of pairs to return; must satisfy ``0 < n <= n_max`` where
        ``n_max`` is ``full_index_size(df_a, df_b)``.

    Returns
    -------
    pandas.MultiIndex
        The first ``n`` distinct pairs, with level names taken from the
        two frame indexes.

    Raises
    ------
    ValueError
        If ``n`` is not an int or lies outside ``0 < n <= n_max``.
    """
    n_max = full_index_size(df_a, df_b)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    levels = [df_a.index.values, df_b.index.values]
    names = [df_a.index.name, df_b.index.name]

    # Start from an empty MultiIndex. ``codes`` is the current name of the
    # keyword formerly called ``labels``.
    pairs = pandas.MultiIndex(levels=levels, codes=[[], []], names=names)

    # Keep drawing until enough distinct pairs have been collected.
    while len(pairs) < n:

        # Draw twice the number still missing, because duplicates are
        # dropped afterwards.
        n_sample = (n - len(pairs)) * 2
        sample_a = numpy.random.randint(len(df_a), size=n_sample)
        sample_b = numpy.random.randint(len(df_b), size=n_sample)

        # Wrap the fresh draw in a MultiIndex and merge it into the result.
        pairs_sample = pandas.MultiIndex(
            levels=levels, codes=[sample_a, sample_b], names=names)

        pairs = pairs.append(pairs_sample).drop_duplicates()

    return pairs[0:n]
def _link_index(self, df_a, df_b):
    """Return every combination of a record in ``df_a`` with one in ``df_b``.

    A warning is logged when ``full_index_size((df_a, df_b))`` exceeds 1e7.

    Parameters
    ----------
    df_a, df_b : objects with an ``index`` attribute
        The two frames to link.

    Returns
    -------
    pandas.MultiIndex
        Cartesian product of the two index value arrays, with level names
        taken from the two frame indexes.
    """
    n_max = full_index_size((df_a, df_b))

    if n_max > 1e7:
        # logging.warning replaces the deprecated alias logging.warn.
        logging.warning(
            "The number of record pairs is large. Consider a different "
            "indexation algorithm for better performance. ")

    return pandas.MultiIndex.from_product(
        [df_a.index.values, df_b.index.values],
        names=[df_a.index.name, df_b.index.name])
def _random_large_link(df_a, df_b, n):
    """Pick ``n`` distinct record pairs out of the full link index.

    The complete index is built with ``_fullindex_link`` and ``n`` of its
    positions are chosen without replacement using the global
    ``numpy.random`` generator.

    Raises ``ValueError`` if ``n`` is not an int or lies outside
    ``0 < n <= full_index_size(df_a, df_b)``.
    """
    upper = full_index_size(df_a, df_b)
    if not isinstance(n, int) or n <= 0 or n > upper:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % upper)

    candidates = _fullindex_link(df_a, df_b)

    # Choose positions first, then select those entries from the index.
    positions = numpy.arange(len(candidates))
    chosen = numpy.random.choice(positions, n, replace=False)
    return candidates[chosen]
def _dedup_index(self, df_a):
    """Return all record pairs within ``df_a``, each unordered pair once.

    The codes come from ``numpy.triu_indices(..., k=1)``, i.e. the strict
    upper triangle, so a record is never paired with itself and every
    pair has its first position smaller than its second. A warning is
    logged when ``full_index_size(df_a)`` exceeds 1e7.

    Parameters
    ----------
    df_a : object with an ``index`` attribute
        The frame whose records are paired with each other.

    Returns
    -------
    pandas.MultiIndex
        Two-level index whose levels are both ``df_a.index.values``.
    """
    n_max = full_index_size(df_a)

    if n_max > 1e7:
        # logging.warning replaces the deprecated alias logging.warn.
        logging.warning(
            "The number of record pairs is large. Consider a different "
            "indexation algorithm for better performance. ")

    levels = [df_a.index.values, df_a.index.values]
    codes = numpy.triu_indices(len(df_a.index), k=1)

    # ``codes`` is the current name of the keyword formerly called
    # ``labels`` in the pandas MultiIndex constructor.
    return pandas.MultiIndex(
        levels=levels, codes=codes, verify_integrity=False)
def _random_large_dedup(df_a, n, random_state=None):
    """Pick ``n`` distinct record pairs out of the full dedup index.

    The complete index is built with ``_fullindex_dedup`` and ``n`` of its
    positions are chosen without replacement.

    Parameters
    ----------
    df_a : object accepted by ``full_index_size`` and ``_fullindex_dedup``
        The frame whose records are paired with each other.
    n : int
        Number of pairs to return; must satisfy ``0 < n <= n_max`` where
        ``n_max`` is ``full_index_size(df_a)``.
    random_state : None, seed or numpy.random.RandomState, optional
        A ``RandomState`` instance is used as is; any other value is
        passed to ``numpy.random.RandomState`` as the seed.

    Returns
    -------
    The entries of the full index at the sampled positions.

    Raises
    ------
    ValueError
        If ``n`` is not an int or lies outside ``0 < n <= n_max``.
    """
    # Use a private generator rather than numpy.random.seed(), so that a
    # call to this function does not reseed the process-wide generator.
    if not isinstance(random_state, numpy.random.RandomState):
        random_state = numpy.random.RandomState(random_state)

    n_max = full_index_size(df_a)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    full_index = _fullindex_dedup(df_a)

    sample = random_state.choice(
        numpy.arange(len(full_index)), n, replace=False)

    return full_index[sample]
def random_pairs_with_replacement(n, shape, random_state=None):
    """Draw ``n`` random record pairs, allowing repeats.

    ``n`` flat indices are drawn from ``[0, n_max)`` where ``n_max`` is
    ``full_index_size(shape)``. For a one-element ``shape`` they are
    converted with ``_map_tril_1d_on_2d``; otherwise they are unravelled
    against ``shape`` and returned as an array.

    ``random_state`` may be a ``numpy.random.RandomState`` instance; any
    other value is used as the seed of a new one. Raises ``ValueError``
    when ``n_max`` is not positive.
    """
    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    n_max = full_index_size(shape)
    if n_max <= 0:
        raise ValueError('n_max must be larger than 0')

    # flat indices of the sampled pairs
    flat = rng.randint(0, n_max, n)

    if len(shape) == 1:
        return _map_tril_1d_on_2d(flat, shape[0])
    return np.array(np.unravel_index(flat, shape))
def _link_index(self, df_a, df_b):
    """Draw ``self.n`` random record pairs between two frames.

    Parameters
    ----------
    df_a, df_b : objects with ``__len__`` and an ``index`` attribute
        The two frames to link.

    Returns
    -------
    pandas.MultiIndex
        Two-level index over ``df_a.index.values`` and
        ``df_b.index.values``, named after the two frame indexes, with
        the sampled pair positions as codes.

    Raises
    ------
    ValueError
        If ``self.n`` is not an int; if sampling with replacement and
        there are no candidate pairs; or if sampling without replacement
        and ``self.n`` lies outside ``0 < n <= n_max``.
    """
    shape = (len(df_a), len(df_b))
    n_max = full_index_size(shape)

    if not isinstance(self.n, int):
        raise ValueError('n must be an integer')

    if self.replace:
        # With replacement: only an empty candidate space is rejected.
        if n_max == 0:
            raise ValueError(
                "one of the dataframes is empty")

        pairs = random_pairs_with_replacement(
            self.n, shape, self.random_state)
    else:
        # Without replacement: n may not exceed the number of candidates.
        if self.n <= 0 or self.n > n_max:
            raise ValueError(
                "n must be a integer satisfying 0<n<=%s" % n_max)

        if n_max < 1e6:
            # Fewer than 1e6 candidate pairs: small-frames sampler.
            pairs = random_pairs_without_replacement_small_frames(
                self.n, shape, self.random_state)
        else:
            # 1e6 or more candidate pairs: large-frames sampler.
            pairs = random_pairs_without_replacement_large_frames(
                self.n, shape, self.random_state)

    levels = [df_a.index.values, df_b.index.values]
    names = [df_a.index.name, df_b.index.name]

    # ``codes`` is the current name of the keyword formerly called
    # ``labels`` in the pandas MultiIndex constructor.
    return pandas.MultiIndex(
        levels=levels,
        codes=pairs,
        names=names,
        verify_integrity=False
    )
def _link_index(self, df_a, df_b):
    """Draw ``self.n`` random record pairs between two frames.

    With ``self.replace`` set, pairs come from
    ``random_pairs_with_replacement``. Otherwise the sampler depends on
    the size of the candidate space and on the requested fraction of it:
    ``random_pairs_without_replacement`` when there are fewer than 1e6
    candidates or more than half of them are requested, and
    ``random_pairs_without_replacement_low_memory`` in all other cases.

    Returns a two-level ``pandas.MultiIndex`` over the index values of
    ``df_a`` and ``df_b``. Raises ``ValueError`` for a non-int ``self.n``,
    for an empty candidate space when sampling with replacement, and for
    ``self.n`` outside ``0 < n <= n_max`` when sampling without.
    """
    shape = (len(df_a), len(df_b))
    n_max = full_index_size(shape)

    if not isinstance(self.n, int):
        raise ValueError('n must be an integer')

    if self.replace:
        # sampling with replacement
        if n_max == 0:
            raise ValueError("one of the dataframes is empty")

        codes = random_pairs_with_replacement(
            self.n, shape, self.random_state)
    else:
        # sampling without replacement
        if self.n <= 0 or self.n > n_max:
            raise ValueError(
                "n must be a integer satisfying 0<n<=%s" % n_max)

        # share of all candidate pairs that ends up in the sample
        frac = self.n / n_max

        if n_max < 1e6 or frac > 0.5:
            # small candidate space, or a dense sample
            sampler = random_pairs_without_replacement
        else:
            # large candidate space with a sparse sample
            sampler = random_pairs_without_replacement_low_memory

        codes = sampler(self.n, shape, self.random_state)

    levels = [df_a.index.values, df_b.index.values]

    return pandas.MultiIndex(
        levels=levels, codes=codes, verify_integrity=False)
def random_pairs_without_replacement_low_memory(n, shape, random_state=None):
    """Make a sample of random pairs without replacement.

    Flat pair indices are drawn from ``[0, n_max)`` in batches, where
    ``n_max`` is ``full_index_size(shape)``. Duplicates are dropped and
    drawing repeats until at least ``n`` distinct indices are available;
    the first ``n`` of them, in the order they were drawn, are returned.

    Parameters
    ----------
    n : int
        Number of pairs to sample; must satisfy ``0 < n <= n_max``.
    shape : sequence of int
        Sizes of the frames. A one-element shape is mapped with
        ``_map_tril_1d_on_2d``; otherwise the flat indices are unravelled
        against ``shape``.
    random_state : None, seed or numpy.random.RandomState, optional
        A ``RandomState`` instance is used as is; any other value is
        passed to ``numpy.random.RandomState`` as the seed.

    Returns
    -------
    The 2d indices of the sampled pairs.

    Raises
    ------
    ValueError
        If ``n`` is not an int or lies outside ``0 < n <= n_max``.
    """
    n_max = full_index_size(shape)

    if not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    sample = np.array([], dtype=np.int64)

    # Keep drawing until enough distinct indices have been collected.
    while len(sample) < n:

        # Draw twice the number still missing, because duplicates are
        # dropped afterwards.
        n_sample_size = (n - len(sample)) * 2
        sample_sub = random_state.randint(n_max, size=n_sample_size)

        # Concatenate and deduplicate. np.unique returns its values
        # sorted, so truncating that result to n would keep only the
        # smallest indices and bias the sample. Restore draw order via
        # the first-occurrence positions before truncating.
        pairs_non_unique = np.append(sample, sample_sub)
        _, first_seen = np.unique(pairs_non_unique, return_index=True)
        sample = pairs_non_unique[np.sort(first_seen)]

    sample = sample[0:n]

    # return 2d indices
    if len(shape) == 1:
        return _map_tril_1d_on_2d(sample, shape[0])
    else:
        return np.array(np.unravel_index(sample, shape))
def random_pairs_without_replacement(n, shape, random_state=None):
    """Sample ``n`` distinct random record pairs in a single draw.

    All ``n_max`` flat pair indices (``n_max`` being
    ``full_index_size(shape)``) are materialised with ``np.arange`` and
    ``n`` of them are chosen without replacement. For a one-element
    ``shape`` the result is converted with ``_map_tril_1d_on_2d``;
    otherwise it is unravelled against ``shape``.

    ``random_state`` may be a ``numpy.random.RandomState`` instance; any
    other value is used as the seed of a new one. Raises ``ValueError``
    if ``n`` is not an int or lies outside ``0 < n <= n_max``.
    """
    n_max = full_index_size(shape)

    rng = random_state
    if not isinstance(rng, np.random.RandomState):
        rng = np.random.RandomState(rng)

    if not isinstance(n, int) or n <= 0 or n > n_max:
        raise ValueError("n must be a integer satisfying 0<n<=%s" % n_max)

    # one draw over every candidate index, no repeats
    population = np.arange(n_max)
    chosen = rng.choice(population, n, replace=False)

    # convert flat indices to 2d indices
    if len(shape) == 1:
        return _map_tril_1d_on_2d(chosen, shape[0])
    return np.array(np.unravel_index(chosen, shape))