def choice(self, size=None, indexes=None):
    """
    Return a random sample based on this attribute's probability.

    If ``indexes`` is given, ``size`` is ignored.

    Parameters
    ----------
    size : int, optional
        Size of the random sample. Defaults to ``len(self)``. Only used
        when ``indexes`` is None.
    indexes : array-like, optional
        Indexes of the bins to sample from. When omitted, indexes are
        drawn from ``range(len(self.prs))`` using ``self.prs`` as the
        probabilities.

    Returns
    -------
    Series
        One value per index, produced by ``self._sampling_bins`` and then
        post-processed according to ``self.atype``.
    """
    if indexes is None:
        if size is None:
            size = len(self)
        indexes = np.random.choice(len(self.prs), size=size, p=self.prs)
    if not isinstance(indexes, Series):
        # ``indexes`` is documented as array-like, but ``.map`` below only
        # exists on a Series, so lists and ndarrays are coerced first.
        indexes = Series(indexes)

    column = indexes.map(self._sampling_bins)

    if self.atype == 'datetime':
        # Categorical datetime values are returned as sampled.
        if not self.categorical:
            column = column.map(self._date_formatter)
    elif self.atype == 'integer':
        column = column.round().astype(int)
    elif self.atype == 'string':
        # For non-categorical strings the sampled number is used as the
        # length argument of a freshly generated random string.
        if not self.categorical:
            column = column.map(lambda x: utils.randomize_string(int(x)))
    return column
def random(self, size=None):
    """
    Return a random Series of ``size`` values spread between the
    attribute's minimum and maximum (usually used for non-categorical
    attributes).

    Parameters
    ----------
    size : int, optional
        Number of values to generate. Defaults to ``len(self)``.

    Returns
    -------
    Series
        Exactly ``size`` values. For ``'string'`` attributes these are
        random strings that all share one length; for ``'integer'`` the
        numbers are truncated with ``int``; for ``'datetime'`` they are
        passed through ``self._date_formatter``.
    """
    if size is None:
        size = len(self)

    if self._min == self._max:
        rands = np.ones(size) * self._min
    else:
        # linspace with endpoint=False yields exactly ``size`` evenly
        # spaced values in [min, max). np.arange with a computed float step
        # can produce one element too many because of rounding, and it
        # divides by zero when size == 0.
        rands = np.linspace(self._min, self._max, size, endpoint=False)
        np.random.shuffle(rands)

    if self.atype == 'string':
        if self._min == self._max:
            length = self._min
        else:
            # One length is drawn for the whole sample; randint excludes
            # the upper bound.
            length = np.random.randint(self._min, self._max)
        # The numeric values are irrelevant for strings, only their count
        # matters. A plain comprehension also works for an empty sample,
        # which np.vectorize rejects.
        rands = [utils.randomize_string(length) for _ in range(size)]
    elif self.atype == 'integer':
        rands = [int(value) for value in rands]
    elif self.atype == 'datetime':
        rands = [self._date_formatter(value) for value in rands]
    return Series(rands)
def choice(self, size=None, indexes=None):
    """
    Return a random sample based on this attribute's probability and
    distribution bins.

    If ``indexes`` is given, ``size`` is ignored.

    Parameters
    ----------
    size : int, optional
        Size of the random sample. Defaults to ``self.size``. An explicit
        ``0`` yields an empty sample. Only used when ``indexes`` is None.
    indexes : array-like, optional
        Indexes into the distribution bins. When omitted, indexes are
        drawn from ``range(len(self.prs))`` using ``self.prs`` as the
        probabilities.

    Returns
    -------
    Series
        One value per index, produced by ``self._random_sample_at`` and
        then post-processed according to ``self.type``.
    """
    if indexes is None:
        # Compare against None explicitly: ``size or self.size`` would
        # silently turn a requested size of 0 into a full-size sample.
        if size is None:
            size = self.size
        indexes = np.random.choice(len(self.prs), size=size, p=self.prs)
    if not isinstance(indexes, Series):
        # ``indexes`` is documented as array-like, but ``.map`` below only
        # exists on a Series, so lists and ndarrays are coerced first.
        indexes = Series(indexes)

    column = indexes.map(self._random_sample_at)

    if self.type == 'datetime':
        # Categorical datetime values are returned as sampled.
        if not self.categorical:
            column = column.map(self._date_formatter)
    elif self.type == 'float':
        column = column.round(self._decimals)
    elif self.type == 'integer':
        column = column.round().astype(int)
    elif self.type == 'string':
        # For non-categorical strings the sampled number is used as the
        # length argument of a freshly generated random string.
        if not self.categorical:
            column = column.map(lambda x: utils.randomize_string(int(x)))
    return column
def random(self, size=None):
    """
    Return a random Series of ``size`` values spread between the
    attribute's minimum and maximum (usually used for non-categorical
    attributes).

    Parameters
    ----------
    size : int, optional
        Number of values to generate. Defaults to ``self.size``. An
        explicit ``0`` yields an empty Series.

    Returns
    -------
    Series
        Exactly ``size`` values. For ``'string'`` attributes these are
        random strings that all share one length; for ``'integer'`` the
        numbers are truncated with ``int``; for ``'datetime'`` they are
        passed through ``self._date_formatter``.
    """
    # Compare against None explicitly: ``size or self.size`` would silently
    # turn a requested size of 0 into a full-size sample.
    if size is None:
        size = self.size

    if self.min_ == self.max_:
        rands = np.ones(size) * self.min_
    else:
        # linspace with endpoint=False yields exactly ``size`` evenly
        # spaced values in [min, max). np.arange with a computed float step
        # can produce one element too many because of rounding, and it
        # divides by zero when size == 0.
        rands = np.linspace(self.min_, self.max_, size, endpoint=False)
        np.random.shuffle(rands)

    if self.type == 'string':
        if self.min_ == self.max_:
            length = self.min_
        else:
            # One length is drawn for the whole sample; randint excludes
            # the upper bound.
            length = np.random.randint(self.min_, self.max_)
        # The numeric values are irrelevant for strings, only their count
        # matters. A plain comprehension also works for an empty sample,
        # which np.vectorize rejects.
        rands = [utils.randomize_string(length) for _ in range(size)]
    elif self.type == 'integer':
        rands = [int(value) for value in rands]
    elif self.type == 'datetime':
        rands = [self._date_formatter(value) for value in rands]
    return Series(rands)
def test_set_domain_for_string_attribute():
    """Assigning a four-value domain to a categorical string attribute
    leaves it with four more bins than before."""
    samples = [randomize_string(5) for _ in range(size)]
    column = Series(samples, name='String')
    attr = Attribute(column, categorical=True)
    bins = attr.bins
    attr.domain = ['a', 'b', 'China', 'USA']
    expected = len(bins) + 4
    assert expected == len(attr.bins)
def test_string_attribute():
    """A categorical attribute built from 5-character strings reports the
    'string' type, a minimum of 5 and the categorical flag."""
    samples = [randomize_string(5) for _ in range(size)]
    column = Series(samples, name='String')
    attr = Attribute(column, categorical=True)
    assert attr.atype == 'string'
    assert attr._min == 5
    assert attr.categorical
def test_random_strings():
    """``random()`` on a string attribute returns as many values as the
    source column holds."""
    samples = [randomize_string(5) for _ in range(size)]
    column = Series(samples, name='String')
    attr = Attribute(column)
    randoms = attr.random()
    assert len(randoms) == size
def test_choice_strings():
    """``choice()`` on a string attribute returns as many values as the
    source column holds."""
    samples = [randomize_string(5) for _ in range(size)]
    column = Series(samples, name='String')
    attr = Attribute(column)
    choices = attr.choice()
    assert len(choices) == size