def test_compute_class_weights(self):
    """Regression test for issue #181: the 'log' weighting scheme must
    assign weight 1.0 to the majority class."""
    np.random.seed(0)
    # Skewed three-class sample; class 1 is the majority (~60%).
    labels = np.random.choice(a=[0, 1, 2], size=1000, p=[0.3, 0.6, 0.1])
    counts = Counter(labels)
    computed = compute_class_weights('log', class_counts=counts)
    self.assertEqual(computed[1], 1.0)
def _dataset_with_targets(self, Xs, Y, train, context=None):
    """Build a tf Dataset from inputs and targets, encoding one sample at a time.

    Xs and Y must either both be plain iterables or both be zero-arg
    callables producing iterables; a mixture raises ValueError.  When
    `self.config.use_auxiliary_info` is set, `context` is zipped in
    alongside Xs (it arrives in the same form as Xs, so it needs no
    separate callable check).  In the non-callable training case this
    also records `dataset_size` and resolves `class_weights` on the
    config.
    """
    xs_callable = callable(Xs)
    y_callable = callable(Y)
    if xs_callable != y_callable:
        raise ValueError(
            "Either neither or both of Xs and Y should be callable, not a mixture"
        )
    use_aux = self.config.use_auxiliary_info

    if xs_callable:
        def raw_examples():
            if use_aux:
                return zip(Xs(), Y(), context())
            return zip(Xs(), Y())
    else:
        def raw_examples():
            if use_aux:
                return zip(Xs, Y, context)
            return zip(Xs, Y)

    def encoded_examples():
        # Each raw example expands into zero or more encoded chunks.
        for example in raw_examples():
            yield from self.text_to_tokens_mask(*example)

    if train and not y_callable:
        # Materialize once up front so we can size the dataset and count
        # classes; empty encodings are dropped before counting.
        materialized = self._filter_empty_examples(list(encoded_examples()))
        class_counts = self._compute_class_counts(materialized)
        self.config.dataset_size = len(materialized)
        if self.config.class_weights is not None:
            self.config.class_weights = compute_class_weights(
                class_weights=self.config.class_weights,
                class_counts=class_counts)

    shape_def = self.feed_shape_type_def()
    return Dataset.from_generator(
        lambda: self.wrap_tqdm(encoded_examples(), train), *shape_def)
def _dataset_with_targets(self, Xs, Y, train):
    """Build a tf Dataset from inputs and targets, encoding one sample at a time.

    Xs and Y must either both be plain iterables or both be zero-arg
    callables producing iterables; a mixture raises ValueError.  In the
    non-callable training case this also records `dataset_size` and
    resolves `class_weights` on the config.
    """
    xs_callable = callable(Xs)
    y_callable = callable(Y)
    if xs_callable != y_callable:
        raise ValueError(
            "Either neither or both of Xs and Y should be callable, not a mixture"
        )

    if xs_callable:
        def raw_examples():
            return zip(Xs(), Y())
    else:
        def raw_examples():
            return zip(Xs, Y)

    def encoded_examples():
        # Each raw example expands into zero or more encoded chunks.
        for example in raw_examples():
            yield from self.text_to_tokens_mask(*example)

    shape_def = self.feed_shape_type_def()

    if train and not y_callable:
        # Materialize once up front so we can size the dataset and count
        # classes for weighting.
        materialized = list(encoded_examples())
        class_counts = self._compute_class_counts(materialized)
        self.config.dataset_size = len(materialized)
        if self.config.class_weights is not None:
            self.config.class_weights = compute_class_weights(
                class_weights=self.config.class_weights,
                class_counts=class_counts)

    return Dataset.from_generator(
        lambda: self.wrap_tqdm(encoded_examples(), train), *shape_def)
def _compute_class_weights(self, class_weights, class_counts):
    """Resolve the class-weight spec against observed counts, in
    multilabel mode, normalizing by the configured dataset size."""
    return compute_class_weights(
        class_weights=class_weights,
        class_counts=class_counts,
        n_total=self.config.dataset_size,
        multilabel=True,
    )
def _post_data_initialization(self, Y):
    """Fit the target encoder on Y and derive target-dependent config.

    If Y is a zero-arg callable, at most the first 10000 generated
    samples are used for fitting.  Sets `pad_idx`, `target_dim`,
    `lm_loss_coef` (falls back to 1.0 when there is no target dim),
    and resolves `class_weights` from the fitted sample.
    """
    self.label_encoder = self._target_encoder()

    if callable(Y):
        # Cap the sample drawn from the generator so fitting stays bounded.
        Y_fit = list(itertools.islice(Y(), 10000))
        self.label_encoder.fit(Y_fit)
    else:
        Y_fit = Y
        self.label_encoder.fit(Y)

    self.config.pad_idx = self.pad_idx

    target_dim = self.label_encoder.target_dim
    # No target dimension means no supervised head; weight the LM loss fully.
    self.lm_loss_coef = (
        self.config.lm_loss_coef if target_dim is not None else 1.0
    )
    self.target_dim = target_dim

    if Y_fit is not None:
        self.config.class_weights = compute_class_weights(
            class_weights=self.config.class_weights, Y=Y_fit)
def _compute_class_weights(self, class_weights, class_counts):
    """Resolve the class-weight spec against the observed class counts."""
    resolved = compute_class_weights(
        class_weights=class_weights,
        class_counts=class_counts,
    )
    return resolved