def test_combined_iterator_reporting(self, mnist_factory):
    iterator_train, iterator_train_meta = mnist_factory.get_dataset_iterator(
        split="train")
    iterator_test, iterator_test_meta = mnist_factory.get_dataset_iterator(
        split="test")
    meta_train = MetaFactory.get_dataset_meta(identifier="id x",
                                              dataset_name="MNIST",
                                              dataset_tag="train",
                                              iterator_meta=iterator_train_meta)
    meta_test = MetaFactory.get_dataset_meta(identifier="id x",
                                             dataset_name="MNIST",
                                             dataset_tag="test",
                                             iterator_meta=iterator_test_meta)
    informed_iterator_train = InformedDatasetFactory.get_dataset_iterator(
        iterator_train, meta_train)
    informed_iterator_test = InformedDatasetFactory.get_dataset_iterator(
        iterator_test, meta_test)
    meta_combined = MetaFactory.get_dataset_meta_from_existing(
        informed_iterator_train.dataset_meta, dataset_tag="full")
    iterator = InformedDatasetFactory.get_combined_dataset_iterator(
        [informed_iterator_train, informed_iterator_test], meta_combined)
    report = DatasetIteratorReportGenerator.generate_report(iterator)
    assert report.length == 70000
    assert report.sub_reports[0].length == 60000
    assert report.sub_reports[1].length == 10000
    assert not report.sub_reports[0].sub_reports
    assert not report.sub_reports[1].sub_reports
def dataset_meta(self) -> DatasetMeta:
    iterator_meta = MetaFactory.get_iterator_meta(sample_pos=0,
                                                  target_pos=1,
                                                  tag_pos=2)
    return MetaFactory.get_dataset_meta(identifier="identifier_1",
                                        dataset_name="TEST DATASET",
                                        dataset_tag="train",
                                        iterator_meta=iterator_meta)
def iterator(self) -> InformedDatasetIteratorIF:
    # 100 samples of class 1, 200 of class 2 and 300 of class 3 (600 in total)
    targets = [1] * 100 + [2] * 200 + [3] * 300
    sequence_targets = torch.Tensor(targets)
    sequence_samples = torch.ones_like(sequence_targets)
    iterator = SequenceDatasetIterator([sequence_samples, sequence_targets])
    # target and tag share position 1, i.e. the target doubles as the tag
    iterator_meta = MetaFactory.get_iterator_meta(sample_pos=0,
                                                  target_pos=1,
                                                  tag_pos=1)
    meta = MetaFactory.get_dataset_meta(identifier="dataset id",
                                        dataset_name="dataset",
                                        dataset_tag="full",
                                        iterator_meta=iterator_meta)
    return InformedDatasetFactory.get_dataset_iterator(iterator, meta)
def get_shuffled_iterator(identifier: str, iterator: InformedDatasetIteratorIF,
                          seed: int) -> InformedDatasetIteratorIF:
    meta = MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                      identifier=identifier)
    return InformedDatasetFactory.get_shuffled_dataset_iterator(
        iterator, meta, seed)
def get_in_memory_iterator(
        identifier: str,
        iterator: InformedDatasetIteratorIF) -> InformedDatasetIteratorIF:
    meta = MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                      identifier=identifier)
    return InformedDatasetFactory.get_in_memory_dataset_iterator(
        iterator, meta)
def _get_iterator(self, split: str): dataset_identifier = self._get_resource_id(element="reuters.hdf5") dataset_resource = self.storage_connector.get_resource( identifier=dataset_identifier) meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2) return ReutersIterator(dataset_resource, split), meta
def _get_iterator(self): dataset_identifier = self._get_resource_id(element="news_groups.hdf5") dataset_resource = self.storage_connector.get_resource( identifier=dataset_identifier) meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2) return NewsGroupsIterator(dataset_resource), meta
def _get_iterator(self, split: str): dataset_identifier = self._get_resource_id(element=f"{split}.pd") dataset_resource = self.storage_connector.get_resource( identifier=dataset_identifier) meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2) return KDDIterator(dataset_resource), meta
def _get_iterator(self,
                  split: str,
                  length: float,
                  num_samples: List[int],
                  seed: int = 1,
                  translation: List[int] = None):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return XORSquaresIterator(seed, length, num_samples, translation), meta
def _get_iterator(self, split: str): """Supported splits: train, val, test """ dataset_identifier = self._get_resource_id(element="atis_dataset.hdf5") dataset_resource = self.storage_connector.get_resource( identifier=dataset_identifier) meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2) return AtisIterator(dataset_resource, split), meta
def _get_iterator(self,
                  noise_std: float,
                  interval: List[float],
                  num_samples: int,
                  seed: int = 1):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return NoisyXCubedIterator(seed, noise_std, interval, num_samples), meta
def get_iterator_view(
        identifier: str, iterator: InformedDatasetIteratorIF,
        selection_fun: Callable[[DatasetIteratorIF], List[int]],
        view_tags: Dict[str, Any]) -> InformedDatasetIteratorIF:
    valid_indices = selection_fun(iterator)
    meta = MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                      identifier=identifier)
    return InformedDatasetFactory.get_dataset_iterator_view(
        iterator, meta, valid_indices, view_tags)
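# Illustrative usage sketch (not part of the original module): a selection
# function receives the full iterator and returns the indices to keep. The
# helper name below and the "subsampling" tag are made up for this example.
def _select_every_second_index(iterator: DatasetIteratorIF) -> List[int]:
    # Deterministically thin out the dataset by keeping every second sample.
    return list(range(0, len(iterator), 2))

# view = get_iterator_view(identifier="train_subsampled",
#                          iterator=train_iterator,
#                          selection_fun=_select_every_second_index,
#                          view_tags={"subsampling": "every_2nd"})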
def get_filtered_labels_iterator(
        identifier: str, iterator: InformedDatasetIteratorIF,
        filtered_labels: List[Any]) -> InformedDatasetIteratorIF:
    valid_indices = [
        i for i in range(len(iterator))
        if iterator[i][iterator.dataset_meta.target_pos] in filtered_labels
    ]
    meta = MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                      identifier=identifier)
    return InformedDatasetFactory.get_dataset_iterator_view(
        iterator, meta, valid_indices)
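# Usage sketch (identifier and argument names are illustrative, not from the
# source): carve a binary sub-task out of an iterator with integer class
# labels by keeping only the samples labeled 0 or 1.
def _example_binary_view(
        train_iterator: InformedDatasetIteratorIF) -> InformedDatasetIteratorIF:
    return get_filtered_labels_iterator(identifier="binary_subtask",
                                        iterator=train_iterator,
                                        filtered_labels=[0, 1])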
def test_combined_iterator_reporting(self, informed_dataset_iterator):
    meta_combined = MetaFactory.get_dataset_meta_from_existing(
        informed_dataset_iterator.dataset_meta, dataset_tag="full")
    iterator = InformedDatasetFactory.get_combined_dataset_iterator(
        [informed_dataset_iterator, informed_dataset_iterator], meta_combined)
    report = DatasetIteratorReportGenerator.generate_report(iterator)
    assert report.length == 2180
    assert report.sub_reports[0].length == 1090
    assert report.sub_reports[1].length == 1090
    assert not report.sub_reports[0].sub_reports
    assert not report.sub_reports[1].sub_reports
def get_mapped_labels_iterator(identifier: str,
                               iterator: InformedDatasetIteratorIF,
                               mappings: Dict) -> InformedDatasetIteratorIF:
    label_mapper_post_processor = LabelMapperPostProcessor(
        mappings=mappings,
        target_position=iterator.dataset_meta.target_pos,
        tag_position=iterator.dataset_meta.tag_pos)
    meta = MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                      identifier=identifier)
    return InformedDatasetFactory.get_dataset_iterator(
        PostProcessedDatasetIterator(iterator, label_mapper_post_processor),
        meta)
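# Sketch of a possible `mappings` payload. The concrete schema is defined by
# LabelMapperPostProcessor; a plain old-label -> new-label dict is assumed
# here purely for illustration.
def _example_coarse_labels(
        iterator: InformedDatasetIteratorIF) -> InformedDatasetIteratorIF:
    # Collapse ten fine-grained classes into two coarse ones.
    mappings = {old_label: 0 if old_label < 5 else 1 for old_label in range(10)}
    return get_mapped_labels_iterator(identifier="coarse_labels",
                                      iterator=iterator,
                                      mappings=mappings)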
def _get_iterator(self,
                  split: str,
                  scale_factor: float,
                  noise_std: float,
                  num_samples: List[int],
                  seed: int = 1,
                  translation: List[int] = None):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return CirclesIterator(seed, noise_std, num_samples, scale_factor,
                           translation), meta
def _get_iterator(self,
                  split: str,
                  noise_std: float,
                  num_samples: List[int],
                  seed: int = 1,
                  translation: List[float] = None,
                  scaling: List[int] = None):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return HalfMoonIterator(seed, noise_std, num_samples, translation,
                            scaling), meta
def _get_iterator(self,
                  split: str,
                  num_samples: List[int],
                  classes: List[int],
                  hypercube: List[Tuple[int, int]],
                  seed: int = 1):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return UniformNoiseIterator(seed=seed,
                                num_samples=num_samples,
                                classes=classes,
                                hypercube=hypercube), meta
def get_one_hot_encoded_target_iterators(
        identifier: str, iterators: Dict[str, InformedDatasetIteratorIF],
        target_vector_size: int) -> Dict[str, DatasetIteratorIF]:
    # all splits share the same target position, so any iterator can supply it
    target_position = list(iterators.values())[0].dataset_meta.target_pos
    postprocessor = OneHotEncodedTargetPostProcessor(
        target_vector_size=target_vector_size, target_position=target_position)
    return {
        name: InformedDatasetFactory.get_dataset_iterator(
            PostProcessedDatasetIterator(iterator, postprocessor),
            MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                       identifier=identifier))
        for name, iterator in iterators.items()
    }
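# Usage sketch: one-hot encode integer targets across all splits at once. The
# `splits` argument name and the vector size 10 are illustrative assumptions.
def _example_one_hot(
        splits: Dict[str, InformedDatasetIteratorIF]
) -> Dict[str, DatasetIteratorIF]:
    # e.g. a target of 3 becomes [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
    return get_one_hot_encoded_target_iterators(identifier="one_hot_targets",
                                                iterators=splits,
                                                target_vector_size=10)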
def test_plain_iterator_reporting(self, mnist_factory):
    iterator, iterator_meta = mnist_factory.get_dataset_iterator(split="train")
    dataset_meta = MetaFactory.get_dataset_meta(identifier="id x",
                                                dataset_name="MNIST",
                                                dataset_tag="train",
                                                iterator_meta=iterator_meta)
    informed_iterator = InformedDatasetIterator(iterator, dataset_meta)
    report = DatasetIteratorReportGenerator.generate_report(informed_iterator)
    print(report)
    assert report.length == 60000 and not report.sub_reports
def _get_iterator(self):
    sample_identifier = self._get_resource_id(element="samples.pt")
    target_identifier = self._get_resource_id(element="targets.pt")
    sample_resource = self.storage_connector.get_resource(
        identifier=sample_identifier)
    target_resource = self.storage_connector.get_resource(
        identifier=target_identifier)
    text_sample_resource = StreamedTextResource.from_streamed_resouce(
        sample_resource)
    text_target_resource = StreamedTextResource.from_streamed_resouce(
        target_resource)
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return ArrhythmiaIterator(text_sample_resource, text_target_resource), meta
def _get_iterator(self,
                  split: str,
                  class_label: int,
                  radius: float,
                  start_degree: float,
                  end_degree: float,
                  num_samples: int,
                  seed: int = 1,
                  translation: List[int] = None,
                  noise_std: float = 0.0):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return CircularSegmentIterator(seed, class_label, radius, start_degree,
                                   end_degree, num_samples, noise_std,
                                   translation), meta
def get_feature_encoded_iterators(
        identifier: str, iterators: Dict[str, InformedDatasetIteratorIF],
        feature_encoding_configs: Dict[str, List[Any]]
) -> Dict[str, DatasetIteratorIF]:
    sample_position = list(iterators.values())[0].dataset_meta.sample_pos
    feature_encoder_post_processor = FeatureEncoderPostProcessor(
        sample_position=sample_position,
        feature_encoding_configs=feature_encoding_configs)
    # fit the encoders on all splits so train/val/test share one encoding
    feature_encoder_post_processor.fit(iterators)
    return {
        name: InformedDatasetFactory.get_dataset_iterator(
            PostProcessedDatasetIterator(iterator,
                                         feature_encoder_post_processor),
            MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                       identifier=identifier))
        for name, iterator in iterators.items()
    }
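# Sketch of a `feature_encoding_configs` payload. The concrete schema belongs
# to FeatureEncoderPostProcessor; the encoder names and column indices below
# are invented to illustrate one plausible Dict[str, List[Any]] shape.
def _example_feature_encoding(
        splits: Dict[str, InformedDatasetIteratorIF]
) -> Dict[str, DatasetIteratorIF]:
    feature_encoding_configs = {"min_max": [0, 1], "one_hot": [2, 3]}
    return get_feature_encoded_iterators(
        identifier="encoded_features",
        iterators=splits,
        feature_encoding_configs=feature_encoding_configs)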
def _split(iterator: InformedDatasetIteratorIF, identifier: str, seed: int,
           split_config: Dict,
           stratified: bool = False) -> Dict[str, InformedDatasetIteratorIF]:
    names = list(split_config.keys())
    ratios = list(split_config.values())
    if stratified:
        splitter = SplitterFactory.get_stratified_splitter(ratios=ratios,
                                                           seed=seed)
    else:
        splitter = SplitterFactory.get_random_splitter(ratios=ratios,
                                                       seed=seed)
    splitted_iterators = splitter.split(iterator)
    dataset_metas = [
        MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                   identifier=identifier,
                                                   dataset_tag=name)
        for name in names
    ]
    return {
        name: InformedDatasetFactory.get_dataset_iterator(
            splitted_iterators[i], dataset_metas[i])
        for i, name in enumerate(names)
    }
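# Usage sketch: `split_config` maps new split names to ratios (expected to sum
# to 1). The names, ratios and seed below are illustrative only.
def _example_split(
        iterator: InformedDatasetIteratorIF
) -> Dict[str, InformedDatasetIteratorIF]:
    return _split(iterator=iterator,
                  identifier="split_example",
                  seed=42,
                  split_config={"train": 0.8, "val": 0.1, "test": 0.1},
                  stratified=True)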
def get_combined_iterators(
        identifier: str, iterators: Dict[str,
                                         Dict[str,
                                              InformedDatasetIteratorIF]],
        combine_configs: List[Dict]) -> Dict[str, InformedDatasetIteratorIF]:
    """Combines iterators into new splits.

    Args:
        identifier (str): Identifier assigned to the metas of the combined
            iterators.
        iterators (Dict[str, Dict[str, InformedDatasetIteratorIF]]): Dictionary
            mapping from iterator_name -> split_name -> iterator.
        combine_configs (List[Dict]): One config per new split, each mapping
            "old_splits" (the iterators to merge) to a "new_split" name.

    Returns:
        Dict[str, InformedDatasetIteratorIF]: Dictionary mapping from new
            split name to combined iterator.
    """

    def get_iterators_to_be_combined(
            iterators: Dict[str, Dict[str, InformedDatasetIteratorIF]],
            split_config: List) -> List[InformedDatasetIteratorIF]:
        return [
            iterators[element["iterators_name"]][split_name]
            for element in split_config for split_name in element["splits"]
        ]

    combined_iterators = {}
    for split_config in combine_configs:
        iterator_list = get_iterators_to_be_combined(
            iterators, split_config["old_splits"])
        meta = MetaFactory.get_dataset_meta_from_existing(
            dataset_meta=iterator_list[0].dataset_meta,
            identifier=identifier,
            dataset_name="combined_dataset",
            dataset_tag=None)
        combined_iterators[split_config[
            "new_split"]] = InformedDatasetFactory.get_dataset_iterator(
                CombinedDatasetIterator(iterator_list), meta)
    return combined_iterators
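# Usage sketch grounded in the access pattern above: each config entry merges
# the listed old splits into one new split. The iterator name "mnist" and the
# split names are illustrative.
def _example_combine(
        iterators: Dict[str, Dict[str, InformedDatasetIteratorIF]]
) -> Dict[str, InformedDatasetIteratorIF]:
    combine_configs = [{
        "new_split": "full",
        "old_splits": [{
            "iterators_name": "mnist",
            "splits": ["train", "test"]
        }]
    }]
    return get_combined_iterators(identifier="combined_id",
                                  iterators=iterators,
                                  combine_configs=combine_configs)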
def _get_iterator(self, split: str, class_label: int, seed: int,
                  num_samples: int, covariance: np.ndarray,
                  mean: Tuple[int, int]):
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return GaussianIterator(seed, class_label, num_samples, covariance,
                            mean), meta
def _get_iterator(self, split: str, high_level_targets: bool = True):
    dataset_identifier = self._get_resource_id(element="trec_dataset.hdf5")
    dataset_resource = self.storage_connector.get_resource(
        identifier=dataset_identifier)
    meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
    return TrecIterator(dataset_resource, split, high_level_targets), meta