def __init__(self, hparams, device: Optional[torch.device] = None):
    """Builds a multi-aligned dataset from per-dataset hyperparameters.

    Args:
        hparams: Dataset hyperparameters; ``hparams["datasets"]`` is a list
            of per-dataset hparam dicts, each with a ``data_type`` of text,
            scalar, or record.
        device: Target device for produced batches; forwarded to each
            constituent database and the base class.

    Raises:
        ValueError: If a dataset has an unknown ``data_type``, or if two
            datasets share the same ``data_name``.
    """
    self._hparams = HParams(hparams, self.default_hparams())

    # Defaultize the hyperparameters of each constituent dataset using the
    # defaults appropriate for its declared data type.
    datasets_hparams = self._hparams.datasets
    defaultized_datasets_hparams = []
    for hparams_i in datasets_hparams:
        data_type = hparams_i.get("data_type", None)
        defaultized_ds_hpms = HParams(
            hparams_i, _default_dataset_hparams(data_type))
        defaultized_datasets_hparams.append(defaultized_ds_hpms)
    self._hparams.datasets = defaultized_datasets_hparams

    self._vocab = self.make_vocab(self._hparams.datasets)
    self._embedding = self.make_embedding(
        self._hparams.datasets, self._vocab)

    # Each constituent database gets an empty placeholder source; the real
    # sources are zipped together below and fed to the base class instead.
    dummy_source = SequenceDataSource[Any]([])
    name_prefix: List[str] = []
    self._names: List[Dict[str, Any]] = []
    sources: List[DataSource] = []
    filters: List[Optional[Callable[[str], bool]]] = []
    self._databases: List[DataBase] = []

    for idx, hparams_i in enumerate(self._hparams.datasets):
        data_type = _DataType(hparams_i.data_type)
        source_i: DataSource

        if _is_text_data(data_type):
            source_i = TextLineDataSource(
                hparams_i.files,
                compression_type=hparams_i.compression_type,
                delimiter=hparams_i.delimiter)
            sources.append(source_i)
            if ((hparams_i.length_filter_mode ==
                 _LengthFilterMode.DISCARD.value) and
                    hparams_i.max_seq_length is not None):
                # Factory binds max_seq_length at call time, avoiding the
                # late-binding-closure pitfall inside this loop.
                def _get_filter(max_seq_length):
                    return lambda x: len(x) <= max_seq_length

                filters.append(_get_filter(hparams_i.max_seq_length))
            else:
                filters.append(None)

            self._names.append({
                field: connect_name(hparams_i.data_name, field)
                for field in ["text", "text_ids", "length"]
            })
            dataset_hparams = dict_fetch(
                hparams_i, MonoTextData.default_hparams()["dataset"])
            dataset_hparams["data_name"] = None
            self._databases.append(
                MonoTextData(hparams={"dataset": dataset_hparams},
                             device=device,
                             vocab=self._vocab[idx],
                             embedding=self._embedding[idx],
                             data_source=dummy_source))
        elif _is_scalar_data(data_type):
            source_i = TextLineDataSource(
                hparams_i.files,
                compression_type=hparams_i.compression_type)
            sources.append(source_i)
            filters.append(None)
            self._names.append({"data": hparams_i.data_name})
            dataset_hparams = dict_fetch(
                hparams_i, ScalarData.default_hparams()["dataset"])
            dataset_hparams["data_name"] = "data"
            self._databases.append(
                ScalarData(hparams={"dataset": dataset_hparams},
                           device=device,
                           data_source=dummy_source))
        elif _is_record_data(data_type):
            source_i = PickleDataSource(file_paths=hparams_i.files)
            sources.append(source_i)
            self._names.append({
                name: connect_name(hparams_i.data_name, name)
                for name in hparams_i.feature_original_types.keys()
            })
            filters.append(None)
            dataset_hparams = dict_fetch(
                hparams_i, RecordData.default_hparams()["dataset"])
            self._databases.append(
                RecordData(hparams={"dataset": dataset_hparams},
                           device=device,
                           data_source=dummy_source))
        else:
            raise ValueError(f"Unknown data type: {hparams_i.data_type}")

        # Reject duplicate data names. BUGFIX: the original scanned
        # `name_prefix[:i - 1]`, which never compares a name against its
        # immediate predecessor (and compares nothing when i == 1), so
        # duplicates could slip through. Checking membership before
        # appending covers every earlier name.
        if hparams_i["data_name"] in name_prefix:
            raise ValueError(
                f"Duplicate data name: {hparams_i['data_name']}")
        name_prefix.append(hparams_i["data_name"])

    self._name_to_id = {v: k for k, v in enumerate(name_prefix)}

    data_source: DataSource = ZipDataSource(*sources)

    if any(filters):
        def filter_fn(data):
            # `element` avoids shadowing the outer `data` tuple.
            return all(fn(element)
                       for fn, element in zip(filters, data)
                       if fn is not None)

        data_source = FilterDataSource(data_source, filter_fn=filter_fn)

    super().__init__(data_source, self._hparams, device)
def __init__(self, hparams, device: Optional[torch.device] = None):
    """Builds a multi-aligned dataset (local multi-file training variant).

    Besides the standard construction, this variant keeps bookkeeping for
    training across multiple data files (`_datafile_id`, `_datafile_prefix`,
    `_datafile_num`, `_processed_cache`) and overrides `_dataset_size`.

    Args:
        hparams: Dataset hyperparameters; ``hparams["datasets"]`` is a list
            of per-dataset hparam dicts, each with a ``data_type`` of text,
            scalar, or record.
        device: Target device for produced batches; forwarded to each
            constituent database and the base class.

    Raises:
        ValueError: If a dataset has an unknown ``data_type``, or if two
            datasets share the same ``data_name``.
    """
    self._hparams = HParams(hparams, self.default_hparams())

    # Defaultize the hyperparameters of each constituent dataset using the
    # defaults appropriate for its declared data type.
    datasets_hparams = self._hparams.datasets
    defaultized_datasets_hparams = []
    for hparams_i in datasets_hparams:
        data_type = hparams_i.get("data_type", None)
        defaultized_ds_hpms = HParams(
            hparams_i, _default_dataset_hparams(data_type))
        defaultized_datasets_hparams.append(defaultized_ds_hpms)
    self._hparams.datasets = defaultized_datasets_hparams

    self._vocab = self.make_vocab(self._hparams.datasets)
    self._embedding = self.make_embedding(
        self._hparams.datasets, self._vocab)

    # Each constituent database gets an empty placeholder source; the real
    # sources are zipped together below and fed to the base class instead.
    dummy_source = SequenceDataSource[Any]([])
    name_prefix: List[str] = []
    self._names: List[Dict[str, Any]] = []
    sources: List[DataSource] = []
    filters: List[Optional[Callable[[str], bool]]] = []
    self._databases: List[DatasetBase] = []

    for idx, hparams_i in enumerate(self._hparams.datasets):
        data_type = hparams_i.data_type
        source_i: DataSource

        if _is_text_data(data_type):
            source_i = TextLineDataSource(
                hparams_i.files,
                compression_type=hparams_i.compression_type,
                delimiter=hparams_i.delimiter)
            sources.append(source_i)
            if ((hparams_i.length_filter_mode ==
                 _LengthFilterMode.DISCARD.value) and
                    hparams_i.max_seq_length is not None):
                # Factory binds max_seq_length at call time, avoiding the
                # late-binding-closure pitfall inside this loop.
                def _get_filter(max_seq_length):
                    return lambda x: len(x) <= max_seq_length

                filters.append(_get_filter(hparams_i.max_seq_length))
            else:
                filters.append(None)

            self._names.append({
                field: connect_name(hparams_i.data_name, field)
                for field in ["text", "text_ids", "length"]
            })
            dataset_hparams = dict_fetch(
                hparams_i, MonoTextData.default_hparams()["dataset"])
            dataset_hparams["data_name"] = None
            self._databases.append(
                MonoTextData(hparams={"dataset": dataset_hparams},
                             device=device,
                             vocab=self._vocab[idx],
                             embedding=self._embedding[idx],
                             data_source=dummy_source))
        elif _is_scalar_data(data_type):
            source_i = TextLineDataSource(
                hparams_i.files,
                compression_type=hparams_i.compression_type)
            sources.append(source_i)
            filters.append(None)
            self._names.append({"data": hparams_i.data_name})
            dataset_hparams = dict_fetch(
                hparams_i, ScalarData.default_hparams()["dataset"])
            dataset_hparams["data_name"] = "data"
            self._databases.append(
                ScalarData(hparams={"dataset": dataset_hparams},
                           device=device,
                           data_source=dummy_source))
        elif _is_record_data(data_type):
            source_i = PickleDataSource(file_paths=hparams_i.files)
            sources.append(source_i)
            # TODO: Only check `feature_types` when we finally remove
            # `feature_original_types`.
            feature_types = (hparams_i.feature_types or
                             hparams_i.feature_original_types)
            self._names.append({
                name: connect_name(hparams_i.data_name, name)
                for name in feature_types.keys()
            })
            filters.append(None)
            dataset_hparams = dict_fetch(
                hparams_i, RecordData.default_hparams()["dataset"])
            self._databases.append(
                RecordData(hparams={"dataset": dataset_hparams},
                           device=device,
                           data_source=dummy_source))
        else:
            raise ValueError(f"Unknown data type: {hparams_i.data_type}")

        # Reject duplicate data names. BUGFIX: the original scanned
        # `name_prefix[:i - 1]`, which never compares a name against its
        # immediate predecessor (and compares nothing when i == 1), so
        # duplicates could slip through. Checking membership before
        # appending covers every earlier name.
        if hparams_i["data_name"] in name_prefix:
            raise ValueError(
                f"Duplicate data name: {hparams_i['data_name']}")
        name_prefix.append(hparams_i["data_name"])

    self._name_to_id = {v: k for k, v in enumerate(name_prefix)}

    # Bookkeeping for training from multiple data files.
    self._processed_cache = []
    self._datafile_id = 0
    self._index_at_beginning_of_this_dataset = 0
    # NOTE(review): `hparams_i` leaks out of the loop above, so this is the
    # files spec of the LAST dataset only — confirm that is intended.
    self._datafile_prefix = hparams_i.files
    # TODO(review): hardcoded file count; should come from hparams
    # (e.g. `hparams_i.datafile_num`) instead of being edited per run.
    self._datafile_num = 1

    data_source: DataSource = ZipDataSource(*sources)

    if any(filters):
        def filter_fn(data):
            # `element` avoids shadowing the outer `data` tuple.
            return all(fn(element)
                       for fn, element in zip(filters, data)
                       if fn is not None)

        data_source = FilterDataSource(data_source, filter_fn=filter_fn)

    super(MultiAlignedData, self).__init__(
        data_source, self._hparams, device)
    # TODO(review): hardcoded override of the size computed by the base
    # class; should be derived from the data or supplied via hparams.
    self._dataset_size = 834229