    def __init__(self, hparams, device: Optional[torch.device] = None):
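        """Construct a multi-aligned dataset from ``hparams``.

        Fills in per-dataset default hyperparameters, builds vocabularies
        and embeddings for the text datasets, then creates one data source
        and one sub-dataset (``MonoTextData``, ``ScalarData``, or
        ``RecordData``) per entry in ``hparams["datasets"]``. The per-dataset
        sources are zipped together and, if any length filters are active,
        wrapped in a ``FilterDataSource`` before being passed to the
        base-class constructor.
        """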
        self._hparams = HParams(hparams, self.default_hparams())
        # Fill in default hyperparameters for each dataset.
        datasets_hparams = self._hparams.datasets
        defaultized_datasets_hparams = []
        for hparams_i in datasets_hparams:
            data_type = hparams_i.get("data_type", None)
            defaultized_ds_hpms = HParams(hparams_i,
                                          _default_dataset_hparams(data_type))
            defaultized_datasets_hparams.append(defaultized_ds_hpms)
        self._hparams.datasets = defaultized_datasets_hparams

        self._vocab = self.make_vocab(self._hparams.datasets)
        self._embedding = self.make_embedding(self._hparams.datasets,
                                              self._vocab)

        dummy_source = SequenceDataSource[Any]([])
        name_prefix: List[str] = []
        self._names: List[Dict[str, Any]] = []
        sources: List[DataSource] = []
        filters: List[Optional[Callable[[str], bool]]] = []
        self._databases: List[DataBase] = []
        for idx, hparams_i in enumerate(self._hparams.datasets):
            data_type = _DataType(hparams_i.data_type)
            source_i: DataSource

            if _is_text_data(data_type):
                source_i = TextLineDataSource(
                    hparams_i.files,
                    compression_type=hparams_i.compression_type,
                    delimiter=hparams_i.delimiter)
                sources.append(source_i)
                if ((hparams_i.length_filter_mode
                     == _LengthFilterMode.DISCARD.value)
                        and hparams_i.max_seq_length is not None):

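                    # Bind max_seq_length through a factory function: a lambda
                    # written inline would close over the loop variable
                    # hparams_i and see only its final value (late binding).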
                    def _get_filter(max_seq_length):
                        return lambda x: len(x) <= max_seq_length

                    filters.append(_get_filter(hparams_i.max_seq_length))
                else:
                    filters.append(None)

                self._names.append({
                    field: connect_name(hparams_i.data_name, field)
                    for field in ["text", "text_ids", "length"]
                })

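                # The MonoTextData below is built over the empty dummy source;
                # it is used for its vocab/embedding-aware processing, while
                # the zipped source constructed later supplies the examples.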
                dataset_hparams = dict_fetch(
                    hparams_i,
                    MonoTextData.default_hparams()["dataset"])
                dataset_hparams["data_name"] = None
                self._databases.append(
                    MonoTextData(hparams={"dataset": dataset_hparams},
                                 device=device,
                                 vocab=self._vocab[idx],
                                 embedding=self._embedding[idx],
                                 data_source=dummy_source))
            elif _is_scalar_data(data_type):
                source_i = TextLineDataSource(
                    hparams_i.files,
                    compression_type=hparams_i.compression_type)
                sources.append(source_i)
                filters.append(None)
                self._names.append({"data": hparams_i.data_name})

                dataset_hparams = dict_fetch(
                    hparams_i,
                    ScalarData.default_hparams()["dataset"])
                dataset_hparams["data_name"] = "data"
                self._databases.append(
                    ScalarData(hparams={"dataset": dataset_hparams},
                               device=device,
                               data_source=dummy_source))
            elif _is_record_data(data_type):
                source_i = PickleDataSource(file_paths=hparams_i.files)
                sources.append(source_i)
                self._names.append({
                    name: connect_name(hparams_i.data_name, name)
                    for name in hparams_i.feature_original_types.keys()
                })
                filters.append(None)

                dataset_hparams = dict_fetch(
                    hparams_i,
                    RecordData.default_hparams()["dataset"])
                self._databases.append(
                    RecordData(hparams={"dataset": dataset_hparams},
                               device=device,
                               data_source=dummy_source))
            else:
                raise ValueError(f"Unknown data type: {hparams_i.data_type}")

            # check for duplicate data names before registering this dataset
            if hparams_i.data_name in name_prefix:
                raise ValueError(
                    f"Duplicate data name: {hparams_i.data_name}")
            name_prefix.append(hparams_i.data_name)

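        # Map each dataset's data_name to its position in the datasets list.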
        self._name_to_id = {v: k for k, v in enumerate(name_prefix)}

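        # Zip the per-dataset sources so that each example is a tuple holding
        # one aligned entry from every dataset.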
        data_source: DataSource = ZipDataSource(*sources)

        if any(filters):

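            # Keep a zipped example only if every active per-dataset filter
            # accepts its corresponding field.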
            def filter_fn(data):
                return all(
                    fn(item) for fn, item in zip(filters, data)
                    if fn is not None)

            data_source = FilterDataSource(data_source, filter_fn=filter_fn)
        super().__init__(data_source, self._hparams, device)

    def __init__(self, hparams, device: Optional[torch.device] = None):
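        """Construct a multi-aligned dataset (locally modified Texar variant).

        Follows the same structure as the constructor above, but additionally
        initializes bookkeeping for training from multiple data files
        (``_processed_cache``, ``_datafile_id``, ``_datafile_num``) and sets
        a hard-coded dataset size.
        """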
        print("Using local texar")
        self._hparams = HParams(hparams, self.default_hparams())
        # Fill in default hyperparameters for each dataset.
        datasets_hparams = self._hparams.datasets
        defaultized_datasets_hparams = []
        for hparams_i in datasets_hparams:
            data_type = hparams_i.get("data_type", None)
            #print("data_type:", data_type)
            defaultized_ds_hpms = HParams(hparams_i,
                                          _default_dataset_hparams(data_type))
            defaultized_datasets_hparams.append(defaultized_ds_hpms)
        self._hparams.datasets = defaultized_datasets_hparams

        #print("will make_vocab")
        self._vocab = self.make_vocab(self._hparams.datasets)
        #print("will make_embedding")
        self._embedding = self.make_embedding(self._hparams.datasets,
                                              self._vocab)

        dummy_source = SequenceDataSource[Any]([])
        name_prefix: List[str] = []
        self._names: List[Dict[str, Any]] = []
        sources: List[DataSource] = []
        filters: List[Optional[Callable[[str], bool]]] = []
        self._databases: List[DatasetBase] = []
        for idx, hparams_i in enumerate(self._hparams.datasets):
            data_type = hparams_i.data_type
            source_i: DataSource

            if _is_text_data(data_type):
                #print("will TextLineDataSource")
                source_i = TextLineDataSource(
                    hparams_i.files,
                    compression_type=hparams_i.compression_type,
                    delimiter=hparams_i.delimiter)
                sources.append(source_i)
                if ((hparams_i.length_filter_mode
                     == _LengthFilterMode.DISCARD.value)
                        and hparams_i.max_seq_length is not None):

                    def _get_filter(max_seq_length):
                        return lambda x: len(x) <= max_seq_length

                    filters.append(_get_filter(hparams_i.max_seq_length))
                else:
                    filters.append(None)

                self._names.append({
                    field: connect_name(hparams_i.data_name, field)
                    for field in ["text", "text_ids", "length"]
                })

                dataset_hparams = dict_fetch(
                    hparams_i,
                    MonoTextData.default_hparams()["dataset"])
                dataset_hparams["data_name"] = None
                self._databases.append(
                    MonoTextData(hparams={"dataset": dataset_hparams},
                                 device=device,
                                 vocab=self._vocab[idx],
                                 embedding=self._embedding[idx],
                                 data_source=dummy_source))
            elif _is_scalar_data(data_type):
                source_i = TextLineDataSource(
                    hparams_i.files,
                    compression_type=hparams_i.compression_type)
                sources.append(source_i)
                filters.append(None)
                self._names.append({"data": hparams_i.data_name})

                dataset_hparams = dict_fetch(
                    hparams_i,
                    ScalarData.default_hparams()["dataset"])
                dataset_hparams["data_name"] = "data"
                self._databases.append(
                    ScalarData(hparams={"dataset": dataset_hparams},
                               device=device,
                               data_source=dummy_source))
            elif _is_record_data(data_type):
                source_i = PickleDataSource(file_paths=hparams_i.files)
                sources.append(source_i)
                # TODO: Only check `feature_types` when we finally remove
                #   `feature_original_types`.
                feature_types = (hparams_i.feature_types
                                 or hparams_i.feature_original_types)
                self._names.append({
                    name: connect_name(hparams_i.data_name, name)
                    for name in feature_types.keys()
                })
                filters.append(None)

                dataset_hparams = dict_fetch(
                    hparams_i,
                    RecordData.default_hparams()["dataset"])
                self._databases.append(
                    RecordData(hparams={"dataset": dataset_hparams},
                               device=device,
                               data_source=dummy_source))
            else:
                raise ValueError(f"Unknown data type: {hparams_i.data_type}")

            # check for duplicate data names before registering this dataset
            if hparams_i.data_name in name_prefix:
                raise ValueError(
                    f"Duplicate data name: {hparams_i.data_name}")
            name_prefix.append(hparams_i.data_name)

        self._name_to_id = {v: k for k, v in enumerate(name_prefix)}
        self._processed_cache = []
        self._datafile_id = 0  # for training from multiple files
        self._index_at_beginning_of_this_dataset = 0
        self._datafile_prefix = hparams_i.files  # hparams_i is the last dataset's hparams here
        self._datafile_num = 1  # hard-coded; presumably meant to come from hparams_i.datafile_num

        data_source: DataSource = ZipDataSource(*sources)

        if any(filters):

            def filter_fn(data):
                return all(
                    fn(item) for fn, item in zip(filters, data)
                    if fn is not None)

            data_source = FilterDataSource(data_source, filter_fn=filter_fn)
        #print("data init derive done")
        super().__init__(data_source, self._hparams, device)
        self._dataset_size = 834229  # hard-coded dataset size for the current data configuration
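

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). A minimal ``hparams`` layout that would
# exercise the "text" and scalar ("int") branches of the constructors above;
# the file paths and batch size are hypothetical placeholders, while the keys
# mirror the per-dataset fields read by the constructors ("files",
# "data_type", "data_name", "vocab_file", ...).
_EXAMPLE_HPARAMS = {
    "batch_size": 64,
    "datasets": [
        {   # text dataset -> handled by the MonoTextData branch
            "files": "data/src.txt",          # hypothetical path
            "vocab_file": "data/vocab.txt",   # hypothetical path
            "data_name": "src",
            "data_type": "text",
        },
        {   # scalar dataset -> handled by the ScalarData branch
            "files": "data/labels.txt",       # hypothetical path
            "data_name": "label",
            "data_type": "int",
        },
    ],
}
# data = MultiAlignedData(_EXAMPLE_HPARAMS)
# A batch would then expose fields such as "src_text", "src_text_ids",
# "src_length", and "label", following the name mapping built in self._names.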