def _process_dataset(self, dataset, hparams, data_spec):
    """Applies the source/target processing transform to ``dataset``,
    filters examples by source/target length, and truncates the dataset
    to the configured maximum size.

    Returns the transformed dataset and the (possibly updated) data spec.
    """
    src_hparams = hparams["source_dataset"]
    tgt_hparams = hparams["target_dataset"]

    prefix = PairedTextData._get_name_prefix(src_hparams, tgt_hparams)
    process_fn, data_spec = self._make_processor(
        src_hparams, tgt_hparams, data_spec, name_prefix=prefix)

    dataset = dataset.map(
        lambda *args: process_fn(dsutils.maybe_tuple(args)),
        num_parallel_calls=hparams["num_parallel_calls"])

    # Drop examples whose source/target lengths fall outside the
    # configured bounds.
    src_decoder = data_spec.decoder[0]
    tgt_decoder = data_spec.decoder[1]
    src_length_name = dsutils._connect_name(
        data_spec.name_prefix[0], src_decoder.length_tensor_name)
    tgt_length_name = dsutils._connect_name(
        data_spec.name_prefix[1], tgt_decoder.length_tensor_name)
    filter_fn = self._make_length_filter(
        src_hparams, tgt_hparams,
        src_length_name, tgt_length_name,
        src_decoder, tgt_decoder)
    if filter_fn:
        dataset = dataset.filter(filter_fn)

    # Cap the total number of examples.
    dataset = dataset.take(hparams["max_dataset_size"])

    return dataset, data_spec
def text_id_name(self):
    """The name of text index tensor, "text_ids" by default.
    """
    spec = self._data_spec
    return dsutils._connect_name(
        spec.name_prefix, spec.decoder.text_id_tensor_name)
def length_name(self):
    """The name of length tensor, "length" by default.
    """
    spec = self._data_spec
    return dsutils._connect_name(
        spec.name_prefix, spec.decoder.length_tensor_name)
def target_text_id_name(self):
    """The name of the target text index tensor, "target_text_ids"
    by default.
    """
    prefix = self._data_spec.name_prefix[1]
    return dsutils._connect_name(
        prefix, self._tgt_decoder.text_id_tensor_name)
def source_text_id_name(self):
    """The name of the source text index tensor, "source_text_ids"
    by default.
    """
    prefix = self._data_spec.name_prefix[0]
    return dsutils._connect_name(
        prefix, self._src_decoder.text_id_tensor_name)
def _get_length_name(i):
    """Returns the length tensor name for the ``i``-th dataset, or
    `None` when that dataset is not a text dataset.

    Reads ``hparams`` and ``data_spec`` from the enclosing scope.
    """
    if not _is_text_data(hparams["datasets"][i]["data_type"]):
        return None
    return dsutils._connect_name(
        data_spec.name_prefix[i],
        data_spec.decoder[i].length_tensor_name)
def utterance_cnt_name(self):
    """The name of utterance count tensor, "utterance_cnt" by default.

    Raises:
        ValueError: If the dataset is not variable-utterance data.
    """
    if not self._hparams.dataset.variable_utterance:
        raise ValueError("`utterance_cnt_name` is not defined.")
    spec = self._data_spec
    return dsutils._connect_name(
        spec.name_prefix, spec.decoder.utterance_cnt_tensor_name)
def target_utterance_cnt_name(self):
    """The name of the target text utterance count tensor,
    "target_utterance_cnt" by default.

    Raises:
        ValueError: If the target dataset is not variable-utterance data.
    """
    if not self._hparams.target_dataset.variable_utterance:
        raise ValueError(
            "`utterance_cnt_name` of target data is undefined.")
    prefix = self._data_spec.name_prefix[1]
    return dsutils._connect_name(
        prefix, self._tgt_decoder.utterance_cnt_tensor_name)
def source_utterance_cnt_name(self):
    """The name of the source text utterance count tensor,
    "source_utterance_cnt" by default.

    Raises:
        ValueError: If the source dataset is not variable-utterance data.
    """
    if not self._hparams.source_dataset.variable_utterance:
        raise ValueError(
            "`utterance_cnt_name` of source data is undefined.")
    prefix = self._data_spec.name_prefix[0]
    return dsutils._connect_name(
        prefix, self._src_decoder.utterance_cnt_tensor_name)
def data_name(self, name_or_id):
    """The name of the data tensor of scalar dataset by its name or id.

    If the dataset is not a scalar data, returns `None`.
    """
    # Fixed docstring typo: trailing "id.." -> "id."
    i = self._maybe_name_to_id(name_or_id)
    if not _is_scalar_data(self._hparams.datasets[i]["data_type"]):
        return None
    return dsutils._connect_name(
        self._data_spec.name_prefix[i],
        self._data_spec.decoder[i].data_tensor_name)
def text_id_name(self, name_or_id):
    """The name of text index tensor of text dataset by its name or id.

    If the dataset is not of text type, returns `None`.
    """
    # Fixed docstring: it previously said "length tensor" although the
    # code returns the text-id tensor name.
    i = self._maybe_name_to_id(name_or_id)
    if not _is_text_data(self._hparams.datasets[i]["data_type"]):
        return None
    return dsutils._connect_name(
        self._data_spec.name_prefix[i],
        self._data_spec.decoder[i].text_id_tensor_name)
def utterance_cnt_name(self, name_or_id):
    """The name of utterance count tensor of text dataset by its
    name or id.

    If the dataset is not variable utterance text data, returns `None`.
    """
    i = self._maybe_name_to_id(name_or_id)
    dataset_hparams = self._hparams.datasets[i]
    is_variable_text = (
        _is_text_data(dataset_hparams["data_type"])
        and dataset_hparams["variable_utterance"])
    if not is_variable_text:
        return None
    return dsutils._connect_name(
        self._data_spec.name_prefix[i],
        self._data_spec.decoder[i].utterance_cnt_tensor_name)
def _process_dataset(self, dataset, hparams, data_spec):
    """Applies the chained processing transform to ``dataset``, filters
    examples by length, and truncates the dataset to the configured
    maximum size.

    Returns the transformed dataset and the (possibly updated) data spec.
    """
    dataset_hparams = hparams["dataset"]
    process_fn, data_spec = self._make_processor(
        dataset_hparams, data_spec,
        name_prefix=dataset_hparams["data_name"])
    dataset = dataset.map(
        lambda *args: process_fn(dsutils.maybe_tuple(args)),
        num_parallel_calls=hparams["num_parallel_calls"])

    # Drop examples whose length falls outside the configured bounds.
    length_name = dsutils._connect_name(
        data_spec.name_prefix, data_spec.decoder.length_tensor_name)
    filter_fn = self._make_length_filter(
        dataset_hparams, length_name, data_spec.decoder)
    if filter_fn:
        dataset = dataset.filter(filter_fn)

    # Cap the total number of examples.
    dataset = dataset.take(hparams["max_dataset_size"])

    return dataset, data_spec
def target_length_name(self):
    """The name of the target length tensor, "target_length" by default.
    """
    prefix = self._data_spec.name_prefix[1]
    return dsutils._connect_name(
        prefix, self._tgt_decoder.length_tensor_name)
def source_length_name(self):
    """The name of the source length tensor, "source_length" by default.
    """
    prefix = self._data_spec.name_prefix[0]
    return dsutils._connect_name(
        prefix, self._src_decoder.length_tensor_name)