def test_buckets_similar_size(self):
    # testing dataset is 3 x 6 sequences of lengths 0 - 5
    iterators = {
        "sentences": lambda: [["word" for _ in range(l)]
                              for l in range(6)] * 3
    }
    dataset = Dataset("dataset", iterators=iterators, shuffled=True)

    # we use batch size 6 and bucket span 2
    scheme = BatchingScheme(6, 2, False, None)

    # we process the dataset and save what the batches look like
    batches = []
    for batch in dataset.batches(scheme):
        batches.append(list(batch.get_series("sentences")))

    # this setup should divide the data into 3 batches
    self.assertEqual(len(batches), 3)

    for batch in batches:
        # each batch should contain 6 values
        self.assertEqual(len(batch), 6)

        lengths = set(len(b) for b in batch)

        # the values in the batch should have two lengths
        self.assertEqual(len(lengths), 2)

        # the lengths should differ by one
        self.assertEqual(max(lengths) - min(lengths), 1)
def test_batching_lazy_shuffle(self):
    iterators = {"a": lambda: range(5), "b": lambda: range(5, 10)}
    dataset = Dataset("dataset", iterators=iterators, shuffled=True,
                      buffer_size=(3, 5))

    batches = []
    for _ in range(2):
        epoch = []
        for batch in dataset.batches(DEFAULT_BATCHING_SCHEME):
            epoch.append({s: list(batch.get_series(s)) for s in iterators})
        batches.append(epoch)

    epoch_data = []
    epoch_data.append(
        [c for batch in batches[0] for b in batch.values() for c in b])
    epoch_data.append(
        [c for batch in batches[1] for b in batch.values() for c in b])

    self.assertEqual(set(epoch_data[0]), set(range(10)))
    self.assertEqual(set(epoch_data[0]), set(epoch_data[1]))
    self.assertNotEqual(epoch_data[0], epoch_data[1])
def test_bucketing_no_leftovers(self):
    # testing dataset is 49 sequences of lengths 1 - 49
    iterators = {
        "sentences": lambda: (["word" for _ in range(l)]
                              for l in range(1, 50))
    }
    dataset = Dataset("dataset", iterators=iterators, shuffled=False)

    # we use batch size 7 and bucket span 10
    scheme = BatchingScheme(7, 10, False, None, False)

    # we process the dataset and save what the batches look like
    batches = []
    for batch in dataset.batches(scheme):
        batches.append(list(batch.get_series("sentences")))

    ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                   [["word" for _ in range(l)] for l in range(10, 17)],
                   [["word" for _ in range(l)] for l in range(20, 27)],
                   [["word" for _ in range(l)] for l in range(30, 37)],
                   [["word" for _ in range(l)] for l in range(40, 47)]]

    self.assertSequenceEqual(ref_batches, batches)
def test_bucketing(self):
    # testing dataset is 49 sequences of lengths 1 - 49
    iterators = {
        "sentences": lambda: (["word" for _ in range(l)]
                              for l in range(1, 50))
    }

    # we use bucket boundaries at lengths 9, 19, ..., 49 and batch size 7
    # in every bucket
    scheme = BatchingScheme(bucket_boundaries=[9, 19, 29, 39, 49],
                            bucket_batch_sizes=[7, 7, 7, 7, 7, 7])

    dataset = Dataset("dataset", iterators=iterators, batching=scheme,
                      shuffled=False)

    # we process the dataset and save what the batches look like
    batches = []
    for batch in dataset.batches():
        batches.append(list(batch.get_series("sentences")))

    ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                   [["word" for _ in range(l)] for l in range(10, 17)],
                   [["word" for _ in range(l)] for l in range(20, 27)],
                   [["word" for _ in range(l)] for l in range(30, 37)],
                   [["word" for _ in range(l)] for l in range(40, 47)],
                   [["word" for _ in range(l)] for l in range(8, 10)],
                   [["word" for _ in range(l)] for l in range(17, 20)],
                   [["word" for _ in range(l)] for l in range(27, 30)],
                   [["word" for _ in range(l)] for l in range(37, 40)],
                   [["word" for _ in range(l)] for l in range(47, 50)]]

    self.assertSequenceEqual(ref_batches, batches)
def test_batching_lazy_noshuffle(self):
    iterators = {"a": lambda: range(5), "b": lambda: range(10, 15)}
    dataset = Dataset("dataset", iterators=iterators, shuffled=False,
                      buffer_size=(3, 5))

    batches = []
    for _ in range(2):
        epoch = []
        for batch in dataset.batches(DEFAULT_BATCHING_SCHEME):
            epoch.append({s: list(batch.get_series(s)) for s in iterators})
        batches.append(epoch)

    self.assertEqual(batches, [[{"a": [0, 1, 2], "b": [10, 11, 12]},
                                {"a": [3, 4], "b": [13, 14]}],
                               [{"a": [0, 1, 2], "b": [10, 11, 12]},
                                {"a": [3, 4], "b": [13, 14]}]])
def __call__(
        self, dataset: Dataset,
        generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:
    source_series = generated_series.get(
        self._source_id, dataset.get_series(self._source_id))
    edits_series = generated_series.get(
        self._edits_id, dataset.get_series(self._edits_id))

    for src_seq, edit_seq in zip(source_series, edits_series):
        yield reconstruct(src_seq, edit_seq)
def _do_postprocess(
        self, dataset: Dataset,
        generated_series: Dict[str, Iterable[Any]]) -> Iterable[List[str]]:
    source_series = generated_series.get(self._source_id)
    if source_series is None:
        source_series = dataset.get_series(self._source_id)
    edits_series = generated_series.get(self._edits_id)
    if edits_series is None:
        edits_series = dataset.get_series(self._edits_id)

    for src_seq, edit_seq in zip(source_series, edits_series):
        reconstructed = reconstruct(src_seq, edit_seq)
        yield reconstructed
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = cast(Iterable[List[str]],
                     dataset.maybe_get_series(self.data_id))

    if sentences is None and train:
        raise ValueError("When training, you must feed "
                         "reference sentences")

    if sentences is not None:
        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences), train_mode=train, max_len=self.max_length)

        # sentences_to_tensor returns time-major tensors, targets need to
        # be batch-major
        vectors = vectors.T
        paddings = paddings.T

        # Need to convert the data to a sparse representation
        bool_mask = (paddings > 0.5)
        indices = np.stack(np.where(bool_mask), axis=1)
        values = vectors[bool_mask]

        fd[self.train_targets] = tf.SparseTensorValue(
            indices=indices, values=values, dense_shape=vectors.shape)

    return fd
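# A standalone sketch (illustration only, not part of the model code) of how
# the dense, batch-major `vectors`/`paddings` pair above is turned into the
# indices/values/dense_shape triple expected by tf.SparseTensorValue. The toy
# arrays below are made up for this example.
import numpy as np


def _dense_to_sparse_demo():
    # toy batch-major data: 2 sentences, max length 3, 0 = padding id
    vectors = np.array([[4, 7, 0],
                        [5, 0, 0]])
    paddings = np.array([[1., 1., 0.],
                         [1., 0., 0.]])

    bool_mask = (paddings > 0.5)
    # (row, column) coordinates of every non-padding token
    indices = np.stack(np.where(bool_mask), axis=1)
    # the token ids at those coordinates
    values = vectors[bool_mask]

    # indices == [[0, 0], [0, 1], [1, 0]], values == [4, 7, 5],
    # dense_shape == (2, 3)
    return indices, values, vectors.shape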
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Callable,
                   write_out: bool = False,
                   batch_size: Optional[int] = None) \
        -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A list of runners to execute on the dataset.
        dataset: The dataset on which the model will be executed.
        postprocess: A function (or None) applied to the raw output data
            before they are returned or written out.
        write_out: Flag whether the outputs should be written to the files
            defined in the dataset object.
        batch_size: The size of the batches in which the dataset is
            processed.

    Returns:
        A tuple of the list of execution results returned by the runners
        and a dictionary mapping output series names to the (possibly
        postprocessed) output data.
    """
    contains_targets = all(dataset.has_series(runner.output_series)
                           for runner in runners)

    all_results = tf_manager.execute(dataset, runners,
                                     train=contains_targets,
                                     batch_size=batch_size)

    result_data_raw = {runner.output_series: result.outputs
                       for runner, result in zip(runners, all_results)}

    if postprocess is not None:
        result_data = postprocess(dataset, result_data_raw)
    else:
        result_data = result_data_raw

    if write_out:
        for series_id, data in result_data.items():
            if series_id in dataset.series_outputs:
                path = dataset.series_outputs[series_id]
                if isinstance(data, np.ndarray):
                    np.save(path, data)
                    log('Result saved as numpy array to "{}"'.format(path))
                else:
                    with open(path, 'w') as f_out:
                        f_out.writelines(
                            [" ".join(sent) + "\n" for sent in data])
                    log("Result saved as plain text \"{}\"".format(path))
            else:
                log("There is no output file for dataset: {}"
                    .format(dataset.name), color='red')

    return all_results, result_data
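# A hedged usage sketch of run_on_dataset. The tf_manager, runners and dataset
# objects are assumed to be constructed elsewhere (e.g. when an experiment
# configuration is loaded); the names below are placeholders, not part of this
# module.
def _run_on_dataset_usage_sketch(tf_manager, runners, val_dataset):
    # run the model on the validation data and write each output series to
    # the files configured in val_dataset.series_outputs
    execution_results, output_data = run_on_dataset(
        tf_manager, runners, val_dataset,
        postprocess=None, write_out=True, batch_size=32)

    # output_data maps output series names to the decoded outputs
    return execution_results, output_data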
def dataset_from_files(**kwargs):
    """Create a dataset from the provided arguments.

    Paths to the data are provided in the form of a dictionary.

    Args:
        kwargs: Arguments are treated as a dictionary. Paths to the data
            series are specified here. Series identifiers should not
            contain underscores. A preprocessing function to apply to a
            textual series can be given as ``<identifier>_preprocess=function``,
            or a preprocessor can be specified globally with the
            ``preprocessor`` argument.
    """
    random_seed = kwargs.get('random_seed', None)
    preprocess = kwargs.get('preprocessor', lambda x: x)

    series_paths = _get_series_paths(kwargs)

    if len(series_paths) > 0:
        log("Initializing dataset with: {}".format(", ".join(series_paths)))
        series = {s: Dataset.create_series(series_paths[s], preprocess)
                  for s in series_paths}
        name = kwargs.get('name', _get_name_from_paths(series_paths))

        series_outputs = {SERIES_OUTPUT.match(key)[1]: value
                          for key, value in kwargs.items()
                          if SERIES_OUTPUT.match(key)}

        dataset = Dataset(name, series, series_outputs, random_seed)
        log("Dataset length: {}".format(len(dataset)))
        return dataset
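# A hedged usage sketch of dataset_from_files. Only the `name`, `preprocessor`
# and `random_seed` keyword arguments are taken directly from the code above;
# how the remaining keyword arguments map to series paths is resolved by
# _get_series_paths, so the `s_<series>`-style keys and file paths shown here
# are assumptions made for illustration, not something this snippet defines.
def _dataset_from_files_usage_sketch():
    return dataset_from_files(
        name="validation",
        preprocessor=lambda x: x,          # identity, matching the default
        random_seed=42,
        s_source="data/val.source.txt",    # assumed series-path key format
        s_target="data/val.target.txt")    # assumed series-path key format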
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Populate the feed dictionary for the decoder object.

    Arguments:
        dataset: The dataset to use for the decoder.
        train: Boolean flag, telling whether this is a training run.
    """
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)

    if sentences is None and train:
        raise ValueError("When training, you must feed "
                         "reference sentences")

    go_symbol_idx = self.vocabulary.get_word_index(START_TOKEN)
    fd[self.go_symbols] = np.full([len(dataset)], go_symbol_idx,
                                  dtype=np.int32)

    if sentences is not None:
        sentences_list = list(sentences)
        # train_mode=False, since we don't want to <unk>ize target words!
        inputs, _ = self.vocabulary.sentences_to_tensor(
            sentences_list, self.max_output_len, train_mode=False,
            add_start_symbol=False, add_end_symbol=True,
            pad_to_max_len=False)

        fd[self.train_inputs] = inputs

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}  # type: FeedDict

    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    fd[self.train_mode] = train

    if sentences is not None:
        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences), train_mode=train)

        # sentences_to_tensor returns time-major tensors, targets need to
        # be batch-major
        vectors = vectors.T
        paddings = paddings.T

        # Need to convert the data to a sparse representation
        bool_mask = (paddings > 0.5)
        indices = np.stack(np.where(bool_mask), axis=1)
        values = vectors[bool_mask]

        fd[self.train_targets] = tf.SparseTensorValue(
            indices=indices, values=values, dense_shape=vectors.shape)

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = {}  # type: FeedDict
    fd[self.train_mode] = train

    # for checking the lengths of individual factors
    arr_strings = []
    last_paddings = None

    for name, vocabulary in zip(self.data_ids, self.vocabularies):
        factors = dataset.get_series(name)
        vectors, paddings = vocabulary.sentences_to_tensor(
            list(factors), self.max_input_len, pad_to_max_len=False,
            train_mode=train)

        # pylint: disable=unsubscriptable-object
        fd[self.input_factors[name]] = list(zip(*vectors))
        # pylint: enable=unsubscriptable-object

        arr_strings.append(paddings.tostring())
        last_paddings = paddings

    if len(set(arr_strings)) > 1:
        raise ValueError("The lengths of factors do not match")

    fd[self.input_mask] = list(zip(*last_paddings))

    return fd
def execute(self,
            dataset: Dataset,
            execution_scripts,
            train=False,
            compute_losses=True,
            summaries=True,
            batch_size=None,
            log_progress: int = 0) -> List[ExecutionResult]:
    if batch_size is None:
        batch_size = len(dataset)
    batched_dataset = dataset.batch_dataset(batch_size)
    last_log_time = time.process_time()

    batch_results = [
        [] for _ in execution_scripts]  # type: List[List[ExecutionResult]]
    for batch_id, batch in enumerate(batched_dataset):
        if 0 < log_progress < time.process_time() - last_log_time:
            log("Processed {} examples.".format(batch_id * batch_size))
            last_log_time = time.process_time()

        executables = [s.get_executable(compute_losses=compute_losses,
                                        summaries=summaries,
                                        num_sessions=len(self.sessions))
                       for s in execution_scripts]

        while not all(ex.result is not None for ex in executables):
            self._run_executables(batch, executables, train)

        for script_list, executable in zip(batch_results, executables):
            script_list.append(executable.result)

    collected_results = []  # type: List[ExecutionResult]
    for result_list in batch_results:
        collected_results.append(reduce_execution_results(result_list))

    return collected_results
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Populate the feed dictionary with the encoder inputs.

    Encoder input placeholders:
        ``encoder_input``: Stores indices to the vocabulary,
            shape (batch, time)
        ``encoder_padding``: Stores the padding (ones and zeros,
            indicating valid words and positions after the end of
            sentence), shape (batch, time)
        ``train_mode``: Boolean scalar specifying the mode
            (train vs runtime)

    Arguments:
        dataset: The dataset to use
        train: Boolean flag telling whether it is training time
    """
    # pylint: disable=invalid-name
    fd = {}  # type: FeedDict
    fd[self.train_mode] = train
    sentences = dataset.get_series(self.data_id)

    vectors, paddings = self.vocabulary.sentences_to_tensor(
        list(sentences), self.max_input_len, pad_to_max_len=False,
        train_mode=train)

    # as sentences_to_tensor returns lists of shape (time, batch),
    # we need to transpose
    fd[self.inputs] = list(zip(*vectors))
    fd[self.input_mask] = list(zip(*paddings))

    return fd
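# A tiny standalone illustration (not part of the encoder) of the
# list(zip(*...)) idiom used above: sentences_to_tensor returns time-major
# data of shape (time, batch), while the placeholders are fed batch-major,
# i.e. shape (batch, time). The numbers are invented for the example.
def _time_major_to_batch_major_demo():
    time_major = [[1, 4],   # t = 0: first word of each of the 2 sentences
                  [2, 5],   # t = 1
                  [3, 6]]   # t = 2

    batch_major = list(zip(*time_major))
    # batch_major == [(1, 2, 3), (4, 5, 6)]
    return batch_major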
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Populate the feed dictionary for the decoder object.

    Arguments:
        dataset: The dataset to use for the decoder.
        train: Boolean flag, telling whether this is a training run.
    """
    sentences = cast(Iterable[List[str]],
                     dataset.get_series(self.data_id, allow_none=True))

    if sentences is None and train:
        raise ValueError("When training, you must feed "
                         "reference sentences")

    sentences_list = list(sentences) if sentences is not None else None

    fd = {}  # type: FeedDict
    fd[self.train_mode] = train

    go_symbol_idx = self.vocabulary.get_word_index(START_TOKEN)
    fd[self.go_symbols] = np.full([len(dataset)], go_symbol_idx,
                                  dtype=np.int32)

    if sentences is not None:
        # train_mode=False, since we don't want to <unk>ize target words!
        inputs, weights = self.vocabulary.sentences_to_tensor(
            sentences_list, self.max_output_len, train_mode=False,
            add_start_symbol=False, add_end_symbol=True,
            pad_to_max_len=False)

        fd[self.train_inputs] = inputs
        fd[self.train_mask] = weights

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = cast(Iterable[List[str]],
                     dataset.maybe_get_series(self.data_id))

    if sentences is None and train:
        raise ValueError("When training, you must feed "
                         "reference sentences")

    if sentences is not None:
        vectors, paddings = self.vocabulary.sentences_to_tensor(
            list(sentences), train_mode=train, max_len=self.max_length)

        # sentences_to_tensor returns time-major tensors, targets need to
        # be batch-major
        vectors = vectors.T
        paddings = paddings.T

        bool_mask = (paddings > 0.5)
        flat_labels = vectors[bool_mask]
        label_lengths = bool_mask.sum(axis=1)

        fd[self.label_lengths] = label_lengths
        fd[self.flat_labels] = flat_labels

    return fd
def check_dataset_and_coders(dataset: Dataset,
                             runners: Iterable[BaseRunner]) -> None:
    # pylint: disable=protected-access
    data_list = []
    for runner in runners:
        for c in runner.all_coders:
            if hasattr(c, "data_id"):
                data_list.append((c.data_id, c))
            elif hasattr(c, "data_ids"):
                data_list.extend([(d, c) for d in c.data_ids])
            else:
                log(("Coder: {} does not have "
                     "a data attribute.").format(c))

    debug("Found series: {}".format(str(data_list)), "checking")
    missing = []

    for (series_id, coder) in data_list:
        if not dataset.has_series(series_id):
            log("dataset {} does not have series {}".format(
                dataset.name, series_id))
            missing.append((coder, series_id))

    if missing:
        formatted = ["{} ({}, {}.{})".format(series_id, cod.name,
                                             cod.__class__.__module__,
                                             cod.__class__.__name__)
                     for cod, series_id in missing]

        raise CheckingException("Dataset '{}' is missing series {}".format(
            dataset.name, ", ".join(formatted)))
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Feed the placeholders with the data.

    Arguments:
        dataset: The dataset.
        train: A flag whether the train mode is enabled.

    Returns:
        The constructed feed dictionary that contains the factor data and
        the mask.
    """
    fd = {}  # type: FeedDict

    # for checking the lengths of individual factors
    arr_strings = []
    last_paddings = None

    for factor_plc, name, vocabulary in zip(
            self.input_factors, self.data_ids, self.vocabularies):
        factors = dataset.get_series(name)
        vectors, paddings = vocabulary.sentences_to_tensor(
            list(factors), self.max_length, pad_to_max_len=False,
            train_mode=train)

        fd[factor_plc] = list(zip(*vectors))

        arr_strings.append(paddings.tostring())
        last_paddings = paddings

    if len(set(arr_strings)) > 1:
        raise ValueError("The lengths of factors do not match")

    fd[self.mask] = list(zip(*last_paddings))

    return fd
def run(data):  # pragma: no cover
    exp = APP.config["experiment"]
    dataset = Dataset("request", data, {})
    _, response_data = exp.run_model(dataset, write_out=False)
    return response_data
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Populate the feed dictionary with the encoder inputs.

    Arguments:
        dataset: The dataset to use
        train: Boolean flag telling whether it is training time
    """
    # pylint: disable=invalid-name
    fd = {}  # type: FeedDict
    fd[self.train_mode] = train

    series = list(dataset.get_series(self.data_id))
    lengths = []
    inputs = []

    max_len = max(x.shape[0] for x in series)
    if self.max_input_len is not None:
        max_len = min(self.max_input_len, max_len)

    for x in series:
        length = min(max_len, x.shape[0])
        x_padded = np.zeros(shape=(max_len,) + x.shape[1:],
                            dtype=x.dtype)
        x_padded[:length] = x[:length]

        lengths.append(length)
        inputs.append(x_padded)

    fd[self.inputs] = inputs
    fd[self._input_lengths] = lengths

    return fd
def post_request():
    start_time = datetime.datetime.now()
    request_data = request.get_json()

    if request_data is None:
        response_data = {"error": "No data were provided."}
        code = 400
    else:
        args = APP.config['args']

        try:
            dataset = Dataset("request", request_data, {})
            # TODO check the dataset
            # check_dataset_and_coders(dataset, args.encoders)
            _, response_data = run_on_dataset(
                args.tf_manager, args.runners, dataset,
                args.postprocess, write_out=False)
            code = 200
        # pylint: disable=broad-except
        except Exception as exc:
            response_data = {'error': str(exc)}
            code = 400

    response_data['duration'] = (
        datetime.datetime.now() - start_time).total_seconds()
    json_response = json.dumps(response_data)
    response = flask.Response(json_response,
                              content_type='application/json; charset=utf-8')
    response.headers.add('content-length', len(json_response.encode('utf-8')))
    response.status_code = code
    return response
def execute(self,
            dataset: Dataset,
            execution_scripts,
            train=False,
            compute_losses=True,
            summaries=True,
            batch_size=None) -> List[ExecutionResult]:
    if batch_size is None:
        batch_size = len(dataset)
    batched_dataset = dataset.batch_dataset(batch_size)

    batch_results = [
        [] for _ in execution_scripts]  # type: List[List[ExecutionResult]]
    for batch in batched_dataset:
        executables = [s.get_executable(compute_losses=compute_losses,
                                        summaries=summaries)
                       for s in execution_scripts]

        while not all(ex.result is not None for ex in executables):
            all_feedables = set()  # type: Set[Any]
            all_tensors_to_execute = {}  # type: Dict[Executable, tf.Tensor]
            additional_feed_dicts = []
            tensor_list_lengths = []  # type: List[int]

            for executable in executables:
                if executable.result is None:
                    (feedables,
                     tensors_to_execute,
                     add_feed_dict) = executable.next_to_execute()
                    all_feedables = all_feedables.union(feedables)
                    all_tensors_to_execute[executable] = tensors_to_execute
                    additional_feed_dicts.append(add_feed_dict)
                    tensor_list_lengths.append(len(tensors_to_execute))
                else:
                    tensor_list_lengths.append(0)

            feed_dict = _feed_dicts(batch, all_feedables, train=train)
            for fdict in additional_feed_dicts:
                feed_dict.update(fdict)

            session_results = [sess.run(all_tensors_to_execute,
                                        feed_dict=feed_dict)
                               for sess in self.sessions]

            for executable in executables:
                if executable.result is None:
                    executable.collect_results(
                        [res[executable] for res in session_results])

        for script_list, executable in zip(batch_results, executables):
            script_list.append(executable.result)

    collected_results = []  # type: List[ExecutionResult]
    for result_list in batch_results:
        collected_results.append(reduce_execution_results(result_list))

    return collected_results
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    # if the data come from a pickled file, the series is a list rather
    # than a numpy tensor, so convert it as a precaution
    images = np.array(list(dataset.get_series(self.data_id)))
    fd[self.image_input] = images / 255.0

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    images = np.array(dataset.get_series(self.data_id))
    assert images.shape[1:] == (self.height, self.width, 3)
    fd[self.input_image] = images

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    if sentences is not None:
        fd[self.target_tokens] = pad_batch(list(sentences))

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    sentences_list = list(sentences) if sentences is not None else None

    if sentences_list is not None:
        fd[self.train_inputs] = list(zip(*sentences_list))[0]

    return fd
def run(data):  # pragma: no cover
    exp = APP.config["experiment"]
    dataset = Dataset("request", data, BatchingScheme(batch_size=1), {},
                      preprocessors=APP.config["preprocess"])
    _, response_data, _ = exp.run_model(dataset, write_out=False)
    return response_data
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    # if the data come from a pickled file, the series is a list rather
    # than a numpy tensor, so convert it as a precaution
    images = np.array(dataset.get_series(self.data_id))

    f_dict = {}
    f_dict[self.image_input] = images / 255.0
    f_dict[self.train_mode] = train
    return f_dict
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    if sentences is not None:
        fd[self.target_tokens] = pad_batch(
            list(sentences), self.max_output_len, self.add_start_symbol,
            self.add_end_symbol)

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    if sentences is not None:
        label_tensors, _ = self.vocabulary.sentences_to_tensor(
            list(sentences), self.max_output_len)
        fd[self.gt_inputs[0]] = label_tensors[0]

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    if sentences is not None:
        vectors, _ = self.vocabulary.sentences_to_tensor(
            list(sentences), pad_to_max_len=False, train_mode=train)
        fd[self.train_targets] = vectors.T

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    if sentences is not None:
        labels = [l[0] for l in pad_batch(list(sentences),
                                          self.max_output_len)]
        fd[self.targets] = labels

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Populate the feed dictionary with the encoder inputs.

    Arguments:
        dataset: The dataset to use
        train: Boolean flag telling whether it is training time
    """
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.get_series(self.data_id)
    fd[self.input_tokens] = pad_batch(list(sentences), self.max_input_len)

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)
    if sentences is None and train:
        raise ValueError("You must feed reference sentences when training")

    if sentences is not None:
        fd[self.target_tokens] = pad_batch(list(sentences),
                                           self.max_length)

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Populate the feed dictionary for the decoder object.

    Arguments:
        dataset: The dataset to use for the decoder.
        train: Boolean flag, telling whether this is a training run.
    """
    fd = ModelPart.feed_dict(self, dataset, train)

    sentences = dataset.maybe_get_series(self.data_id)

    if sentences is None and train:
        raise ValueError("When training, you must feed "
                         "reference sentences")

    if sentences is not None:
        fd[self.train_tokens] = pad_batch(
            list(sentences), self.max_output_len,
            add_start_symbol=False, add_end_symbol=True)

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    """Feed the placeholders with the data.

    Arguments:
        dataset: The dataset.
        train: A flag whether the train mode is enabled.

    Returns:
        The constructed feed dictionary that contains the padded data for
        each input factor.
    """
    fd = ModelPart.feed_dict(self, dataset, train)

    # fill in the padded data for each factor placeholder
    for factor_plc, name in zip(self.input_factors, self.data_ids):
        sentences = dataset.get_series(name)
        fd[factor_plc] = pad_batch(
            list(sentences), self.max_length, self.add_start_symbol,
            self.add_end_symbol)

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)

    series = list(dataset.get_series(self.data_id))
    lengths = []
    inputs = []

    max_len = max(x.shape[0] for x in series)
    if self.max_input_len is not None:
        max_len = min(self.max_input_len, max_len)

    for x in series:
        length = min(max_len, x.shape[0])
        x_padded = np.zeros(shape=(max_len,) + x.shape[1:],
                            dtype=x.dtype)
        x_padded[:length] = x[:length]

        lengths.append(length)
        inputs.append(x_padded)

    fd[self.temporal_states] = inputs
    fd[self._input_lengths] = lengths

    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)
    fd[self.spatial_input] = list(dataset.get_series(self.data_id))
    return fd
def feed_dict(self, dataset: Dataset, train: bool = False) -> FeedDict:
    fd = ModelPart.feed_dict(self, dataset, train)
    fd[self.vector] = dataset.get_series(self.data_id)
    return fd