def fit(self,
        x_train: TextSamplesVar,
        y_train: TextSamplesVar,
        x_validate: TextSamplesVar = None,
        y_validate: TextSamplesVar = None,
        batch_size: int = 64,
        epochs: int = 5,
        callbacks: List[tf.keras.callbacks.Callback] = None,
        fit_kwargs: Dict = None) -> 'tf.keras.callbacks.History':
    """
    Train the model on in-memory samples for a fixed number of epochs.

    The samples are wrapped in :class:`CorpusGenerator` objects and the
    actual training is delegated to :meth:`fit_generator`.

    Args:
        x_train: Array of train feature data (if the model has a single input),
            or tuple of train feature data array (if the model has multiple inputs)
        y_train: Array of train label data
        x_validate: Array of validation feature data (if the model has a single input),
            or tuple of validation feature data array (if the model has multiple inputs).
            When ``None``, no validation generator is created.
        y_validate: Array of validation label data
        batch_size: Number of samples per gradient update, default to 64.
        epochs: Number of epochs to train the model. An epoch is an iteration
            over the entire `x` and `y` data provided.
        callbacks: List of `tf.keras.callbacks.Callback` instances to apply
            during training. See :py:class:`tf.keras.callbacks`.
        fit_kwargs: additional arguments passed to :meth:`tf.keras.Model.fit`

    Returns:
        Whatever :meth:`fit_generator` returns; annotated as a
        :py:class:`tf.keras.callbacks.History` object, whose `History.history`
        attribute records training loss/metric values at successive epochs,
        plus validation loss/metric values when applicable.
    """
    # Validation data is optional: only wrap it when features were supplied.
    validation_source = (CorpusGenerator(x_validate, y_validate)
                         if x_validate is not None
                         else None)

    return self.fit_generator(train_sample_gen=CorpusGenerator(x_train, y_train),
                              valid_sample_gen=validation_source,
                              batch_size=batch_size,
                              epochs=epochs,
                              callbacks=callbacks,
                              fit_kwargs=fit_kwargs)
def fit(self,
        x_train: TextSamplesVar,
        y_train: TextSamplesVar,
        *,
        batch_size: int = 64,
        epochs: int = 5,
        callbacks: List[tf.keras.callbacks.Callback] = None) -> tf.keras.callbacks.History:
    """
    Train the model with a hand-written epoch/batch loop.

    The samples are wrapped in a :class:`CorpusGenerator`, the model is built
    through :meth:`build_model_generator`, and batches are drawn from a
    :class:`Seq2SeqDataSet`. Each batch is passed to :meth:`train_step`.

    Args:
        x_train: Training input samples.
        y_train: Training target samples.
        batch_size: Batch size handed to the dataset; also the first dimension
            of the zero-initialised encoder hidden state.
        epochs: Number of passes over the training dataset.
        callbacks: Optional callbacks. ``set_model``, ``on_train_begin``,
            ``on_epoch_begin`` and ``on_epoch_end`` are invoked on each of
            them. The passed-in sequence itself is never modified.

    Returns:
        The :class:`tf.keras.callbacks.History` callback created for this run.
        It receives ``{'loss': <mean batch loss>}`` at the end of every epoch.
    """
    train_gen = CorpusGenerator(x_train, y_train)
    self.build_model_generator(train_gen)

    train_dataset = Seq2SeqDataSet(train_gen,
                                   batch_size=batch_size,
                                   encoder_processor=self.encoder_processor,
                                   encoder_seq_length=self.encoder_seq_length,
                                   decoder_processor=self.decoder_processor,
                                   decoder_seq_length=self.decoder_seq_length)

    # Work on a private copy: appending the History callback to the caller's
    # own list would leak state into later fit() calls that reuse that list.
    callbacks = list(callbacks) if callbacks is not None else []
    history_callback = tf.keras.callbacks.History()
    callbacks.append(history_callback)

    for c in callbacks:
        c.set_model(self)
        c.on_train_begin()

    # NOTE(review): `on_train_end` is never invoked on the callbacks; confirm
    # whether that is intentional before relying on callbacks that need it.
    for epoch in range(epochs):
        for c in callbacks:
            c.on_epoch_begin(epoch=epoch)
        # Fresh zero hidden state at the start of every epoch.
        enc_hidden = tf.zeros((batch_size, self.hidden_size))
        total_loss = []

        with tqdm.tqdm(total=len(train_dataset)) as p_bar:
            for (inputs, targets) in train_dataset.take():
                p_bar.update(1)
                batch_loss = self.train_step(inputs, targets, enc_hidden)
                # Convert the loss once and reuse it for both the running
                # mean and the progress-bar text.
                batch_loss_value = batch_loss.numpy()
                total_loss.append(batch_loss_value)
                info = f"Epoch {epoch + 1}/{epochs} | Epoch Loss: {np.mean(total_loss):.4f} " \
                       f"Batch Loss: {batch_loss_value:.4f}"
                p_bar.set_description_str(info)

        logs = {'loss': np.mean(total_loss)}
        for c in callbacks:
            c.on_epoch_end(epoch=epoch, logs=logs)

    return history_callback
def build_model(self,
                x_data: TextSamplesVar,
                y_data: TextSamplesVar) -> None:
    """
    Build the model from in-memory feature and label samples.

    Wraps the samples in a :class:`CorpusGenerator` and hands it, as a
    one-element list, to :meth:`build_model_generator`.

    Args:
        x_data: Feature samples.
        y_data: Label samples.
    """
    self.build_model_generator([CorpusGenerator(x_data, y_data)])
def test_multi_label_processor(self):
    """
    Check that a multi-label ClassificationProcessor round-trips.

    A processor rebuilt from ``to_dict()`` must transform labels to the same
    indices as the original, and ``inverse_transform`` must give back the
    original label sets (order-insensitive).
    """
    from kashgari.corpus import JigsawToxicCommentCorpus

    jigsaw_corpus = JigsawToxicCommentCorpus(TestMacros.jigsaw_mini_corpus_path)
    x_set, y_set = jigsaw_corpus.load_data()

    processor = ClassificationProcessor(multi_label=True)
    processor.build_vocab_generator([CorpusGenerator(x_set, y_set)])

    # Same window of samples is used for every comparison below.
    window = slice(20, 40)
    expected_idx = processor.transform(y_set[window])

    # Serialise the processor and restore it as a second instance.
    restored = load_data_object(processor.to_dict())
    assert (expected_idx == restored.transform(y_set[window])).all()

    original_labels = y_set[window]
    recovered_labels = restored.inverse_transform(expected_idx)
    for original, recovered in zip(original_labels, recovered_labels):
        assert sorted(original) == sorted(recovered)
def test_batch_generator(self):
    """
    Check that ``BatchDataSet.take(n)`` yields exactly ``n`` batches.

    Exercised with ``n == len(dataset)`` and with ``n == 1``.
    """
    x, y = ChineseDailyNerCorpus.load_data('valid')

    text_processor = SequenceProcessor()
    label_processor = SequenceProcessor(build_vocab_from_labels=True,
                                        min_count=1)

    corpus_gen = CorpusGenerator(x, y)
    generators = [corpus_gen]
    text_processor.build_vocab_generator(generators)
    label_processor.build_vocab_generator(generators)

    dataset = BatchDataSet(corpus_gen,
                           text_processor=text_processor,
                           label_processor=label_processor,
                           segment=False,
                           seq_length=None,
                           max_position=100,
                           batch_size=12)

    batch_count = len(dataset)
    full_pass = list(dataset.take(batch_count))
    assert len(full_pass) == batch_count

    single_batch = list(dataset.take(1))
    assert len(single_batch) == 1
def build_vocab(self,
                x_data: TextSamplesVar,
                y_data: TextSamplesVar) -> None:
    """
    Build the vocabulary from in-memory samples.

    Wraps the samples in a :class:`CorpusGenerator` and passes it, as a
    one-element list, to :meth:`build_vocab_generator`.

    Args:
        x_data: Feature samples.
        y_data: Label samples.
    """
    self.build_vocab_generator([CorpusGenerator(x_data, y_data)])
def test_corpus_generator(self):
    """Smoke test: a CorpusGenerator can be constructed from the labeling corpus."""
    features, labels = TestMacros.load_labeling_corpus()
    # Construction alone is the check; nothing is asserted on the result.
    CorpusGenerator(features, labels)
def build_model(self,
                x_train: TextSamplesVar,
                y_train: TextSamplesVar) -> None:
    """
    Build the model from in-memory training samples.

    Wraps the samples in a :class:`CorpusGenerator` and passes that single
    generator (not a list) to :meth:`build_model_generator`.

    Args:
        x_train: Training input samples.
        y_train: Training target samples.
    """
    self.build_model_generator(CorpusGenerator(x_train, y_train))