Python CorpusGenerator Examples, kashgari.generators.CorpusGenerator Python Examples

Example #1

0

Show file

File: abc_model.py Project: sfwyly/Kashgari

    def fit(self,
            x_train: TextSamplesVar,
            y_train: TextSamplesVar,
            x_validate: TextSamplesVar = None,
            y_validate: TextSamplesVar = None,
            batch_size: int = 64,
            epochs: int = 5,
            callbacks: List[tf.keras.callbacks.Callback] = None,
            fit_kwargs: Dict = None) -> 'tf.keras.callbacks.History':
        """
        Trains the model for a given number of epochs with given data set list.

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
            batch_size: Number of samples per gradient update, default to 64.
            epochs: Number of epochs to train the model.
                An epoch is an iteration over the entire `x` and `y` data provided.
            callbacks: List of `tf.keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
                See :py:class:`tf.keras.callbacks`.
            fit_kwargs: fit_kwargs: additional arguments passed to :meth:`tf.keras.Model.fit`

        Returns:
            A :py:class:`tf.keras.callback.History`  object. Its `History.history` attribute is
            a record of training loss values and metrics values
            at successive epochs, as well as validation loss values
            and validation metrics values (if applicable).
        """
        train_gen = CorpusGenerator(x_train, y_train)
        if x_validate is not None:
            valid_gen = CorpusGenerator(x_validate, y_validate)
        else:
            valid_gen = None
        return self.fit_generator(train_sample_gen=train_gen,
                                  valid_sample_gen=valid_gen,
                                  batch_size=batch_size,
                                  epochs=epochs,
                                  callbacks=callbacks,
                                  fit_kwargs=fit_kwargs)

Example #2

0

Show file

File: model.py Project: zhengxin2016/Kashgari

    def fit(self,
            x_train: TextSamplesVar,
            y_train: TextSamplesVar,
            *,
            batch_size: int = 64,
            epochs: int = 5,
            callbacks: List[tf.keras.callbacks.Callback] = None) -> tf.keras.callbacks.History:
        train_gen = CorpusGenerator(x_train, y_train)
        self.build_model_generator(train_gen)

        train_dataset = Seq2SeqDataSet(train_gen,
                                       batch_size=batch_size,
                                       encoder_processor=self.encoder_processor,
                                       encoder_seq_length=self.encoder_seq_length,
                                       decoder_processor=self.decoder_processor,
                                       decoder_seq_length=self.decoder_seq_length)

        if callbacks is None:
            callbacks = []
        history_callback = tf.keras.callbacks.History()
        callbacks.append(history_callback)

        for c in callbacks:
            c.set_model(self)
            c.on_train_begin()

        for epoch in range(epochs):
            for c in callbacks:
                c.on_epoch_begin(epoch=epoch)
            enc_hidden = tf.zeros((batch_size, self.hidden_size))
            total_loss = []

            with tqdm.tqdm(total=len(train_dataset)) as p_bar:
                for (inputs, targets) in train_dataset.take():
                    p_bar.update(1)
                    batch_loss = self.train_step(inputs, targets, enc_hidden)
                    total_loss.append(batch_loss.numpy())
                    info = f"Epoch {epoch + 1}/{epochs} | Epoch Loss: {np.mean(total_loss):.4f} " \
                           f"Batch Loss: {batch_loss.numpy():.4f}"
                    p_bar.set_description_str(info)
            logs = {'loss': np.mean(total_loss)}
            for c in callbacks:
                c.on_epoch_end(epoch=epoch, logs=logs)

        return history_callback

Example #3

0

Show file

File: abc_model.py Project: sfwyly/Kashgari

    def build_model(self, x_data: TextSamplesVar,
                    y_data: TextSamplesVar) -> None:
        """
        Build Model with x_data and y_data

        This function will setup a :class:`CorpusGenerator`,
         then call :meth:`ABCClassificationModel.build_model_gen` for preparing processor and model

        Args:
            x_data:
            y_data:

        Returns:

        """

        train_gen = CorpusGenerator(x_data, y_data)
        self.build_model_generator([train_gen])

Example #4

0

Show file

    def test_multi_label_processor(self):
        from kashgari.corpus import JigsawToxicCommentCorpus
        file_path = TestMacros.jigsaw_mini_corpus_path
        corpus = JigsawToxicCommentCorpus(file_path)
        x_set, y_set = corpus.load_data()

        corpus_gen = CorpusGenerator(x_set, y_set)

        processor = ClassificationProcessor(multi_label=True)
        processor.build_vocab_generator([corpus_gen])
        transformed_idx = processor.transform(y_set[20:40])

        info_dict = processor.to_dict()

        p2: ClassificationProcessor = load_data_object(info_dict)
        assert (transformed_idx == p2.transform(y_set[20:40])).all()

        x1s = y_set[20:40]
        x2s = p2.inverse_transform(transformed_idx)
        for sample_x1, sample_x2 in zip(x1s, x2s):
            assert sorted(sample_x1) == sorted(sample_x2)

Example #5

0

Show file

File: test_generator.py Project: sfwyly/Kashgari

    def test_batch_generator(self):
        x, y = ChineseDailyNerCorpus.load_data('valid')

        text_processor = SequenceProcessor()
        label_processor = SequenceProcessor(build_vocab_from_labels=True,
                                            min_count=1)

        corpus_gen = CorpusGenerator(x, y)

        text_processor.build_vocab_generator([corpus_gen])
        label_processor.build_vocab_generator([corpus_gen])

        batch_dataset1 = BatchDataSet(corpus_gen,
                                      text_processor=text_processor,
                                      label_processor=label_processor,
                                      segment=False,
                                      seq_length=None,
                                      max_position=100,
                                      batch_size=12)

        duplicate_len = len(batch_dataset1)
        assert len(list(batch_dataset1.take(duplicate_len))) == duplicate_len
        assert len(list(batch_dataset1.take(1))) == 1

Example #6

0

Show file

 def build_vocab(self, x_data: TextSamplesVar,
                 y_data: TextSamplesVar) -> None:
     corpus_gen = CorpusGenerator(x_data, y_data)
     self.build_vocab_generator([corpus_gen])

Example #7

0

Show file

File: test_generator.py Project: sfwyly/Kashgari

 def test_corpus_generator(self):
     x_set, y_set = TestMacros.load_labeling_corpus()
     corpus_gen = CorpusGenerator(x_set, y_set)
     pass

Example #8

0

Show file

File: model.py Project: zhengxin2016/Kashgari

 def build_model(self,
                 x_train: TextSamplesVar,
                 y_train: TextSamplesVar) -> None:
     train_gen = CorpusGenerator(x_train, y_train)
     self.build_model_generator(train_gen)