Example #1
0
    def load_model(
        cls, model_path: str
    ) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
        """Restore a saved model from the directory ``model_path``.

        Reads ``model_config.json``, rebuilds the model object together with
        its embedding, text processor and label processor via
        ``load_data_object``, recreates the Keras model from the stored
        ``tf_model`` JSON, and finally loads ``model_weights.h5`` and
        ``embed_model_weights.h5`` from the same directory.

        Args:
            model_path: Directory holding the config and weight files.

        Returns:
            The restored model instance.
        """
        model_config_path = os.path.join(model_path, 'model_config.json')
        # Use a context manager so the config file handle is always closed.
        with open(model_config_path, 'r') as config_file:
            model_config = json.load(config_file)
        model = load_data_object(model_config)

        model.embedding = load_data_object(model_config['embedding'])
        model.text_processor = load_data_object(model_config['text_processor'])
        model.label_processor = load_data_object(
            model_config['label_processor'])

        # model_from_json expects a JSON string, so re-serialize the sub-dict.
        tf_model_str = json.dumps(model_config['tf_model'])

        model.tf_model = tf.keras.models.model_from_json(
            tf_model_str, custom_objects=kashgari.custom_objects)

        # Keep a direct handle on the final layer when it is a CRF layer.
        if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
            model.layer_crf = model.tf_model.layers[-1]

        model.tf_model.load_weights(
            os.path.join(model_path, 'model_weights.h5'))
        model.embedding.embed_model.load_weights(
            os.path.join(model_path, 'embed_model_weights.h5'))
        return model
Example #2
0
    def load_model(cls, model_path: str) -> 'Seq2Seq':
        """Restore a saved seq2seq model from the directory ``model_path``.

        Reads ``model_config.json``, rebuilds the model object with its
        encoder/decoder processors and embeddings via ``load_data_object``,
        builds the encoder and decoder, and loads the four ``.h5`` weight
        files stored in the same directory.

        Args:
            model_path: Directory holding the config and weight files.

        Returns:
            The restored model instance.
        """
        from kashgari.utils import load_data_object
        model_config_path = os.path.join(model_path, 'model_config.json')
        # Use a context manager so the config file handle is always closed.
        with open(model_config_path, 'r') as config_file:
            model_config = json.load(config_file)
        model = load_data_object(model_config)

        # Load processors and embeddings
        model.encoder_processor = load_data_object(model_config['encoder_processor'])
        model.decoder_processor = load_data_object(model_config['decoder_processor'])
        model.encoder_embedding = load_data_object(model_config['encoder_embedding'])
        model.decoder_embedding = load_data_object(model_config['decoder_embedding'])

        model._build_encoder_decoder()
        # Load Model Weights
        model.encoder_embedding.embed_model.load_weights(os.path.join(model_path, 'encoder_embed_weights.h5'))
        model.decoder_embedding.embed_model.load_weights(os.path.join(model_path, 'decoder_embed_weights.h5'))

        # ------ Fix Start -------
        # load model issue on TF 2.3
        # Unable to load weights saved in HDF5 format into a subclassed Model which has not created its variables yet.
        # Call the Model first, then load the weights.
        # The inputs below are placeholders used only for this warm-up call;
        # its outputs are discarded.
        input_seq = model.encoder_processor.transform([['hello']],
                                                      seq_length=model.encoder_seq_length)
        dec_input = tf.expand_dims([3], 0)
        enc_hidden = tf.zeros((1, model.hidden_size))
        dec_hidden = enc_hidden
        enc_output, enc_hidden = model.encoder(input_seq, enc_hidden)
        _ = model.decoder(dec_input, dec_hidden, enc_output)
        # ------ Fix End -------

        model.encoder.load_weights(os.path.join(model_path, 'encoder_weights.h5'))
        model.decoder.load_weights(os.path.join(model_path, 'decoder_weights.h5'))

        return model
Example #3
0
    def test_processor(self):
        """Check that a ``ClassificationProcessor`` survives a dict round trip.

        A processor is fitted on the classification corpus, exported with
        ``to_dict`` and rebuilt with ``load_data_object``; the rebuilt
        processor must transform labels identically and invert them back.
        """
        corpus_x, corpus_y = TestMacros.load_classification_corpus()
        window = slice(20, 40)

        fitted = ClassificationProcessor()
        fitted.build_vocab(corpus_x, corpus_y)
        expected_idx = fitted.transform(corpus_y[window])

        exported = fitted.to_dict()

        restored: ClassificationProcessor = load_data_object(exported)
        assert (expected_idx == restored.transform(corpus_y[window])).all()
        assert corpus_y[window] == restored.inverse_transform(expected_idx)
Example #4
0
    def load_model(
        cls,
        model_path: str,
        custom_objects: Dict = None,
        encoding: str = 'utf-8'
    ) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
        """Restore a saved model from the directory ``model_path``.

        Reads ``model_config.json``, rebuilds the model object together with
        its embedding, text processor and label processor via
        ``load_data_object``, recreates the Keras model from the stored
        ``tf_model`` JSON, and finally loads ``model_weights.h5`` and
        ``embed_model_weights.h5`` from the same directory.

        Args:
            model_path: Directory holding the config and weight files.
            custom_objects: Optional mapping passed to ``load_data_object``.
                The mapping is copied, so the caller's dict is not modified.
            encoding: Text encoding used to read ``model_config.json``.

        Returns:
            The restored model instance.
        """
        # Work on a private copy so registering ``cls`` below does not leak
        # into the dict owned by the caller.
        custom_objects = dict(custom_objects) if custom_objects is not None else {}
        custom_objects.setdefault(cls.__name__, cls)

        model_config_path = os.path.join(model_path, 'model_config.json')
        # Use a context manager so the config file handle is always closed.
        with open(model_config_path, 'r', encoding=encoding) as config_file:
            model_config = json.load(config_file)
        model = load_data_object(model_config, custom_objects)

        model.embedding = load_data_object(model_config['embedding'],
                                           custom_objects)
        model.text_processor = load_data_object(model_config['text_processor'],
                                                custom_objects)
        model.label_processor = load_data_object(
            model_config['label_processor'], custom_objects)

        # model_from_json expects a JSON string, so re-serialize the sub-dict.
        tf_model_str = json.dumps(model_config['tf_model'])

        # NOTE(review): the Keras model is rebuilt with kashgari.custom_objects
        # only; the caller-supplied mapping is not forwarded here - confirm
        # that this is intended.
        model.tf_model = tf.keras.models.model_from_json(
            tf_model_str, custom_objects=kashgari.custom_objects)

        # Keep a direct handle on the final layer when it is a CRF layer.
        if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
            model.crf_layer = model.tf_model.layers[-1]

        model.tf_model.load_weights(
            os.path.join(model_path, 'model_weights.h5'))
        model.embedding.embed_model.load_weights(
            os.path.join(model_path, 'embed_model_weights.h5'))
        return model
Example #5
0
    def test_text_processor(self):
        """Check that a ``SequenceProcessor`` survives a dict round trip.

        Five random sentences from the labeling corpus are transformed by the
        fitted processor and by a copy rebuilt from ``to_dict``; both must
        give the same indices and both must invert them back to the samples.
        """
        corpus_x, corpus_y = TestMacros.load_labeling_corpus()
        picked = random.sample(corpus_x, 5)

        fitted = SequenceProcessor(min_count=1)
        fitted.build_vocab(corpus_x, corpus_y)
        fitted_idx = fitted.transform(picked)

        exported = fitted.to_dict()
        restored: SequenceProcessor = load_data_object(exported)

        restored_idx = restored.transform(picked)
        picked_lengths = list(map(len, picked))

        assert (restored_idx == fitted_idx).all()
        # Each processor must be able to undo its own transform.
        for proc, idx in ((fitted, fitted_idx), (restored, restored_idx)):
            assert proc.inverse_transform(idx, lengths=picked_lengths) == picked
Example #6
0
    def test_multi_label_processor(self):
        """Check a multi-label ``ClassificationProcessor`` dict round trip.

        The processor is fitted from a ``CorpusGenerator`` over the mini
        Jigsaw corpus, exported with ``to_dict`` and rebuilt with
        ``load_data_object``. Inverted labels are compared per sample after
        sorting, so label order within a sample does not matter.
        """
        from kashgari.corpus import JigsawToxicCommentCorpus
        corpus = JigsawToxicCommentCorpus(TestMacros.jigsaw_mini_corpus_path)
        corpus_x, corpus_y = corpus.load_data()

        generator = CorpusGenerator(corpus_x, corpus_y)
        window = slice(20, 40)

        fitted = ClassificationProcessor(multi_label=True)
        fitted.build_vocab_generator([generator])
        expected_idx = fitted.transform(corpus_y[window])

        exported = fitted.to_dict()

        restored: ClassificationProcessor = load_data_object(exported)
        assert (expected_idx == restored.transform(corpus_y[window])).all()

        original_labels = corpus_y[window]
        recovered_labels = restored.inverse_transform(expected_idx)
        for wanted, got in zip(original_labels, recovered_labels):
            assert sorted(wanted) == sorted(got)
Example #7
0
    def test_base_cases(self):
        """Check embedding output shape before and after a dict round trip.

        The expected sequence length is ``embedding.max_position`` when that
        is set, otherwise the longest sample length plus 2.
        """
        embedding = self.build_embedding()
        x, y = SMP2018ECDTCorpus.load_data()
        text_processor = SequenceProcessor()
        text_processor.build_vocab(x, y)
        embedding.setup_text_processor(text_processor)

        samples = random.sample(x, sample_count)
        embedded = embedding.embed(samples)
        longest = max(len(sample) for sample in samples)

        if embedding.max_position is None:
            expected_len = longest + 2
        else:
            expected_len = embedding.max_position

        expected_shape = (len(samples), expected_len, embedding.embedding_size)
        assert embedded.shape == expected_shape

        # Test Save And Load
        restored = load_data_object(embedding.to_dict())
        restored.setup_text_processor(text_processor)
        assert restored.embed(samples).shape == expected_shape
Example #8
0
    def test_label_processor(self):
        """Check a label-vocabulary ``SequenceProcessor`` dict round trip.

        Twenty random label sequences are transformed by the fitted processor
        and by a copy rebuilt from ``to_dict``; the indices must match and the
        rebuilt processor must invert both index arrays back to the samples.
        A final check verifies that ``seq_length=20`` yields rows of length 20.
        """
        corpus_x, corpus_y = TestMacros.load_labeling_corpus()
        fitted = SequenceProcessor(build_vocab_from_labels=True,
                                   min_count=1)
        fitted.build_vocab(corpus_x, corpus_y)

        samples = random.sample(corpus_y, 20)

        fitted_idx = fitted.transform(samples)

        exported = fitted.to_dict()

        restored: SequenceProcessor = load_data_object(exported)

        restored_idx = restored.transform(samples)
        sample_lengths = list(map(len, samples))
        assert (restored_idx == fitted_idx).all()
        # The rebuilt processor must invert both index arrays.
        for idx in (fitted_idx, restored_idx):
            assert restored.inverse_transform(idx,
                                              lengths=sample_lengths) == samples

        fixed_len_idx = fitted.transform(samples, seq_length=20)
        assert list(map(len, fixed_len_idx)) == [20] * len(fixed_len_idx)