def load_model(
    cls,
    model_path: str
) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
    """Rebuild a saved model from the files stored under ``model_path``.

    Reads ``model_config.json``, restores the model object together with its
    embedding, text processor and label processor via ``load_data_object``,
    recreates the Keras model from the ``tf_model`` entry of the config, and
    then loads ``model_weights.h5`` and ``embed_model_weights.h5``.

    Args:
        cls: Receiver of the call; not referenced by the body.
        model_path: Directory containing the config and weight files.

    Returns:
        The restored model object with its weights loaded.

    Raises:
        FileNotFoundError: If ``model_config.json`` is missing from
            ``model_path``.
    """
    model_config_path = os.path.join(model_path, 'model_config.json')
    # Use a context manager so the config file handle is always closed.
    with open(model_config_path, 'r') as config_file:
        model_config = json.load(config_file)

    model = load_data_object(model_config)
    model.embedding = load_data_object(model_config['embedding'])
    model.text_processor = load_data_object(model_config['text_processor'])
    model.label_processor = load_data_object(model_config['label_processor'])

    # model_from_json expects a JSON string, so re-serialize the sub-config.
    tf_model_str = json.dumps(model_config['tf_model'])
    model.tf_model = tf.keras.models.model_from_json(
        tf_model_str,
        custom_objects=kashgari.custom_objects)

    # Keep a direct handle on the CRF layer when it is the output layer.
    # NOTE(review): the attribute is `layer_crf` here; confirm it matches the
    # name the model class actually reads.
    if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
        model.layer_crf = model.tf_model.layers[-1]

    model.tf_model.load_weights(
        os.path.join(model_path, 'model_weights.h5'))
    model.embedding.embed_model.load_weights(
        os.path.join(model_path, 'embed_model_weights.h5'))
    return model
def load_model(cls, model_path: str) -> 'Seq2Seq':
    """Rebuild a saved Seq2Seq model from the files stored under ``model_path``.

    Reads ``model_config.json``, restores the model object, its encoder and
    decoder processors and embeddings via ``load_data_object``, builds the
    encoder/decoder, and loads the four ``*.h5`` weight files.

    Args:
        cls: Receiver of the call; not referenced by the body.
        model_path: Directory containing the config and weight files.

    Returns:
        The restored model with embedding, encoder and decoder weights loaded.
    """
    from kashgari.utils import load_data_object

    model_config_path = os.path.join(model_path, 'model_config.json')
    # Use a context manager so the config file handle is always closed.
    with open(model_config_path, 'r') as config_file:
        model_config = json.load(config_file)
    model = load_data_object(model_config)

    # Load processors and embeddings
    model.encoder_processor = load_data_object(model_config['encoder_processor'])
    model.decoder_processor = load_data_object(model_config['decoder_processor'])
    model.encoder_embedding = load_data_object(model_config['encoder_embedding'])
    model.decoder_embedding = load_data_object(model_config['decoder_embedding'])

    model._build_encoder_decoder()

    # Load embedding weights
    model.encoder_embedding.embed_model.load_weights(
        os.path.join(model_path, 'encoder_embed_weights.h5'))
    model.decoder_embedding.embed_model.load_weights(
        os.path.join(model_path, 'decoder_embed_weights.h5'))

    # ------ Fix Start -------
    # Load-model issue on TF 2.3:
    # "Unable to load weights saved in HDF5 format into a subclassed Model
    # which has not created its variables yet."
    # Call the encoder and decoder once on dummy inputs so their variables
    # exist, then load the weights.
    input_seq = model.encoder_processor.transform(
        [['hello']],
        seq_length=model.encoder_seq_length)
    # NOTE(review): 3 looks like an arbitrary placeholder token id - confirm.
    dec_input = tf.expand_dims([3], 0)
    enc_hidden = tf.zeros((1, model.hidden_size))
    dec_hidden = enc_hidden
    enc_output, enc_hidden = model.encoder(input_seq, enc_hidden)
    _ = model.decoder(dec_input, dec_hidden, enc_output)
    # ------ Fix End -------

    model.encoder.load_weights(os.path.join(model_path, 'encoder_weights.h5'))
    model.decoder.load_weights(os.path.join(model_path, 'decoder_weights.h5'))

    return model
def test_processor(self):
    """Check that a ClassificationProcessor survives a to_dict/load round trip.

    The restored processor must transform a slice of labels to the same
    indices as the original and map those indices back to the same labels.
    """
    x_set, y_set = TestMacros.load_classification_corpus()
    label_slice = y_set[20:40]

    original = ClassificationProcessor()
    original.build_vocab(x_set, y_set)
    expected_idx = original.transform(label_slice)

    # Serialize the processor and rebuild it from the plain dict.
    restored: ClassificationProcessor = load_data_object(original.to_dict())

    restored_idx = restored.transform(label_slice)
    assert (expected_idx == restored_idx).all()
    assert label_slice == restored.inverse_transform(expected_idx)
def load_model(
    cls,
    model_path: str,
    custom_objects: Dict = None,
    encoding: str = 'utf-8'
) -> Union["ABCLabelingModel", "ABCClassificationModel"]:
    """Rebuild a saved model from the files stored under ``model_path``.

    Reads ``model_config.json``, restores the model object together with its
    embedding, text processor and label processor via ``load_data_object``,
    recreates the Keras model from the ``tf_model`` entry of the config, and
    then loads ``model_weights.h5`` and ``embed_model_weights.h5``.

    Args:
        cls: Class the loader is invoked on. It is registered in
            ``custom_objects`` under ``cls.__name__`` when that key is absent.
        model_path: Directory containing the config and weight files.
        custom_objects: Optional mapping handed to ``load_data_object``.
            When provided, it is updated in place with the ``cls`` entry.
            The Keras model itself is deserialized with
            ``kashgari.custom_objects``, not with this mapping.
        encoding: Text encoding used to read ``model_config.json``.

    Returns:
        The restored model object with its weights loaded.

    Raises:
        FileNotFoundError: If ``model_config.json`` is missing from
            ``model_path``.
    """
    if custom_objects is None:
        custom_objects = {}

    # Make sure the calling class itself can be resolved by name.
    if cls.__name__ not in custom_objects:
        custom_objects[cls.__name__] = cls

    model_config_path = os.path.join(model_path, 'model_config.json')
    # Use a context manager so the config file handle is always closed.
    with open(model_config_path, 'r', encoding=encoding) as config_file:
        model_config = json.load(config_file)

    model = load_data_object(model_config, custom_objects)
    model.embedding = load_data_object(model_config['embedding'],
                                       custom_objects)
    model.text_processor = load_data_object(model_config['text_processor'],
                                            custom_objects)
    model.label_processor = load_data_object(model_config['label_processor'],
                                             custom_objects)

    # model_from_json expects a JSON string, so re-serialize the sub-config.
    tf_model_str = json.dumps(model_config['tf_model'])
    model.tf_model = tf.keras.models.model_from_json(
        tf_model_str,
        custom_objects=kashgari.custom_objects)

    # Keep a direct handle on the CRF layer when it is the output layer.
    if isinstance(model.tf_model.layers[-1], KConditionalRandomField):
        model.crf_layer = model.tf_model.layers[-1]

    model.tf_model.load_weights(
        os.path.join(model_path, 'model_weights.h5'))
    model.embedding.embed_model.load_weights(
        os.path.join(model_path, 'embed_model_weights.h5'))
    return model
def test_text_processor(self):
    """Check that a SequenceProcessor survives a to_dict/load round trip.

    Five random sentences are transformed by both the original and the
    restored processor; the indices must match, and each processor must map
    its indices back to the sampled sentences.
    """
    x_set, y_set = TestMacros.load_labeling_corpus()
    sentences = random.sample(x_set, 5)

    original = SequenceProcessor(min_count=1)
    original.build_vocab(x_set, y_set)
    original_idx = original.transform(sentences)

    # Serialize the processor and rebuild it from the plain dict.
    restored: SequenceProcessor = load_data_object(original.to_dict())
    restored_idx = restored.transform(sentences)

    # True token counts, passed to inverse_transform as `lengths`.
    sentence_lengths = list(map(len, sentences))

    assert (restored_idx == original_idx).all()

    decoded_original = original.inverse_transform(
        original_idx, lengths=sentence_lengths)
    assert decoded_original == sentences

    decoded_restored = restored.inverse_transform(
        restored_idx, lengths=sentence_lengths)
    assert decoded_restored == sentences
def test_multi_label_processor(self):
    """Check a multi-label ClassificationProcessor to_dict/load round trip.

    The vocabulary is built from a generator over the mini Jigsaw corpus.
    The restored processor must produce the same indices as the original,
    and decoding must give back the same labels per sample, ignoring order.
    """
    from kashgari.corpus import JigsawToxicCommentCorpus

    corpus = JigsawToxicCommentCorpus(TestMacros.jigsaw_mini_corpus_path)
    x_set, y_set = corpus.load_data()
    label_slice = y_set[20:40]

    generator = CorpusGenerator(x_set, y_set)
    original = ClassificationProcessor(multi_label=True)
    original.build_vocab_generator([generator])
    expected_idx = original.transform(label_slice)

    # Serialize the processor and rebuild it from the plain dict.
    restored: ClassificationProcessor = load_data_object(original.to_dict())
    assert (expected_idx == restored.transform(label_slice)).all()

    decoded = restored.inverse_transform(expected_idx)
    # Compare sorted label lists so per-sample label order is ignored.
    for expected_labels, decoded_labels in zip(label_slice, decoded):
        assert sorted(expected_labels) == sorted(decoded_labels)
def test_base_cases(self):
    """Check the embedding output shape before and after a save/load cycle.

    Embeds a random sample of corpus sentences and verifies the result is
    ``(n_samples, sequence_length, embedding_size)``, where the sequence
    length is ``embedding.max_position`` when set, otherwise the longest
    sample plus two.
    """
    embedding = self.build_embedding()
    x, y = SMP2018ECDTCorpus.load_data()

    processor = SequenceProcessor()
    processor.build_vocab(x, y)
    embedding.setup_text_processor(processor)

    samples = random.sample(x, sample_count)
    embedded = embedding.embed(samples)

    # NOTE(review): the +2 presumably accounts for two added marker tokens
    # (e.g. BOS/EOS) - confirm against the processor.
    longest = max(map(len, samples))
    expected_len = (embedding.max_position
                    if embedding.max_position is not None
                    else longest + 2)
    expected_shape = (len(samples), expected_len, embedding.embedding_size)

    assert embedded.shape == expected_shape

    # Test Save And Load
    restored = load_data_object(embedding.to_dict())
    restored.setup_text_processor(processor)
    assert restored.embed(samples).shape == expected_shape
def test_label_processor(self):
    """Check a label-vocabulary SequenceProcessor to_dict/load round trip.

    Twenty random label sequences are transformed by the original and the
    restored processor; the indices must match and each processor must
    decode its own indices back to the sampled sequences. Finally,
    transforming with ``seq_length=20`` must yield rows of length 20.
    """
    x_set, y_set = TestMacros.load_labeling_corpus()
    text_processor = SequenceProcessor(build_vocab_from_labels=True,
                                       min_count=1)
    text_processor.build_vocab(x_set, y_set)

    samples = random.sample(y_set, 20)
    text_idx = text_processor.transform(samples)

    # Serialize the processor and rebuild it from the plain dict.
    text_info_dict = text_processor.to_dict()
    text_processor2: SequenceProcessor = load_data_object(text_info_dict)
    text_idx2 = text_processor2.transform(samples)

    lengths = [len(i) for i in samples]
    assert (text_idx2 == text_idx).all()
    # Each processor decodes its own indices, so the original processor's
    # round trip is checked as well as the restored one's.
    assert text_processor.inverse_transform(text_idx, lengths=lengths) == samples
    assert text_processor2.inverse_transform(text_idx2, lengths=lengths) == samples

    text_idx3 = text_processor.transform(samples, seq_length=20)
    assert [len(i) for i in text_idx3] == [20] * len(text_idx3)