def __init__(self,
             embedding: ABCEmbedding = None,
             sequence_length: int = None,
             hyper_parameters: Dict[str, Dict[str, Any]] = None):
    """
    Args:
        embedding: embedding object
        sequence_length: target sequence length
        hyper_parameters: hyper_parameters to overwrite
    """
    super(ABCLabelingModel, self).__init__()
    if embedding is None:
        embedding = BareEmbedding()  # type: ignore
    if hyper_parameters is None:
        hyper_parameters = self.default_hyper_parameters()

    self.tf_model: Optional[tf.keras.Model] = None
    self.embedding = embedding
    self.hyper_parameters = hyper_parameters
    self.sequence_length = sequence_length
    self.text_processor: SequenceProcessor = SequenceProcessor()
    self.label_processor: SequenceProcessor = SequenceProcessor(
        build_in_vocab='labeling',
        min_count=1,
        build_vocab_from_labels=True)
    self.crf_layer: Optional[KConditionalRandomField] = None
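# --- Usage sketch (not part of the library source) ---------------------------
# A minimal illustration of how the constructor above might be used through a
# concrete subclass. `BiLSTM_Model` is a hypothetical subclass name, and the
# hyper-parameter key layout below is an assumption, not a documented value.
hyper = BiLSTM_Model.default_hyper_parameters()  # assumed to be a classmethod
hyper['layer_blstm']['units'] = 256  # assumed key layout

model = BiLSTM_Model(embedding=BareEmbedding(embedding_size=128),
                     sequence_length=100,
                     hyper_parameters=hyper)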
def test_batch_generator(self):
    text_processor = SequenceProcessor(segment=True)
    label_processor = SequenceProcessor(
        build_in_vocab='labeling',
        min_count=1,
        build_vocab_from_labels=True,
    )

    corpus_gen = MongoGenerator(db_name="spo",
                                mongo_url="mongodb://localhost:27017",
                                collection_name="test",
                                buffer_size=3200)
    text_processor.build_vocab_generator([corpus_gen])
    label_processor.build_vocab_generator([corpus_gen])

    batch_dataset1 = BatchDataSet(corpus_gen,
                                  text_processor=text_processor,
                                  label_processor=label_processor,
                                  segment=False,
                                  seq_length=100,
                                  max_position=100,
                                  batch_size=12)
    print(len(batch_dataset1))

    duplicate_len = 100
    batches_100 = list(batch_dataset1.take(duplicate_len))
    assert len(batches_100) == duplicate_len

    batches_1 = list(batch_dataset1.take(1))
    assert len(batches_1) == 1

    # take() without an argument presumably yields the full dataset once
    all_batches = list(batch_dataset1.take())
    print(len(all_batches))
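# --- Usage sketch (not part of the library source) ---------------------------
# The same pipeline without MongoDB. CorpusGenerator is used elsewhere in this
# code base to wrap in-memory (x, y) lists, so an in-memory variant of the
# test above should look roughly like this; the toy corpus is made up.
x = [['Hello', 'world'], ['Hello', 'COCO']]
y = [['O', 'O'], ['O', 'B-ORG']]
corpus_gen = CorpusGenerator(x, y)

text_processor = SequenceProcessor()
label_processor = SequenceProcessor(build_in_vocab='labeling',
                                    min_count=1,
                                    build_vocab_from_labels=True)
text_processor.build_vocab_generator([corpus_gen])
label_processor.build_vocab_generator([corpus_gen])

batch_dataset = BatchDataSet(corpus_gen,
                             text_processor=text_processor,
                             label_processor=label_processor,
                             segment=False,
                             seq_length=100,
                             max_position=100,
                             batch_size=2)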
def test_base_cases(self):
    sample_count = 10  # assumed value; not defined in the original snippet
    embedding = self.build_embedding()
    x, y = SMP2018ECDTCorpus.load_data()
    processor = SequenceProcessor()
    processor.build_vocab(x, y)
    embedding.setup_text_processor(processor)

    samples = random.sample(x, sample_count)
    res = embedding.embed(samples)
    # +2 accounts for the [BOS] and [EOS] tokens the processor adds
    max_len = max([len(i) for i in samples]) + 2
    if embedding.max_position is not None:
        max_len = embedding.max_position

    assert res.shape == (len(samples), max_len, embedding.embedding_size)

    # Test save and load round-trip
    embed_dict = embedding.to_dict()
    embedding2 = load_data_object(embed_dict)
    embedding2.setup_text_processor(processor)
    assert embedding2.embed(samples).shape == (len(samples), max_len,
                                               embedding.embedding_size)
def test_text_processor(self):
    x_set, y_set = TestMacros.load_labeling_corpus()
    x_samples = random.sample(x_set, 5)

    text_processor = SequenceProcessor(min_count=1)
    text_processor.build_vocab(x_set, y_set)
    text_idx = text_processor.transform(x_samples)

    # Save / load round-trip should preserve the mapping
    text_info_dict = text_processor.to_dict()
    text_processor2: SequenceProcessor = load_data_object(text_info_dict)
    text_idx2 = text_processor2.transform(x_samples)

    sample_lengths = [len(i) for i in x_samples]
    assert (text_idx2 == text_idx).all()
    assert text_processor.inverse_transform(
        text_idx, lengths=sample_lengths) == x_samples
    assert text_processor2.inverse_transform(
        text_idx2, lengths=sample_lengths) == x_samples
def __init__(self,
             embedding: ABCEmbedding = None,
             *,
             sequence_length: int = None,
             hyper_parameters: Dict[str, Dict[str, Any]] = None,
             multi_label: bool = False,
             text_processor: ABCProcessor = None,
             label_processor: ABCProcessor = None):
    """
    Args:
        embedding: embedding object
        sequence_length: target sequence length
        hyper_parameters: hyper_parameters to overwrite
        multi_label: is multi-label classification
        text_processor: text processor
        label_processor: label processor
    """
    super(ABCClassificationModel, self).__init__()
    if embedding is None:
        embedding = BareEmbedding()  # type: ignore
    if hyper_parameters is None:
        hyper_parameters = self.default_hyper_parameters()
    if text_processor is None:
        text_processor = SequenceProcessor()
    if label_processor is None:
        label_processor = ClassificationProcessor(multi_label=multi_label)

    self.tf_model: Optional[keras.Model] = None
    self.embedding = embedding
    self.hyper_parameters = hyper_parameters
    self.sequence_length = sequence_length
    self.multi_label = multi_label
    self.text_processor = text_processor
    self.label_processor = label_processor
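# --- Usage sketch (not part of the library source) ---------------------------
# How the classification constructor above might be used through a concrete
# subclass. `CNN_Model` is a hypothetical subclass name; everything else maps
# directly onto the parameters defined above.
model = CNN_Model(embedding=BareEmbedding(embedding_size=128),
                  sequence_length=128,
                  multi_label=True)  # selects ClassificationProcessor(multi_label=True)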
def test_label_processor(self):
    x_set, y_set = TestMacros.load_labeling_corpus()

    label_processor = SequenceProcessor(build_vocab_from_labels=True,
                                        min_count=1)
    label_processor.build_vocab(x_set, y_set)

    samples = random.sample(y_set, 20)
    label_idx = label_processor.transform(samples)

    # Save / load round-trip should preserve the mapping
    label_info_dict = label_processor.to_dict()
    label_processor2: SequenceProcessor = load_data_object(label_info_dict)
    label_idx2 = label_processor2.transform(samples)

    lengths = [len(i) for i in samples]
    assert (label_idx2 == label_idx).all()
    assert label_processor2.inverse_transform(label_idx,
                                              lengths=lengths) == samples
    assert label_processor2.inverse_transform(label_idx2,
                                              lengths=lengths) == samples

    # transform() should pad/truncate to the requested fixed length
    label_idx3 = label_processor.transform(samples, seq_length=20)
    assert [len(i) for i in label_idx3] == [20] * len(label_idx3)
class Seq2Seq:
    def to_dict(self) -> Dict[str, Any]:
        return {
            'tf_version': tf.__version__,  # type: ignore
            'coco_nlp_version': coco_nlp.__version__,
            '__class_name__': self.__class__.__name__,
            '__module__': self.__class__.__module__,
            'config': {
                'encoder_seq_length': self.encoder_seq_length,  # type: ignore
                'decoder_seq_length': self.decoder_seq_length,  # type: ignore
                'hidden_size': self.hidden_size
            },
            'encoder_embedding': self.encoder_embedding.to_dict(),  # type: ignore
            'decoder_embedding': self.decoder_embedding.to_dict(),
            'encoder_processor': self.encoder_processor.to_dict(),
            'decoder_processor': self.decoder_processor.to_dict(),
        }

    def __init__(self,
                 encoder_embedding: ABCEmbedding = None,
                 decoder_embedding: ABCEmbedding = None,
                 encoder_seq_length: int = None,
                 decoder_seq_length: int = None,
                 hidden_size: int = 1024,
                 **kwargs: Any):
        """
        Init Seq2Seq Model

        Args:
            encoder_embedding: embedding object for the encoder
            decoder_embedding: embedding object for the decoder
            encoder_seq_length: target encoder sequence length
            decoder_seq_length: target decoder sequence length
            hidden_size: hidden size of the GRU encoder and decoder
            **kwargs:
        """
        logger.warning(
            "Seq2Seq API is experimental. It may be changed in the future without notice."
        )
        if encoder_embedding is None:
            encoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore
        self.encoder_embedding = encoder_embedding

        if decoder_embedding is None:
            decoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore
        self.decoder_embedding = decoder_embedding

        self.encoder_processor = SequenceProcessor(min_count=1)
        self.decoder_processor = SequenceProcessor(build_vocab_from_labels=True,
                                                   min_count=1)

        self.encoder: Optional[GRUEncoder] = None
        self.decoder: Optional[AttGRUDecoder] = None
        self.hidden_size: int = hidden_size

        self.encoder_seq_length = encoder_seq_length
        self.decoder_seq_length = decoder_seq_length

        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')

    # @tf.function
    def loss_function(self, real: tf.Tensor, pred: tf.Tensor) -> tf.Tensor:
        # Mask out padding positions (token id 0) so they do not
        # contribute to the loss.
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = self.loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        return tf.reduce_mean(loss_)

    def build_model(self,
                    x_train: TextSamplesVar,
                    y_train: TextSamplesVar) -> None:
        train_gen = CorpusGenerator(x_train, y_train)
        self.build_model_generator(train_gen)

    def _build_encoder_decoder(self) -> None:
        self.encoder = GRUEncoder(self.encoder_embedding,
                                  hidden_size=self.hidden_size)
        self.decoder = AttGRUDecoder(
            self.decoder_embedding,
            hidden_size=self.hidden_size,
            vocab_size=self.decoder_processor.vocab_size)
        try:
            self.encoder.model().summary()
            self.decoder.model().summary()
        except Exception:
            # Summaries are informational only; subclassed models may not be
            # buildable before the first call.
            pass

    def build_model_generator(self, train_gen: CorpusGenerator) -> None:
        """
        Build the model with a data generator. This function will:

        1. setup the processors' vocabs if they are empty.
        2. calculate the encoder/decoder sequence lengths if they are ``None``.
        3. build the encoder and decoder networks.

        Args:
            train_gen: train data generator
        """
        if self.encoder is None:
            self.encoder_processor.build_vocab_generator([train_gen])
            self.decoder_processor.build_vocab_generator([train_gen])
            self.encoder_embedding.setup_text_processor(self.encoder_processor)
            self.decoder_embedding.setup_text_processor(self.decoder_processor)

            if self.encoder_seq_length is None:
                self.encoder_seq_length = self.encoder_embedding.get_seq_length_from_corpus(
                    [train_gen], cover_rate=1.0)
                logger.info(
                    f"calculated encoder sequence length: {self.encoder_seq_length}"
                )

            if self.decoder_seq_length is None:
                self.decoder_seq_length = self.decoder_embedding.get_seq_length_from_corpus(
                    [train_gen], use_label=True, cover_rate=1.0)
                logger.info(
                    f"calculated decoder sequence length: {self.decoder_seq_length}"
                )

            self._build_encoder_decoder()

    # @tf.function
    def train_step(self,  # type: ignore
                   input_seq,
                   target_seq,
                   enc_hidden):
        loss = 0
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = self.encoder(input_seq, enc_hidden)
            dec_hidden = enc_hidden
            # The decoder starts from the decoder vocab's [BOS] token,
            # consistent with predict() below.
            bos_token_id = self.decoder_processor.vocab2idx[
                self.decoder_processor.token_bos]
            dec_input = tf.expand_dims([bos_token_id] * target_seq.shape[0], 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, target_seq.shape[1]):
                # pass enc_output to the decoder
                predictions, dec_hidden, _ = self.decoder(dec_input,
                                                          dec_hidden,
                                                          enc_output)
                loss += self.loss_function(target_seq[:, t], predictions)
                # using teacher forcing
                dec_input = tf.expand_dims(target_seq[:, t], 1)

        batch_loss = (loss / int(target_seq.shape[1]))
        variables = self.encoder.trainable_variables + self.decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss

    def fit(self,
            x_train: TextSamplesVar,
            y_train: TextSamplesVar,
            *,
            batch_size: int = 64,
            epochs: int = 5,
            callbacks: List[tf.keras.callbacks.Callback] = None
            ) -> tf.keras.callbacks.History:
        train_gen = CorpusGenerator(x_train, y_train)
        self.build_model_generator(train_gen)

        train_dataset = Seq2SeqDataSet(train_gen,
                                       batch_size=batch_size,
                                       encoder_processor=self.encoder_processor,
                                       encoder_seq_length=self.encoder_seq_length,
                                       decoder_processor=self.decoder_processor,
                                       decoder_seq_length=self.decoder_seq_length)

        if callbacks is None:
            callbacks = []
        history_callback = tf.keras.callbacks.History()
        callbacks.append(history_callback)

        for c in callbacks:
            c.set_model(self)
            c.on_train_begin()

        for epoch in range(epochs):
            for c in callbacks:
                c.on_epoch_begin(epoch=epoch)
            enc_hidden = tf.zeros((batch_size, self.hidden_size))
            total_loss = []
            with tqdm.tqdm(total=len(train_dataset)) as p_bar:
                for (inputs, targets) in train_dataset.take():
                    p_bar.update(1)
                    batch_loss = self.train_step(inputs, targets, enc_hidden)
                    total_loss.append(batch_loss.numpy())
                    info = f"Epoch {epoch + 1}/{epochs} | Epoch Loss: {np.mean(total_loss):.4f} " \
                           f"Batch Loss: {batch_loss.numpy():.4f}"
                    p_bar.set_description_str(info)
            logs = {'loss': np.mean(total_loss)}
            for c in callbacks:
                c.on_epoch_end(epoch=epoch, logs=logs)
        return history_callback

    def save(self, model_path: str) -> str:
        """
        Save model

        Args:
            model_path:
        """
        pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)
        model_path = os.path.abspath(model_path)

        with open(os.path.join(model_path, 'model_config.json'), 'w') as f:
            f.write(json.dumps(self.to_dict(), indent=2, ensure_ascii=False))

        self.encoder_embedding.embed_model.save_weights(
            os.path.join(model_path, 'encoder_embed_weights.h5'))
        self.decoder_embedding.embed_model.save_weights(
            os.path.join(model_path, 'decoder_embed_weights.h5'))
        self.encoder.save_weights(
            os.path.join(model_path, 'encoder_weights.h5'))
        self.decoder.save_weights(
            os.path.join(model_path, 'decoder_weights.h5'))
        logger.info('model saved to {}'.format(os.path.abspath(model_path)))
        return model_path

    @classmethod
    def load_model(cls, model_path: str) -> 'Seq2Seq':
        from coco_nlp.utils import load_data_object
        model_config_path = os.path.join(model_path, 'model_config.json')
        with open(model_config_path, 'r') as f:
            model_config = json.loads(f.read())
        model = load_data_object(model_config)

        # Load processors and embeddings
        model.encoder_processor = load_data_object(model_config['encoder_processor'])
        model.decoder_processor = load_data_object(model_config['decoder_processor'])
        model.encoder_embedding = load_data_object(model_config['encoder_embedding'])
        model.decoder_embedding = load_data_object(model_config['decoder_embedding'])

        model._build_encoder_decoder()

        # Load Model Weights
        model.encoder_embedding.embed_model.load_weights(
            os.path.join(model_path, 'encoder_embed_weights.h5'))
        model.decoder_embedding.embed_model.load_weights(
            os.path.join(model_path, 'decoder_embed_weights.h5'))

        # ------ Fix Start -------
        # Workaround for a weight-loading issue on TF 2.3:
        # "Unable to load weights saved in HDF5 format into a subclassed Model
        # which has not created its variables yet. Call the Model first, then
        # load the weights." Run one dummy forward pass to create the
        # variables; the input token and the id 3 below are arbitrary
        # placeholders.
        input_seq = model.encoder_processor.transform(
            [['hello']], seq_length=model.encoder_seq_length)
        dec_input = tf.expand_dims([3], 0)
        enc_hidden = tf.zeros((1, model.hidden_size))
        dec_hidden = enc_hidden

        enc_output, enc_hidden = model.encoder(input_seq, enc_hidden)
        _ = model.decoder(dec_input, dec_hidden, enc_output)
        # ------ Fix End -------

        model.encoder.load_weights(os.path.join(model_path, 'encoder_weights.h5'))
        model.decoder.load_weights(os.path.join(model_path, 'decoder_weights.h5'))
        return model

    def predict(self, x_data: TextSamplesVar) -> Tuple[List, np.ndarray]:
        results = []
        attentions = []
        bos_token_id = self.decoder_processor.vocab2idx[self.decoder_processor.token_bos]
        eos_token_id = self.decoder_processor.vocab2idx[self.decoder_processor.token_eos]
        for sample in x_data:
            input_seq = self.encoder_processor.transform(
                [sample], seq_length=self.encoder_seq_length)
            enc_hidden = tf.zeros((1, self.hidden_size))
            enc_output, enc_hidden = self.encoder(input_seq, enc_hidden)
            dec_hidden = enc_hidden

            attention_plot = np.zeros(
                (self.decoder_seq_length, self.encoder_seq_length))
            token_out = []
            dec_input = tf.expand_dims([bos_token_id], 0)
            for t in range(self.decoder_seq_length):
                predictions, dec_hidden, att_weights = self.decoder(
                    dec_input, dec_hidden, enc_output)
                # storing the attention weights to plot later on
                attention_weights = tf.reshape(att_weights, (-1,))
                attention_plot[t] = attention_weights.numpy()

                # Greedy decoding: pick the most probable next token
                next_token = tf.argmax(predictions[0]).numpy()
                token_out.append(next_token)
                if next_token == eos_token_id:
                    break
                dec_input = tf.expand_dims([next_token], 0)

            r = self.decoder_processor.inverse_transform([token_out])[0]
            results.append(r)
            attentions.append(attention_plot)
        return results, np.array(attentions)
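# --- Usage sketch (not part of the library source) ---------------------------
# End-to-end illustration of the Seq2Seq API defined above; every call maps to
# a method in the class. The toy parallel corpus is made up.
x = [['hello', 'world'], ['how', 'are', 'you']]
y = [['bonjour', 'monde'], ['comment', 'allez', 'vous']]

seq2seq = Seq2Seq(hidden_size=256)
seq2seq.fit(x, y, batch_size=2, epochs=1)

preds, attention_maps = seq2seq.predict([['hello', 'world']])
print(preds[0])              # decoded target tokens
print(attention_maps.shape)  # (samples, decoder_seq_length, encoder_seq_length)

saved_path = seq2seq.save('./seq2seq_model')
restored = Seq2Seq.load_model(saved_path)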