def __init__(self,
             embedding: Optional[ABCEmbedding] = None,
             sequence_length: Optional[int] = None,
             hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
    """
    Args:
        embedding: embedding object
        sequence_length: target sequence length
        hyper_parameters: hyper_parameters to overwrite
    """
    super(ABCLabelingModel, self).__init__()
    if embedding is None:
        embedding = BareEmbedding()  # type: ignore
    if hyper_parameters is None:
        hyper_parameters = self.default_hyper_parameters()

    self.tf_model: Optional[tf.keras.Model] = None
    self.embedding = embedding
    self.hyper_parameters = hyper_parameters
    self.sequence_length = sequence_length
    self.text_processor: SequenceProcessor = SequenceProcessor()
    self.label_processor: SequenceProcessor = SequenceProcessor(build_in_vocab='labeling',
                                                                min_count=1,
                                                                build_vocab_from_labels=True)
    self.crf_layer: Optional[KConditionalRandomField] = None
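
# Usage sketch (an illustration, not from the original source): overriding a
# single hyper-parameter on a kashgari 2.x labeling model. This assumes that
# BiLSTM_Model subclasses ABCLabelingModel and that its
# default_hyper_parameters() dict exposes a 'layer_blstm' entry.
#
#     from kashgari.embeddings import BareEmbedding
#     from kashgari.tasks.labeling import BiLSTM_Model
#
#     hyper = BiLSTM_Model.default_hyper_parameters()
#     hyper['layer_blstm']['units'] = 12  # shrink the BiLSTM hidden size
#     model = BiLSTM_Model(embedding=BareEmbedding(),
#                          sequence_length=100,
#                          hyper_parameters=hyper)
#     model.fit(x, y)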
def test_training(self):
    text = ['NLP', 'Projects', 'Project', 'Name', ':']
    start_of_p = [1, 2, 1, 2, 2]
    bold = [1, 1, 1, 1, 2]
    center = [1, 1, 2, 2, 2]
    label = [
        'B-Category', 'I-Category', 'B-ProjectName', 'I-ProjectName',
        'I-ProjectName'
    ]

    text_list = [text] * 300
    start_of_p_list = [start_of_p] * 300
    bold_list = [bold] * 300
    center_list = [center] * 300
    label_list = [label] * 300

    # You can use WordEmbedding or BERTEmbedding for your text embedding
    SEQUENCE_LEN = 100
    text_embedding = BareEmbedding(task=kashgari.LABELING,
                                   sequence_length=SEQUENCE_LEN)
    start_of_p_embedding = NumericFeaturesEmbedding(feature_count=2,
                                                    feature_name='start_of_p',
                                                    sequence_length=SEQUENCE_LEN)
    bold_embedding = NumericFeaturesEmbedding(feature_count=2,
                                              feature_name='bold',
                                              sequence_length=SEQUENCE_LEN,
                                              embedding_size=10)
    center_embedding = NumericFeaturesEmbedding(feature_count=2,
                                                feature_name='center',
                                                sequence_length=SEQUENCE_LEN)

    # The first embedding in the stack must be the text embedding
    stack_embedding = StackedEmbedding([
        text_embedding, start_of_p_embedding, bold_embedding, center_embedding
    ])

    x = (text_list, start_of_p_list, bold_list, center_list)
    y = label_list
    stack_embedding.analyze_corpus(x, y)

    model = BiLSTM_Model(embedding=stack_embedding)
    model.build_model(x, y)
    model.tf_model.summary()
    model.fit(x, y, epochs=2)

    model_path = os.path.join('./saved_models/',
                              model.__class__.__module__,
                              model.__class__.__name__)
    model.save(model_path)

    new_model = kashgari.utils.load_model(model_path)
def test_build_and_fit(self):
    from kashgari.embeddings import BareEmbedding
    processor = MultiOutputProcessor()
    embedding = BareEmbedding(processor=processor)

    m = MultiOutputModel(embedding=embedding)
    m.build_model(train_x, (output_1, output_2))
    m.fit(train_x, (output_1, output_2), epochs=2)
    res = m.predict(train_x[:10])
    assert len(res) == 2
    assert res[0].shape == (10, 3)
def __init__(self,
             encoder_embedding: Optional[ABCEmbedding] = None,
             decoder_embedding: Optional[ABCEmbedding] = None,
             encoder_seq_length: Optional[int] = None,
             decoder_seq_length: Optional[int] = None,
             hidden_size: int = 1024,
             **kwargs: Any):
    """
    Init Seq2Seq Model

    Args:
        encoder_embedding: embedding object for the encoder input
        decoder_embedding: embedding object for the decoder input
        encoder_seq_length: target encoder sequence length
        decoder_seq_length: target decoder sequence length
        hidden_size: hidden size of the encoder and decoder GRU
        **kwargs:
    """
    logger.warning("Seq2Seq API is experimental. It may be changed in the future without notice.")
    if encoder_embedding is None:
        encoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore

    self.encoder_embedding = encoder_embedding

    if decoder_embedding is None:
        decoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore

    self.decoder_embedding = decoder_embedding

    self.encoder_processor = SequenceProcessor(min_count=1)
    self.decoder_processor = SequenceProcessor(build_vocab_from_labels=True, min_count=1)

    self.encoder: Optional[GRUEncoder] = None
    self.decoder: Optional[AttGRUDecoder] = None

    self.hidden_size: int = hidden_size

    self.encoder_seq_length = encoder_seq_length
    self.decoder_seq_length = decoder_seq_length

    self.optimizer = tf.keras.optimizers.Adam()
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                     reduction='none')
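
# Hypothetical usage sketch (the import path and class name are assumptions,
# not confirmed by the original source): the experimental Seq2Seq task can
# presumably be trained on parallel token sequences like this.
#
#     from kashgari.tasks.seq2seq import Seq2Seq  # assumed import path
#
#     x = [['hello', 'world'], ['how', 'are', 'you']]
#     y = [['bonjour', 'monde'], ['comment', 'allez', 'vous']]
#     model = Seq2Seq(hidden_size=512)
#     model.fit(x, y)  # drives the custom GRU-encoder / attention-decoder loop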
def test_embedding(self):
    text, label = ChineseDailyNerCorpus.load_data()
    is_bold = np.random.randint(1, 3, (len(text), 12))

    text_embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=12)
    num_feature_embedding = NumericFeaturesEmbedding(2, 'is_bold', sequence_length=12)

    stack_embedding = StackedEmbedding([text_embedding, num_feature_embedding])
    stack_embedding.analyze_corpus((text, is_bold), label)

    stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
    r = stack_embedding.embed((text[:3], is_bold[:3]))
    assert r.shape == (3, 12, 116)
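
# Why 116? A hedged reading based on kashgari's defaults rather than this
# test's source: BareEmbedding defaults to embedding_size=100, and
# NumericFeaturesEmbedding with feature_count=2 presumably defaults to
# feature_count * 8 = 16. StackedEmbedding concatenates the two along the
# last axis, so each of the 12 timesteps carries 100 + 16 = 116 features.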
def train(self, tokens, tags):
    x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)

    text_embedding = BareEmbedding(task=kashgari.LABELING,
                                   sequence_length=self.chunk_size)
    first_of_p_embedding = NumericFeaturesEmbedding(feature_count=2,
                                                    feature_name='first_of_p',
                                                    sequence_length=self.chunk_size)
    stack_embedding = StackedEmbedding([text_embedding, first_of_p_embedding])
    stack_embedding.analyze_corpus(x, y)

    from kashgari.tasks.labeling import BiLSTM_CRF_Model
    self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
    self.model.fit(x, y, batch_size=1, epochs=20)
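
# Two notes on the method above, inferred from the other snippets in this
# section rather than from its original source: as in the stacked-embedding
# test earlier, the text embedding must be the first element of the
# StackedEmbedding list, and the fitted model can be persisted with the
# same save/load pattern used in the tests, e.g.:
#
#     self.model.save('./saved_models/bilstm_crf')   # illustrative path
#     model = kashgari.utils.load_model('./saved_models/bilstm_crf')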
def test_multi_label(self):
    p = ClassificationProcessor(multi_label=True)
    embedding = BareEmbedding(task='classification', processor=p)
    model = self.model_class(embedding)
    model.fit(sample_train_x, sample_train_y, epochs=1)
    assert len(p.label2idx) == 3

    model.evaluate(sample_eval_x, sample_eval_y)
    assert isinstance(model.predict(sample_eval_x)[0], tuple)

    report_dict = model.evaluate(sample_eval_x, sample_eval_y, output_dict=True)
    assert isinstance(report_dict, dict)

    res = model.predict(valid_x[:20])
    model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
    model.save(model_path)

    new_model = kashgari.utils.load_model(model_path)
    assert res == new_model.predict(valid_x[:20])
class BaseModel(object):
    """Base Sequence Labeling Model"""

    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        raise NotImplementedError

    def info(self):
        model_json_str = self.tf_model.to_json()
        return {
            'config': {
                'hyper_parameters': self.hyper_parameters,
            },
            'tf_model': json.loads(model_json_str),
            'embedding': self.embedding.info(),
            'class_name': self.__class__.__name__,
            'module': self.__class__.__module__,
            'tf_version': tf.__version__,
            'kashgari_version': kashgari.__version__
        }

    @property
    def task(self):
        return self.embedding.task

    @property
    def token2idx(self) -> Dict[str, int]:
        return self.embedding.token2idx

    @property
    def label2idx(self) -> Dict[str, int]:
        return self.embedding.label2idx

    @property
    def pre_processor(self):
        """Deprecated. Use `self.processor` instead."""
        warnings.warn("The 'pre_processor' property is deprecated, "
                      "use 'processor' instead",
                      DeprecationWarning, 2)
        return self.embedding.processor

    @property
    def processor(self):
        return self.embedding.processor

    def __init__(self,
                 embedding: Optional[Embedding] = None,
                 hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
        """
        Args:
            embedding: model embedding
            hyper_parameters: a dict of hyper_parameters.

        Examples:
            You could customize hyper_parameters like this::

                # get default hyper_parameters
                hyper_parameters = BLSTMModel.get_default_hyper_parameters()
                # change lstm hidden unit to 12
                hyper_parameters['layer_blstm']['units'] = 12
                # init new model with customized hyper_parameters
                labeling_model = BLSTMModel(hyper_parameters=hyper_parameters)
                labeling_model.fit(x, y)
        """
        if embedding is None:
            self.embedding = BareEmbedding(task=self.__task__)
        else:
            self.embedding = embedding

        self.tf_model: Optional[keras.Model] = None
        self.hyper_parameters = self.get_default_hyper_parameters()
        self.model_info = {}

        if hyper_parameters:
            self.hyper_parameters.update(hyper_parameters)

    def build_model(self,
                    x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
                    y_train: Union[List[List[str]], List[str]],
                    x_validate: Union[Tuple[List[List[str]], ...], List[List[str]]] = None,
                    y_validate: Union[List[List[str]], List[str]] = None):
        """
        Build model with corpus

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
        """
        if x_validate is not None and not isinstance(x_validate, tuple):
            self.embedding.analyze_corpus(x_train + x_validate, y_train + y_validate)
        else:
            self.embedding.analyze_corpus(x_train, y_train)

        if self.tf_model is None:
            self.build_model_arc()
            self.compile_model()

    def build_multi_gpu_model(self,
                              gpus: int,
                              x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
                              y_train: Union[List[List[str]], List[str]],
                              cpu_merge: bool = True,
                              cpu_relocation: bool = False,
                              x_validate: Union[Tuple[List[List[str]], ...], List[List[str]]] = None,
                              y_validate: Union[List[List[str]], List[str]] = None):
        """
        Build multi-GPU model with corpus

        Args:
            gpus: Integer >= 2, number of GPUs on which to create model replicas.
            cpu_merge: A boolean value to identify whether to force merging model weights
                under the scope of the CPU or not.
            cpu_relocation: A boolean value to identify whether to create the model's weights
                under the scope of the CPU. If the model is not defined under any preceding
                device scope, you can still rescue it by activating this option.
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
        """
        if x_validate is not None and not isinstance(x_validate, tuple):
            self.embedding.analyze_corpus(x_train + x_validate, y_train + y_validate)
        else:
            self.embedding.analyze_corpus(x_train, y_train)

        if self.tf_model is None:
            with utils.custom_object_scope():
                self.build_model_arc()
                self.tf_model = tf.keras.utils.multi_gpu_model(self.tf_model,
                                                               gpus,
                                                               cpu_merge=cpu_merge,
                                                               cpu_relocation=cpu_relocation)
                self.compile_model()

    def build_tpu_model(self,
                        strategy: tf.contrib.distribute.TPUStrategy,
                        x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
                        y_train: Union[List[List[str]], List[str]],
                        x_validate: Union[Tuple[List[List[str]], ...], List[List[str]]] = None,
                        y_validate: Union[List[List[str]], List[str]] = None):
        """
        Build TPU model with corpus

        Args:
            strategy: `TPUDistributionStrategy`. The strategy to use for replicating model
                across multiple TPU cores.
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
        """
        if x_validate is not None and not isinstance(x_validate, tuple):
            self.embedding.analyze_corpus(x_train + x_validate, y_train + y_validate)
        else:
            self.embedding.analyze_corpus(x_train, y_train)

        if self.tf_model is None:
            with utils.custom_object_scope():
                self.build_model_arc()
                self.tf_model = tf.contrib.tpu.keras_to_tpu_model(self.tf_model,
                                                                  strategy=strategy)
                self.compile_model(optimizer=tf.train.AdamOptimizer())

    def get_data_generator(self,
                           x_data,
                           y_data,
                           batch_size: int = 64,
                           shuffle: bool = True):
        """
        data generator for fit_generator

        Args:
            x_data: Array of feature data (if the model has a single input),
                or tuple of feature data array (if the model has multiple inputs)
            y_data: Array of label data
            batch_size: Number of samples per gradient update, default to 64.
            shuffle: whether to shuffle the sample order between epochs

        Returns:
            data generator
        """
        index_list = np.arange(len(x_data))
        page_count = len(x_data) // batch_size + 1

        while True:
            if shuffle:
                np.random.shuffle(index_list)
            for page in range(page_count):
                start_index = page * batch_size
                end_index = start_index + batch_size
                target_index = index_list[start_index:end_index]
                if len(target_index) == 0:
                    target_index = index_list[0:batch_size]
                x_tensor = self.embedding.process_x_dataset(x_data, target_index)
                y_tensor = self.embedding.process_y_dataset(y_data, target_index)
                yield (x_tensor, y_tensor)

    def fit(self,
            x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
            y_train: Union[List[List[str]], List[str]],
            x_validate: Union[Tuple[List[List[str]], ...], List[List[str]]] = None,
            y_validate: Union[List[List[str]], List[str]] = None,
            batch_size: int = 64,
            epochs: int = 5,
            callbacks: List[keras.callbacks.Callback] = None,
            fit_kwargs: Dict = None,
            shuffle: bool = True):
        """
        Trains the model for a given number of epochs with fit_generator
        (iterations on a dataset).

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
            batch_size: Number of samples per gradient update, default to 64.
            epochs: Integer. Number of epochs to train the model. default 5.
            callbacks: List of `keras.callbacks.Callback` instances to apply during training.
            fit_kwargs: additional arguments passed to the ``fit_generator()`` function of
                ``tensorflow.keras.Model`` -
                https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#fit_generator
            shuffle: whether to shuffle the sample order between epochs
        """
        self.build_model(x_train, y_train, x_validate, y_validate)

        train_generator = self.get_data_generator(x_train, y_train, batch_size, shuffle)
        if fit_kwargs is None:
            fit_kwargs = {}

        validation_generator = None
        validation_steps = None
        if x_validate:
            validation_generator = self.get_data_generator(x_validate, y_validate,
                                                           batch_size, shuffle)
            if isinstance(x_validate, tuple):
                validation_steps = len(x_validate[0]) // batch_size + 1
            else:
                validation_steps = len(x_validate) // batch_size + 1

        if isinstance(x_train, tuple):
            steps_per_epoch = len(x_train[0]) // batch_size + 1
        else:
            steps_per_epoch = len(x_train) // batch_size + 1

        with utils.custom_object_scope():
            return self.tf_model.fit_generator(train_generator,
                                               steps_per_epoch=steps_per_epoch,
                                               epochs=epochs,
                                               validation_data=validation_generator,
                                               validation_steps=validation_steps,
                                               callbacks=callbacks,
                                               **fit_kwargs)

    def fit_without_generator(self,
                              x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
                              y_train: Union[List[List[str]], List[str]],
                              x_validate: Union[Tuple[List[List[str]], ...], List[List[str]]] = None,
                              y_validate: Union[List[List[str]], List[str]] = None,
                              batch_size: int = 64,
                              epochs: int = 5,
                              callbacks: List[keras.callbacks.Callback] = None,
                              fit_kwargs: Dict = None):
        """
        Trains the model for a given number of epochs (iterations on a dataset).

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
            batch_size: Number of samples per gradient update, default to 64.
            epochs: Integer. Number of epochs to train the model. default 5.
            callbacks: List of `keras.callbacks.Callback` instances to apply during training.
            fit_kwargs: additional arguments passed to the ``fit()`` function of
                ``tensorflow.keras.Model`` -
                https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#fit
        """
        self.build_model(x_train, y_train, x_validate, y_validate)
        tensor_x = self.embedding.process_x_dataset(x_train)
        tensor_y = self.embedding.process_y_dataset(y_train)

        validation_data = None
        if x_validate is not None:
            tensor_valid_x = self.embedding.process_x_dataset(x_validate)
            tensor_valid_y = self.embedding.process_y_dataset(y_validate)
            validation_data = (tensor_valid_x, tensor_valid_y)

        if fit_kwargs is None:
            fit_kwargs = {}

        if callbacks and 'callbacks' not in fit_kwargs:
            fit_kwargs['callbacks'] = callbacks

        with utils.custom_object_scope():
            return self.tf_model.fit(tensor_x, tensor_y,
                                     validation_data=validation_data,
                                     epochs=epochs,
                                     batch_size=batch_size,
                                     **fit_kwargs)

    def compile_model(self, **kwargs):
        """Configures the model for training.

        Using ``compile()`` function of ``tf.keras.Model`` -
        https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#compile

        Args:
            **kwargs: arguments passed to ``compile()`` function of ``tf.keras.Model``

        Defaults:
            - loss: ``categorical_crossentropy``
            - optimizer: ``adam``
            - metrics: ``['accuracy']``
        """
        if kwargs.get('loss') is None:
            kwargs['loss'] = 'categorical_crossentropy'
        if kwargs.get('optimizer') is None:
            kwargs['optimizer'] = 'adam'
        if kwargs.get('metrics') is None:
            kwargs['metrics'] = ['accuracy']

        self.tf_model.compile(**kwargs)
        if not kashgari.config.disable_auto_summary:
            self.tf_model.summary()

    def predict(self,
                x_data,
                batch_size=32,
                debug_info=False,
                predict_kwargs: Dict = None):
        """
        Generates output predictions for the input samples.
        Computation is done in batches.

        Args:
            x_data: The input data, as a Numpy array (or list of Numpy arrays
                if the model has multiple inputs).
            batch_size: Integer. If unspecified, it will default to 32.
            debug_info: Bool, Should print out the logging info.
            predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``

        Returns:
            array(s) of predictions.
        """
        if predict_kwargs is None:
            predict_kwargs = {}
        with utils.custom_object_scope():
            if isinstance(x_data, tuple):
                lengths = [len(sen) for sen in x_data[0]]
            else:
                lengths = [len(sen) for sen in x_data]
            tensor = self.embedding.process_x_dataset(x_data)
            pred = self.tf_model.predict(tensor, batch_size=batch_size, **predict_kwargs)
            res = self.embedding.reverse_numerize_label_sequences(pred.argmax(-1), lengths)
            if debug_info:
                logging.info('input: {}'.format(tensor))
                logging.info('output: {}'.format(pred))
                logging.info('output argmax: {}'.format(pred.argmax(-1)))
            return res

    def evaluate(self,
                 x_data,
                 y_data,
                 batch_size=None,
                 digits=4,
                 debug_info=False) -> Tuple[float, float, Dict]:
        """
        Evaluate model

        Args:
            x_data: evaluation feature data
            y_data: evaluation label data
            batch_size: Number of samples per batch
            digits: number of digits in the metrics report
            debug_info: Bool, Should print out the logging info.
        """
        raise NotImplementedError

    def build_model_arc(self):
        raise NotImplementedError

    def save(self, model_path: str):
        """
        Save model to the given folder

        Args:
            model_path: target folder path
        """
        pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)

        with open(os.path.join(model_path, 'model_info.json'), 'w') as f:
            f.write(json.dumps(self.info(), indent=2, ensure_ascii=True))

        self.tf_model.save_weights(os.path.join(model_path, 'model_weights.h5'))
        logging.info('model saved to {}'.format(os.path.abspath(model_path)))
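
# A minimal end-to-end sketch of the BaseModel API above (kashgari 1.x style;
# the corpus and model classes come from the library itself, but the exact
# epochs, batch size, and save path are illustrative):
#
#     from kashgari.corpus import ChineseDailyNerCorpus
#     from kashgari.tasks.labeling import BiLSTM_Model
#
#     train_x, train_y = ChineseDailyNerCorpus.load_data('train')
#     valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
#
#     model = BiLSTM_Model()  # defaults to a BareEmbedding
#     model.fit(train_x, train_y, valid_x, valid_y, batch_size=64, epochs=1)
#     print(model.predict(valid_x[:5]))
#     model.save('./ner_model')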
        # (continues from the preceding loop over the stacked RNN layers)
        tensor_rnn = layer(tensor_rnn)
    tensor_sensors = [layer(tensor_rnn) for layer in layers_sensor]
    tensor_output = layer_allviews(tensor_sensors)
    for layer in layers_full_connect:
        tensor_output = layer(tensor_output)

    self.tf_model = tf.keras.Model(embed_model.inputs, tensor_output)


if __name__ == "__main__":
    print(BiLSTM_Model.get_default_hyper_parameters())
    logging.basicConfig(level=logging.DEBUG)
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()

    import kashgari
    from kashgari.processors.classification_processor import ClassificationProcessor
    from kashgari.embeddings import BareEmbedding

    processor = ClassificationProcessor(multi_label=False)
    embed = BareEmbedding(task=kashgari.CLASSIFICATION,
                          sequence_length=30,
                          processor=processor)
    m = BiLSTM_Model(embed)
    # m.build_model(x, y)
    m.fit(x, y, epochs=2)
    print(m.predict(x[:10]))
    # m.evaluate(x, y)
    print(m.predict_top_k_class(x[:10]))
def build_embedding(self):
    embedding = BareEmbedding()
    return embedding
# or the GloVe-300 from http://nilc.icmc.usp.br/embeddings if that does not work out
# 2 - Figure out how to do the predict step. We have to process the sentence so it
#     matches theirs. They use a PunktSentenceTokenizer with an abbrev_list; those
#     scripts are in the leNer-dataset folder.
# 3 - Figure out how to integrate this code with the current webstruct
# 4 - It would be a good idea to have a Broka-like interface, so that there is a
#     list of files which can be opened for re-training with Ramon's plugin.
#     One idea would even be to convert their current dataset to Broka's current
#     HTML format (it could be something simple, such as wrapping each paragraph
#     in a p tag)
# 5 - Handle persistence (kashgari has save/load methods)
# 2 - Increase the number of epochs for training

# You can use WordEmbedding or BERTEmbedding for your text embedding
text_embedding = BareEmbedding(task=kashgari.LABELING)
text_embedding.analyze_corpus(tokens, labels)

# Now we can build any labeling model with this embedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(embedding=text_embedding)
model.fit(tokens, labels, batch_size=8, epochs=10)

print(model.predict(tokens))
# print(model.predict_entities(x))