Example #1
    def __init__(self,
                 embedding: Optional[Embedding] = None,
                 hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
        """

        Args:
            embedding: model embedding
            hyper_parameters: a dict of hyper_parameters.

        Examples:
            You can customize hyper_parameters like this::

                # get default hyper_parameters
                hyper_parameters = BLSTMModel.get_default_hyper_parameters()
                # change lstm hidden unit to 12
                hyper_parameters['layer_blstm']['units'] = 12
                # init new model with customized hyper_parameters
                labeling_model = BLSTMModel(hyper_parameters=hyper_parameters)
                labeling_model.fit(x, y)
        """
        if embedding is None:
            self.embedding = BareEmbedding(task=self.__task__)
        else:
            self.embedding = embedding

        self.tf_model: Optional[keras.Model] = None
        self.hyper_parameters = self.get_default_hyper_parameters()
        self.model_info = {}

        if hyper_parameters:
            self.hyper_parameters.update(hyper_parameters)
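The docstring's customization pattern as a self-contained sketch (hedged: this assumes kashgari 1.x with its bundled ChineseDailyNerCorpus, and uses BiLSTM_Model in place of the BLSTMModel named above):

    from kashgari.corpus import ChineseDailyNerCorpus
    from kashgari.tasks.labeling import BiLSTM_Model

    x, y = ChineseDailyNerCorpus.load_data('train')

    # get the defaults, shrink the BLSTM layer, then init and fit
    hyper = BiLSTM_Model.get_default_hyper_parameters()
    hyper['layer_blstm']['units'] = 12
    model = BiLSTM_Model(hyper_parameters=hyper)
    model.fit(x, y, epochs=1)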
Example #2
    def __init__(self,
                 embedding: Optional[ABCEmbedding] = None,
                 sequence_length: Optional[int] = None,
                 hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
        """

        Args:
            embedding: embedding object
            sequence_length: target sequence length
            hyper_parameters: hyper_parameters to overwrite
        """
        super(ABCLabelingModel, self).__init__()
        if embedding is None:
            embedding = BareEmbedding()  # type: ignore

        if hyper_parameters is None:
            hyper_parameters = self.default_hyper_parameters()

        self.tf_model: Optional[tf.keras.Model] = None
        self.embedding = embedding
        self.hyper_parameters = hyper_parameters
        self.sequence_length = sequence_length
        self.text_processor: SequenceProcessor = SequenceProcessor()
        self.label_processor: SequenceProcessor = SequenceProcessor(
            build_in_vocab='labeling',
            min_count=1,
            build_vocab_from_labels=True)

        self.crf_layer: Optional[KConditionalRandomField] = None
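A hedged usage sketch for this v2 constructor (assumes the concrete BiLSTM_Model subclass and its default_hyper_parameters() classmethod):

    from kashgari.embeddings import BareEmbedding
    from kashgari.tasks.labeling import BiLSTM_Model

    # overwrite a single hyper-parameter, keep the rest at their defaults
    hyper = BiLSTM_Model.default_hyper_parameters()
    hyper['layer_blstm']['units'] = 64
    model = BiLSTM_Model(embedding=BareEmbedding(),
                         sequence_length=128,
                         hyper_parameters=hyper)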
Example #3
    def test_training(self):
        text = ['NLP', 'Projects', 'Project', 'Name', ':']
        start_of_p = [1, 2, 1, 2, 2]
        bold = [1, 1, 1, 1, 2]
        center = [1, 1, 2, 2, 2]
        label = [
            'B-Category', 'I-Category', 'B-ProjectName', 'I-ProjectName',
            'I-ProjectName'
        ]

        text_list = [text] * 300
        start_of_p_list = [start_of_p] * 300
        bold_list = [bold] * 300
        center_list = [center] * 300
        label_list = [label] * 300

        # You can use WordEmbedding or BERTEmbedding for your text embedding
        SEQUENCE_LEN = 100
        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=SEQUENCE_LEN)
        start_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='start_of_p',
            sequence_length=SEQUENCE_LEN)

        bold_embedding = NumericFeaturesEmbedding(feature_count=2,
                                                  feature_name='bold',
                                                  sequence_length=SEQUENCE_LEN,
                                                  embedding_size=10)

        center_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='center',
            sequence_length=SEQUENCE_LEN)

        # the first embedding must be the text embedding
        stack_embedding = StackedEmbedding([
            text_embedding, start_of_p_embedding, bold_embedding,
            center_embedding
        ])

        x = (text_list, start_of_p_list, bold_list, center_list)
        y = label_list
        stack_embedding.analyze_corpus(x, y)

        model = BiLSTM_Model(embedding=stack_embedding)
        model.build_model(x, y)
        model.tf_model.summary()

        model.fit(x, y, epochs=2)

        model_path = os.path.join('./saved_models/',
                                  model.__class__.__module__,
                                  model.__class__.__name__)
        model.save(model_path)

        new_model = kashgari.utils.load_model(model_path)
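Because the embedding is serialized together with the model, the reloaded model should accept the same multi-input tuple (a short sketch continuing the test above):

    # x is the (text, start_of_p, bold, center) tuple built earlier
    predictions = new_model.predict(x)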
Example #4
    def test_build_and_fit(self):
        from kashgari.embeddings import BareEmbedding
        processor = MultiOutputProcessor()
        embedding = BareEmbedding(processor=processor)
        m = MultiOutputModel(embedding=embedding)
        m.build_model(train_x, (output_1, output_2))
        m.fit(train_x, (output_1, output_2), epochs=2)
        res = m.predict(train_x[:10])
        assert len(res) == 2
        assert res[0].shape == (10, 3)
Example #5
    def __init__(self,
                 encoder_embedding: Optional[ABCEmbedding] = None,
                 decoder_embedding: Optional[ABCEmbedding] = None,
                 encoder_seq_length: Optional[int] = None,
                 decoder_seq_length: Optional[int] = None,
                 hidden_size: int = 1024,
                 **kwargs: Any):
        """
        Init Seq2Seq Model

        Args:
            encoder_embedding: embedding object for the encoder input
            decoder_embedding: embedding object for the decoder input
            encoder_seq_length: target sequence length for the encoder
            decoder_seq_length: target sequence length for the decoder
            hidden_size: hidden size of the encoder and decoder layers
            **kwargs:
        """
        logger.warning("Seq2Seq API is experimental. It may be changed in the future without notice.")
        if encoder_embedding is None:
            encoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore

        self.encoder_embedding = encoder_embedding

        if decoder_embedding is None:
            decoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore

        self.decoder_embedding = decoder_embedding

        self.encoder_processor = SequenceProcessor(min_count=1)
        self.decoder_processor = SequenceProcessor(build_vocab_from_labels=True, min_count=1)

        self.encoder: Optional[GRUEncoder] = None
        self.decoder: Optional[AttGRUDecoder] = None

        self.hidden_size: int = hidden_size

        self.encoder_seq_length = encoder_seq_length
        self.decoder_seq_length = decoder_seq_length

        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
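A heavily hedged usage sketch (the class name Seq2Seq and a fit()/predict() surface mirroring the other task models are assumptions; the warning above marks this API as experimental):

    model = Seq2Seq(hidden_size=256)
    model.fit(corpus_x, corpus_y)      # corpus_x/corpus_y: lists of token lists
    print(model.predict(corpus_x[:2]))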
Example #6
    def test_multi_label(self):
        p = ClassificationProcessor(multi_label=True)
        embedding = BareEmbedding(task='classification', processor=p)
        model = self.model_class(embedding)
        model.fit(sample_train_x, sample_train_y, epochs=1)
        assert len(p.label2idx) == 3

        model.evaluate(sample_eval_x, sample_eval_y)
        assert isinstance(model.predict(sample_eval_x)[0], tuple)
        report_dict = model.evaluate(sample_eval_x,
                                     sample_eval_y,
                                     output_dict=True)
        assert isinstance(report_dict, dict)
Example #7
    def test_embedding(self):
        text, label = ChineseDailyNerCorpus.load_data()
        is_bold = np.random.randint(1, 3, (len(text), 12))

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stack_embedding = StackedEmbedding(
            [text_embedding, num_feature_embedding])
        stack_embedding.analyze_corpus((text, is_bold), label)

        stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
        r = stack_embedding.embed((text[:3], is_bold[:3]))
        assert r.shape == (3, 12, 116)
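The asserted width of 116 is consistent with the defaults: BareEmbedding appears to default to an embedding_size of 100, and NumericFeaturesEmbedding appears to default to feature_count * 8 = 16 when no embedding_size is passed, so each token embeds to 100 + 16 = 116 values after stacking.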
Example #8
    def train(self, tokens, tags):

        x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=self.chunk_size)
        first_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='first_of_p',
            sequence_length=self.chunk_size)

        stack_embedding = StackedEmbedding(
            [text_embedding, first_of_p_embedding])

        stack_embedding.analyze_corpus(x, y)

        from kashgari.tasks.labeling import BiLSTM_CRF_Model
        self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
        self.model.fit(x, y, batch_size=1, epochs=20)
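A possible companion method (hypothetical sketch; save() and kashgari.utils.load_model() are the persistence calls shown in Examples #3 and #9):

    def save(self, path):
        # persist the trained tagger for later reuse
        self.model.save(path)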
Example #9
    def test_multi_label(self):
        p = ClassificationProcessor(multi_label=True)
        embedding = BareEmbedding(task='classification', processor=p)
        model = self.model_class(embedding)
        model.fit(sample_train_x, sample_train_y, epochs=1)
        assert len(p.label2idx) == 3

        model.evaluate(sample_eval_x, sample_eval_y)
        assert isinstance(model.predict(sample_eval_x)[0], tuple)
        report_dict = model.evaluate(sample_eval_x,
                                     sample_eval_y,
                                     output_dict=True)
        assert isinstance(report_dict, dict)
        res = model.predict(valid_x[:20])
        model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        model.save(model_path)

        new_model = kashgari.utils.load_model(model_path)
        assert res == new_model.predict(valid_x[:20])
Example #10
class BaseModel(object):
    """Base Sequence Labeling Model"""
    @classmethod
    def get_default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        raise NotImplementedError

    def info(self):
        model_json_str = self.tf_model.to_json()

        return {
            'config': {
                'hyper_parameters': self.hyper_parameters,
            },
            'tf_model': json.loads(model_json_str),
            'embedding': self.embedding.info(),
            'class_name': self.__class__.__name__,
            'module': self.__class__.__module__,
            'tf_version': tf.__version__,
            'kashgari_version': kashgari.__version__
        }

    @property
    def task(self):
        return self.embedding.task

    @property
    def token2idx(self) -> Dict[str, int]:
        return self.embedding.token2idx

    @property
    def label2idx(self) -> Dict[str, int]:
        return self.embedding.label2idx

    @property
    def pre_processor(self):
        """Deprecated. Use `self.processor` instead."""
        warnings.warn(
            "The 'pre_processor' property is deprecated, "
            "use 'processor' instead", DeprecationWarning, 2)
        return self.embedding.processor

    @property
    def processor(self):
        return self.embedding.processor

    def __init__(self,
                 embedding: Optional[Embedding] = None,
                 hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
        """

        Args:
            embedding: model embedding
            hyper_parameters: a dict of hyper_parameters.

        Examples:
            You can customize hyper_parameters like this::

                # get default hyper_parameters
                hyper_parameters = BLSTMModel.get_default_hyper_parameters()
                # change lstm hidden unit to 12
                hyper_parameters['layer_blstm']['units'] = 12
                # init new model with customized hyper_parameters
                labeling_model = BLSTMModel(hyper_parameters=hyper_parameters)
                labeling_model.fit(x, y)
        """
        if embedding is None:
            self.embedding = BareEmbedding(task=self.__task__)
        else:
            self.embedding = embedding

        self.tf_model: Optional[keras.Model] = None
        self.hyper_parameters = self.get_default_hyper_parameters()
        self.model_info = {}

        if hyper_parameters:
            self.hyper_parameters.update(hyper_parameters)

    def build_model(self,
                    x_train: Union[Tuple[List[List[str]], ...],
                                   List[List[str]]],
                    y_train: Union[List[List[str]], List[str]],
                    x_validate: Union[Tuple[List[List[str]], ...],
                                      List[List[str]]] = None,
                    y_validate: Union[List[List[str]], List[str]] = None):
        """
        Build model with corpus

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data

        Returns:

        """

        if x_validate is not None and not isinstance(x_validate, tuple):
            self.embedding.analyze_corpus(x_train + x_validate,
                                          y_train + y_validate)
        else:
            self.embedding.analyze_corpus(x_train, y_train)

        if self.tf_model is None:
            self.build_model_arc()
            self.compile_model()

    def build_multi_gpu_model(self,
                              gpus: int,
                              x_train: Union[Tuple[List[List[str]], ...],
                                             List[List[str]]],
                              y_train: Union[List[List[str]], List[str]],
                              cpu_merge: bool = True,
                              cpu_relocation: bool = False,
                              x_validate: Union[Tuple[List[List[str]], ...],
                                                List[List[str]]] = None,
                              y_validate: Union[List[List[str]],
                                                List[str]] = None):
        """
        Build multi-GPU model with corpus

        Args:
            gpus: Integer >= 2, number of GPUs on which to create model replicas.
            cpu_merge: A boolean value to identify whether to force merging model weights
                under the scope of the CPU or not.
            cpu_relocation: A boolean value to identify whether to create the model's weights
                under the scope of the CPU. If the model is not defined under any preceding device
                scope, you can still rescue it by activating this option.
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data

        Returns:

        """
        if x_validate is not None and not isinstance(x_validate, tuple):
            self.embedding.analyze_corpus(x_train + x_validate,
                                          y_train + y_validate)
        else:
            self.embedding.analyze_corpus(x_train, y_train)

        if self.tf_model is None:
            with utils.custom_object_scope():
                self.build_model_arc()
                self.tf_model = tf.keras.utils.multi_gpu_model(
                    self.tf_model,
                    gpus,
                    cpu_merge=cpu_merge,
                    cpu_relocation=cpu_relocation)
                self.compile_model()

    def build_tpu_model(self,
                        strategy: tf.contrib.distribute.TPUStrategy,
                        x_train: Union[Tuple[List[List[str]], ...],
                                       List[List[str]]],
                        y_train: Union[List[List[str]], List[str]],
                        x_validate: Union[Tuple[List[List[str]], ...],
                                          List[List[str]]] = None,
                        y_validate: Union[List[List[str]], List[str]] = None):
        """
        Build TPU model with corpus

        Args:
            strategy: `TPUDistributionStrategy`. The strategy to use for replicating model
                across multiple TPU cores.
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data

        Returns:

        """

        if x_validate is not None and not isinstance(x_validate, tuple):
            self.embedding.analyze_corpus(x_train + x_validate,
                                          y_train + y_validate)
        else:
            self.embedding.analyze_corpus(x_train, y_train)
        if self.tf_model is None:
            with utils.custom_object_scope():
                self.build_model_arc()
                self.tf_model = tf.contrib.tpu.keras_to_tpu_model(
                    self.tf_model, strategy=strategy)
                self.compile_model(optimizer=tf.train.AdamOptimizer())

    def get_data_generator(self,
                           x_data,
                           y_data,
                           batch_size: int = 64,
                           shuffle: bool = True):
        """
        data generator for fit_generator

        Args:
            x_data: Array of feature data (if the model has a single input),
                or tuple of feature data array (if the model has multiple inputs)
            y_data: Array of label data
            batch_size: Number of samples per gradient update, default to 64.
            shuffle: Whether to shuffle the sample order each epoch. Defaults to True.

        Returns:
            data generator
        """
        if isinstance(x_data, tuple):
            # multi-input: the sample count comes from the first input list,
            # not from the tuple itself (mirrors the handling in fit() below)
            data_count = len(x_data[0])
        else:
            data_count = len(x_data)
        index_list = np.arange(data_count)
        page_count = data_count // batch_size + 1

        while True:
            if shuffle:
                np.random.shuffle(index_list)
            for page in range(page_count):
                start_index = page * batch_size
                end_index = start_index + batch_size
                target_index = index_list[start_index:end_index]

                if len(target_index) == 0:
                    target_index = index_list[0:batch_size]
                x_tensor = self.embedding.process_x_dataset(
                    x_data, target_index)
                y_tensor = self.embedding.process_y_dataset(
                    y_data, target_index)
                yield (x_tensor, y_tensor)

    def fit(self,
            x_train: Union[Tuple[List[List[str]], ...], List[List[str]]],
            y_train: Union[List[List[str]], List[str]],
            x_validate: Union[Tuple[List[List[str]], ...],
                              List[List[str]]] = None,
            y_validate: Union[List[List[str]], List[str]] = None,
            batch_size: int = 64,
            epochs: int = 5,
            callbacks: List[keras.callbacks.Callback] = None,
            fit_kwargs: Dict = None,
            shuffle: bool = True):
        """
        Trains the model for a given number of epochs with fit_generator (iterations on a dataset).

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
            batch_size: Number of samples per gradient update, default to 64.
            epochs: Integer. Number of epochs to train the model. default 5.
            callbacks: List of ``keras.callbacks.Callback`` instances to apply during training.
            fit_kwargs: additional arguments passed to the ``fit_generator()`` function of
                ``tensorflow.keras.Model``
                - https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#fit_generator
            shuffle: Whether to shuffle the training samples each epoch. Defaults to True.

        Returns:

        """
        self.build_model(x_train, y_train, x_validate, y_validate)
        train_generator = self.get_data_generator(x_train, y_train, batch_size,
                                                  shuffle)
        if fit_kwargs is None:
            fit_kwargs = {}

        validation_generator = None
        validation_steps = None
        if x_validate:
            validation_generator = self.get_data_generator(
                x_validate, y_validate, batch_size, shuffle)

            if isinstance(x_validate, tuple):
                validation_steps = len(x_validate[0]) // batch_size + 1
            else:
                validation_steps = len(x_validate) // batch_size + 1

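        # steps_per_epoch mirrors the generator's page_count (// batch_size + 1)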
        if isinstance(x_train, tuple):
            steps_per_epoch = len(x_train[0]) // batch_size + 1
        else:
            steps_per_epoch = len(x_train) // batch_size + 1
        with utils.custom_object_scope():
            return self.tf_model.fit_generator(
                train_generator,
                steps_per_epoch=steps_per_epoch,
                epochs=epochs,
                validation_data=validation_generator,
                validation_steps=validation_steps,
                callbacks=callbacks,
                **fit_kwargs)

    def fit_without_generator(self,
                              x_train: Union[Tuple[List[List[str]], ...],
                                             List[List[str]]],
                              y_train: Union[List[List[str]], List[str]],
                              x_validate: Union[Tuple[List[List[str]], ...],
                                                List[List[str]]] = None,
                              y_validate: Union[List[List[str]],
                                                List[str]] = None,
                              batch_size: int = 64,
                              epochs: int = 5,
                              callbacks: List[keras.callbacks.Callback] = None,
                              fit_kwargs: Dict = None):
        """
        Trains the model for a given number of epochs (iterations on a dataset).

        Args:
            x_train: Array of train feature data (if the model has a single input),
                or tuple of train feature data array (if the model has multiple inputs)
            y_train: Array of train label data
            x_validate: Array of validation feature data (if the model has a single input),
                or tuple of validation feature data array (if the model has multiple inputs)
            y_validate: Array of validation label data
            batch_size: Number of samples per gradient update, default to 64.
            epochs: Integer. Number of epochs to train the model. default 5.
            callbacks: List of ``keras.callbacks.Callback`` instances to apply during training.
            fit_kwargs: additional arguments passed to the ``fit()`` function of
                ``tensorflow.keras.Model``
                - https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#fit

        Returns:

        """
        self.build_model(x_train, y_train, x_validate, y_validate)
        tensor_x = self.embedding.process_x_dataset(x_train)
        tensor_y = self.embedding.process_y_dataset(y_train)

        validation_data = None
        if x_validate is not None:
            tensor_valid_x = self.embedding.process_x_dataset(x_validate)
            tensor_valid_y = self.embedding.process_y_dataset(y_validate)
            validation_data = (tensor_valid_x, tensor_valid_y)

        if fit_kwargs is None:
            fit_kwargs = {}

        if callbacks and 'callbacks' not in fit_kwargs:
            fit_kwargs['callbacks'] = callbacks

        with utils.custom_object_scope():
            return self.tf_model.fit(tensor_x,
                                     tensor_y,
                                     validation_data=validation_data,
                                     epochs=epochs,
                                     batch_size=batch_size,
                                     **fit_kwargs)

    def compile_model(self, **kwargs):
        """Configures the model for training.

        Using ``compile()`` function of ``tf.keras.Model`` -
        https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#compile

        Args:
            **kwargs: arguments passed to ``compile()`` function of ``tf.keras.Model``

        Defaults:
            - loss: ``categorical_crossentropy``
            - optimizer: ``adam``
            - metrics: ``['accuracy']``
        """
        if kwargs.get('loss') is None:
            kwargs['loss'] = 'categorical_crossentropy'
        if kwargs.get('optimizer') is None:
            kwargs['optimizer'] = 'adam'
        if kwargs.get('metrics') is None:
            kwargs['metrics'] = ['accuracy']

        self.tf_model.compile(**kwargs)
        if not kashgari.config.disable_auto_summary:
            self.tf_model.summary()

    def predict(self,
                x_data,
                batch_size=32,
                debug_info=False,
                predict_kwargs: Dict = None):
        """
        Generates output predictions for the input samples.

        Computation is done in batches.

        Args:
            x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
            batch_size: Integer. If unspecified, it will default to 32.
            debug_info: Bool. Whether to print debug logging info.
            predict_kwargs: arguments passed to ``predict()`` function of ``tf.keras.Model``

        Returns:
            array(s) of predictions.
        """
        if predict_kwargs is None:
            predict_kwargs = {}
        with utils.custom_object_scope():
            if isinstance(x_data, tuple):
                lengths = [len(sen) for sen in x_data[0]]
            else:
                lengths = [len(sen) for sen in x_data]
            tensor = self.embedding.process_x_dataset(x_data)
            pred = self.tf_model.predict(tensor,
                                         batch_size=batch_size,
                                         **predict_kwargs)
            res = self.embedding.reverse_numerize_label_sequences(
                pred.argmax(-1), lengths)
            if debug_info:
                logging.info('input: {}'.format(tensor))
                logging.info('output: {}'.format(pred))
                logging.info('output argmax: {}'.format(pred.argmax(-1)))
        return res

    def evaluate(self,
                 x_data,
                 y_data,
                 batch_size=None,
                 digits=4,
                 debug_info=False) -> Tuple[float, float, Dict]:
        """
        Evaluate model
        Args:
            x_data: Array of feature data to evaluate on.
            y_data: Array of label data.
            batch_size: Number of samples per evaluation batch.
            digits: Number of digits for the floating point values in the report.
            debug_info: Bool. Whether to print debug logging info.

        Returns:

        """
        raise NotImplementedError

    def build_model_arc(self):
        raise NotImplementedError

    def save(self, model_path: str):
        """
        Save model
        Args:
            model_path: target directory to save the model into; created if it does not exist.

        Returns:

        """
        pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)

        with open(os.path.join(model_path, 'model_info.json'), 'w') as f:
            f.write(json.dumps(self.info(), indent=2, ensure_ascii=True))

        self.tf_model.save_weights(os.path.join(model_path,
                                                'model_weights.h5'))
        logging.info('model saved to {}'.format(os.path.abspath(model_path)))
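The counterpart to save() is kashgari.utils.load_model(), as used in Examples #3 and #9:

    model.save('./saved_models/my_model')
    new_model = kashgari.utils.load_model('./saved_models/my_model')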
Example #11
            tensor_rnn = layer(tensor_rnn)
        tensor_sensors = [layer(tensor_rnn) for layer in layers_sensor]
        tensor_output = layer_allviews(tensor_sensors)
        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = tf.keras.Model(embed_model.inputs, tensor_output)


if __name__ == "__main__":
    print(BiLSTM_Model.get_default_hyper_parameters())
    logging.basicConfig(level=logging.DEBUG)
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()

    import kashgari
    from kashgari.processors.classification_processor import ClassificationProcessor
    from kashgari.embeddings import BareEmbedding

    processor = ClassificationProcessor(multi_label=False)
    embed = BareEmbedding(task=kashgari.CLASSIFICATION,
                          sequence_length=30,
                          processor=processor)
    m = BiLSTM_Model(embed)
    # m.build_model(x, y)
    m.fit(x, y, epochs=2)
    print(m.predict(x[:10]))
    # m.evaluate(x, y)
    print(m.predict_top_k_class(x[:10]))
Example #12
    def build_embedding(self):
        embedding = BareEmbedding()
        return embedding
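A note on the default: BareEmbedding with no arguments trains a randomly initialized embedding layer from scratch. The width can be overridden, as Example #5 does with embedding_size=256:

    embedding = BareEmbedding(embedding_size=256)  # widen the token vectors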
Example #13
    # or the GloVE-300 from http://nilc.icmc.usp.br/embeddings if that does not work out

    # 2 - Figure out how to do the Predict. We have to process the sentence so it matches theirs.
    # They use a PunktSentenceTokenizer with an abbrev_list. Those scripts are in the leNer-dataset folder.

    # 3 - Figure out how to integrate this code with the current webstruct
    # 4 - It would be a good idea to have an interface like Broka, so that the file list existed and
    # could be opened for re-training, opening with Ramon's plugin.
    # One idea would even be to convert their current dataset to today's Broka HTML format (it could be something simple, like putting each paragraph in a p)

    # 5 - Do the persistence (kashgari has a save/load method)


    # 2 - Increase epochs for training

    # You can use WordEmbedding or BERTEmbedding for your text embedding
    text_embedding = BareEmbedding(task=kashgari.LABELING)

    text_embedding.analyze_corpus(tokens, labels)

    # Now we can embed with this stacked embedding layer
    # We can build any labeling model with this embedding

    from kashgari.tasks.labeling import BiLSTM_CRF_Model

    model = BiLSTM_CRF_Model(embedding=text_embedding)
    model.fit(tokens, labels, batch_size=8, epochs=10)

    print(model.predict(tokens))
    # print(model.predict_entities(x))
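A hedged follow-up: labeling models also expose evaluate(), declared alongside predict() in the BaseModel API of Example #10:

    model.evaluate(tokens, labels)  # entity-level evaluation report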