Example #1
    def __init__(self,
                 embedding: Optional[Embedding] = None,
                 hyper_parameters: Optional[Dict[str, Dict[str, Any]]] = None):
        """

        Args:
            embedding: model embedding
            hyper_parameters: a dict of hyper_parameters.

        Examples:
            You can customize hyper_parameters like this::

                # get default hyper_parameters
                hyper_parameters = BLSTMModel.get_default_hyper_parameters()
                # change lstm hidden unit to 12
                hyper_parameters['layer_blstm']['units'] = 12
                # init new model with customized hyper_parameters
                labeling_model = BLSTMModel(hyper_parameters=hyper_parameters)
                labeling_model.fit(x, y)
        """
        if embedding is None:
            self.embedding = BareEmbedding(task=self.__task__)
        else:
            self.embedding = embedding

        self.tf_model: Optional[keras.Model] = None
        self.hyper_parameters = self.get_default_hyper_parameters()
        self.model_info = {}

        if hyper_parameters:
            self.hyper_parameters.update(hyper_parameters)
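A minimal usage sketch for this constructor; BLSTMModel is the class the docstring names (in released kashgari 1.x the equivalent class is kashgari.tasks.labeling.BiLSTM_Model):

    import kashgari
    from kashgari.embeddings import BareEmbedding

    # swap in a custom embedding and shrink the BLSTM layer
    embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=100)
    hyper_parameters = BLSTMModel.get_default_hyper_parameters()
    hyper_parameters['layer_blstm']['units'] = 12
    labeling_model = BLSTMModel(embedding=embedding, hyper_parameters=hyper_parameters)
    labeling_model.fit(x, y)  # x / y: token and label sequences, as in the docstring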
Example #2
    def __init__(self,
                 embedding: ABCEmbedding = None,
                 sequence_length: int = None,
                 hyper_parameters: Dict[str, Dict[str, Any]] = None):
        """

        Args:
            embedding: embedding object
            sequence_length: target sequence length
            hyper_parameters: hyper_parameters to overwrite
        """
        super(ABCLabelingModel, self).__init__()
        if embedding is None:
            embedding = BareEmbedding()  # type: ignore

        if hyper_parameters is None:
            hyper_parameters = self.default_hyper_parameters()

        self.tf_model: Optional[tf.keras.Model] = None
        self.embedding = embedding
        self.hyper_parameters = hyper_parameters
        self.sequence_length = sequence_length
        self.text_processor: SequenceProcessor = SequenceProcessor()
        self.label_processor: SequenceProcessor = SequenceProcessor(
            build_in_vocab='labeling',
            min_count=1,
            build_vocab_from_labels=True)

        self.crf_layer: Optional[KConditionalRandomField] = None
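A hedged usage sketch for a concrete subclass of this base class (assumes kashgari 2.x, where e.g. BiLSTM_Model derives from ABCLabelingModel):

    from kashgari.tasks.labeling import BiLSTM_Model

    model = BiLSTM_Model(sequence_length=100)  # sequence_length may also be left as None
    model.fit(train_x, train_y, epochs=5)      # train_x / train_y: token and label sequences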
Example #3
    def test_training(self):
        text = ['NLP', 'Projects', 'Project', 'Name', ':']
        start_of_p = [1, 2, 1, 2, 2]
        bold = [1, 1, 1, 1, 2]
        center = [1, 1, 2, 2, 2]
        label = [
            'B-Category', 'I-Category', 'B-ProjectName', 'I-ProjectName',
            'I-ProjectName'
        ]

        text_list = [text] * 300
        start_of_p_list = [start_of_p] * 300
        bold_list = [bold] * 300
        center_list = [center] * 300
        label_list = [label] * 300

        # You can use WordEmbedding or BERTEmbedding for your text embedding
        SEQUENCE_LEN = 100
        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=SEQUENCE_LEN)
        start_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='start_of_p',
            sequence_length=SEQUENCE_LEN)

        bold_embedding = NumericFeaturesEmbedding(feature_count=2,
                                                  feature_name='bold',
                                                  sequence_length=SEQUENCE_LEN,
                                                  embedding_size=10)

        center_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='center',
            sequence_length=SEQUENCE_LEN)

        # the first embedding in the stack must be the text embedding
        stack_embedding = StackedEmbedding([
            text_embedding, start_of_p_embedding, bold_embedding,
            center_embedding
        ])

        x = (text_list, start_of_p_list, bold_list, center_list)
        y = label_list
        stack_embedding.analyze_corpus(x, y)

        model = BiLSTM_Model(embedding=stack_embedding)
        model.build_model(x, y)
        model.tf_model.summary()

        model.fit(x, y, epochs=2)

        model_path = os.path.join('./saved_models/',
                                  model.__class__.__module__,
                                  model.__class__.__name__)
        model.save(model_path)

        new_model = kashgari.utils.load_model(model_path)
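The restored model can then be used for prediction; inputs must keep the same (text, start_of_p, bold, center) tuple structure used for training:

    preds = new_model.predict((text_list[:5], start_of_p_list[:5],
                               bold_list[:5], center_list[:5]))
    print(preds)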
Example #4
    def test_build_and_fit(self):
        from kashgari.embeddings import BareEmbedding
        processor = MultiOutputProcessor()
        embedding = BareEmbedding(processor=processor)
        m = MultiOutputModel(embedding=embedding)
        m.build_model(train_x, (output_1, output_2))
        m.fit(train_x, (output_1, output_2), epochs=2)
        res = m.predict(train_x[:10])
        assert len(res) == 2
        assert res[0].shape == (10, 3)
Example #5
    def __init__(self,
                 encoder_embedding: ABCEmbedding = None,
                 decoder_embedding: ABCEmbedding = None,
                 encoder_seq_length: int = None,
                 decoder_seq_length: int = None,
                 hidden_size: int = 1024,
                 **kwargs: Any):
        """
        Init Labeling Model

        Args:
            embedding: embedding object
            sequence_length: target sequence length
            hyper_parameters: hyper_parameters to overwrite
            **kwargs:
        """
        logger.warning("Seq2Seq API is experimental. It may be changed in the future without notice.")
        if encoder_embedding is None:
            encoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore

        self.encoder_embedding = encoder_embedding

        if decoder_embedding is None:
            decoder_embedding = BareEmbedding(embedding_size=256)  # type: ignore

        self.decoder_embedding = decoder_embedding

        self.encoder_processor = SequenceProcessor(min_count=1)
        self.decoder_processor = SequenceProcessor(build_vocab_from_labels=True, min_count=1)

        self.encoder: Optional[GRUEncoder] = None
        self.decoder: Optional[AttGRUDecoder] = None

        self.hidden_size: int = hidden_size

        self.encoder_seq_length = encoder_seq_length
        self.decoder_seq_length = decoder_seq_length

        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
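A hedged usage sketch, assuming this constructor belongs to kashgari's experimental Seq2Seq task class and that the corpus variables are lists of token sequences:

    model = Seq2Seq(hidden_size=256,
                    encoder_seq_length=64,
                    decoder_seq_length=64)
    model.fit(x_train, y_train)  # x_train / y_train: source / target token sequences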
Example #6
    def test_embedding(self):
        text, label = ChineseDailyNerCorpus.load_data()
        is_bold = np.random.randint(1, 3, (len(text), 12))

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stack_embedding = StackedEmbedding(
            [text_embedding, num_feature_embedding])
        stack_embedding.analyze_corpus((text, is_bold), label)

        stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
        r = stack_embedding.embed((text[:3], is_bold[:3]))
        # 3 samples x 12 steps; 116 dims = text embedding (100 by default) + numeric-feature embedding (16)
        assert r.shape == (3, 12, 116)
Example #7
    def train(self, tokens, tags):

        x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=self.chunk_size)
        first_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='first_of_p',
            sequence_length=self.chunk_size)

        stack_embedding = StackedEmbedding(
            [text_embedding, first_of_p_embedding])

        stack_embedding.analyze_corpus(x, y)

        from kashgari.tasks.labeling import BiLSTM_CRF_Model
        self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
        self.model.fit(x, y, batch_size=1, epochs=20)
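A hedged call sketch; `tagger` is a hypothetical instance of the surrounding class with chunk_size set:

    tagger.train(tokens, tags)
    x, _ = tagger.prepare_data_fit(tokens, tags, chunk_size=tagger.chunk_size)
    print(tagger.model.predict(x))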
Example #8
    def test_multi_label(self):
        p = ClassificationProcessor(multi_label=True)
        embedding = BareEmbedding(task='classification', processor=p)
        model = self.model_class(embedding)
        model.fit(sample_train_x, sample_train_y, epochs=1)
        assert len(p.label2idx) == 3

        model.evaluate(sample_eval_x, sample_eval_y)
        assert isinstance(model.predict(sample_eval_x)[0], tuple)
        report_dict = model.evaluate(sample_eval_x,
                                     sample_eval_y,
                                     output_dict=True)
        assert isinstance(report_dict, dict)
        res = model.predict(valid_x[:20])
        model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        model.save(model_path)

        new_model = kashgari.utils.load_model(model_path)
        assert res == new_model.predict(valid_x[:20])
Example #9
        for layer in layers_rnn:  # loop header reconstructed (hypothetical name); snippet starts mid-method
            tensor_rnn = layer(tensor_rnn)
        tensor_sensors = [layer(tensor_rnn) for layer in layers_sensor]
        tensor_output = layer_allviews(tensor_sensors)
        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = tf.keras.Model(embed_model.inputs, tensor_output)


if __name__ == "__main__":
    print(BiLSTM_Model.get_default_hyper_parameters())
    logging.basicConfig(level=logging.DEBUG)
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()

    import kashgari
    from kashgari.processors.classification_processor import ClassificationProcessor
    from kashgari.embeddings import BareEmbedding

    processor = ClassificationProcessor(multi_label=False)
    embed = BareEmbedding(task=kashgari.CLASSIFICATION,
                          sequence_length=30,
                          processor=processor)
    m = BiLSTM_Model(embed)
    # m.build_model(x, y)
    m.fit(x, y, epochs=2)
    print(m.predict(x[:10]))
    # m.evaluate(x, y)
    print(m.predict_top_k_class(x[:10]))
Example #10
    def build_embedding(self):
        embedding = BareEmbedding()
        return embedding
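Such a hook makes swapping embeddings easy; a hedged override sketch using kashgari's WordEmbedding (the vector path is a placeholder):

    def build_embedding(self):
        # hypothetical: use pretrained word2vec vectors instead of a random BareEmbedding
        from kashgari.embeddings import WordEmbedding
        return WordEmbedding(w2v_path='path/to/vectors.txt')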
Example #11
    # or the GloVe-300 from http://nilc.icmc.usp.br/embeddings if that does not work out

    # 2 - Figure out how to run predict. We have to process the sentence the same way they do.
    # They use a PunktSentenceTokenizer with an abbrev_list. Those scripts are in the leNer-dataset folder.

    # 3 - Figure out how to integrate this code with the current webstruct
    # 4 - It would be a good idea to have a Broka-like interface, so there would be a list of files
    # that could be opened for re-training, opened with Ramon's plugin.
    # One idea would even be to convert their current dataset to today's Broka HTML format (it can be something simple, like putting each paragraph in a p tag)

    # 5 - Implement persistence (kashgari has a save/load method)


    # 2 - Increase epochs for training

    # You can use WordEmbedding or BERTEmbedding for your text embedding
    text_embedding = BareEmbedding(task=kashgari.LABELING)

    text_embedding.analyze_corpus(tokens, labels)

    # Now we can build any labeling model with this embedding

    from kashgari.tasks.labeling import BiLSTM_CRF_Model

    model = BiLSTM_CRF_Model(embedding=text_embedding)
    model.fit(tokens, labels, batch_size=8, epochs=10)

    print(model.predict(tokens))
    # print(model.predict_entities(x))
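Item 5 above (persistence) maps directly onto kashgari's save/load pair; a short sketch:

    model.save('ner_model')

    import kashgari.utils
    loaded_model = kashgari.utils.load_model('ner_model')
    print(loaded_model.predict(tokens[:5]))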