Example #1
    def build_embedding(self):
        sample_w2v_path = get_file(
            'sample_w2v.txt',
            "http://s3.bmio.net/kashgari/sample_w2v.txt",
            cache_dir=DATA_PATH)
        embedding = WordEmbedding(sample_w2v_path)
        return embedding
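For context, a minimal sketch of how the embedding returned by `build_embedding` could be attached to a model. The corpus and `BiLSTM_Model` are assumptions for illustration, not part of the original snippet; the kashgari 1.x API used throughout these examples is assumed.

import kashgari
from kashgari.corpus import SMP2018ECDTCorpus
from kashgari.embeddings import WordEmbedding
from kashgari.macros import DATA_PATH
from kashgari.tasks.classification import BiLSTM_Model
from tensorflow.python.keras.utils import get_file

# Download the sample word2vec file, as in the example above
sample_w2v_path = get_file('sample_w2v.txt',
                           "http://s3.bmio.net/kashgari/sample_w2v.txt",
                           cache_dir=DATA_PATH)
embedding = WordEmbedding(sample_w2v_path, task=kashgari.CLASSIFICATION)

# Attach the pre-trained embedding to a classifier (BiLSTM_Model is an assumption)
train_x, train_y = SMP2018ECDTCorpus.load_data('train')
model = BiLSTM_Model(embedding=embedding)
model.fit(train_x, train_y, epochs=1)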
Example #2
    def test_w2v_model(self):
        x, y = NERCorpus.load_corpus()
        w2v_embedding = WordEmbedding(sample_w2v_path, task=kashgari.LABELING)
        model = self.model_class(embedding=w2v_embedding)
        try:
            model.fit(x, y, x, y, epochs=1)
            model.evaluate(x, y)
            assert True
        except Exception as e:
            # Dump the label mapping to aid debugging, then re-raise
            print(model.label2idx)
            raise e
Example #3
    def test_with_word_embedding(self):
        w2v_embedding = WordEmbedding(TestMacros.w2v_path)
        model = self.TASK_MODEL_CLASS(embedding=w2v_embedding,
                                      sequence_length=120)
        train_x, train_y = TestMacros.load_labeling_corpus()
        valid_x, valid_y = train_x, train_y

        model.fit(train_x,
                  train_y,
                  x_validate=valid_x,
                  y_validate=valid_y,
                  epochs=self.EPOCH_COUNT)
Example #4
    def test_variable_length_model(self):
        x, y = NERCorpus.load_corpus('custom_2')
        hyper_params = self.model_class.get_default_hyper_parameters()

        for layer, config in hyper_params.items():
            for key, value in config.items():
                if isinstance(value, int):
                    hyper_params[layer][key] = value + 15

        w2v_embedding_variable_len = WordEmbedding(sample_w2v_path,
                                                   task=kashgari.LABELING,
                                                   sequence_length='variable')
        model = self.model_class(embedding=w2v_embedding_variable_len,
                                 hyper_parameters=hyper_params)
        try:
            model.fit(x, y, epochs=1)
            model.evaluate(x, y)
            assert True
        except Exception as e:
            print(model.label2idx)
            raise e
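The point of `sequence_length='variable'` above is that each batch is padded to its own longest sample instead of one fixed length, so the fitted model accepts inputs of any length. A hedged sketch, assuming the `model` trained in the test above:

# Hypothetical inputs of very different lengths; with a fixed sequence_length
# both would be padded/truncated to the same size, with 'variable' they are not.
short_sample = list('语言学')
long_sample = list('在语言结构研究与意义研究之间存在一个重要的主题划分')
print(model.predict([short_sample]))
print(model.predict([long_sample]))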
Example #5
    @classmethod
    def setUpClass(cls):
        cls.EPOCH_COUNT = 1
        cls.TASK_MODEL_CLASS = BiLSTM_Model
        cls.w2v_embedding = WordEmbedding(TestMacros.w2v_path)
Example #6
import kashgari
from kashgari.embeddings import WordEmbedding

# You need to specify the task for downstream training;
# if the embedding is only used for feature extraction, just set `task=kashgari.CLASSIFICATION`
embedding = WordEmbedding('sgns.sogou.word',
                          sequence_length=600)
# call for bulk embed
embed_tensor = embedding.embed([['语', '言', '模', '型']])

# call for single embed
embed_tensor = embedding.embed_one(['语', '言', '模', '型'])

print(embed_tensor)
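`embed` returns a numpy array of shape `(batch, sequence_length, embedding_dim)`. As a hedged illustration of the feature-extraction use mentioned in the comment above, token vectors can be mean-pooled into a single sentence vector; the pooling step is plain numpy arithmetic, not kashgari API:

token_tensor = embedding.embed([['语', '言', '模', '型']])  # shape (1, 600, embedding_dim)
sentence_vector = token_tensor.mean(axis=1)                # shape (1, embedding_dim)
print(sentence_vector.shape)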
Example #7
import logging

import kashgari
from kashgari.corpus import SMP2018ECDTCorpus
from kashgari.embeddings import BareEmbedding
from kashgari.embeddings import WordEmbedding
from kashgari.macros import DATA_PATH

from tensorflow.python.keras.utils import get_file

valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')

bert_path = get_file('bert_sample_model',
                     "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                     cache_dir=DATA_PATH,
                     untar=True)

sample_w2v_path = get_file('sample_w2v.txt',
                           "http://s3.bmio.net/kashgari/sample_w2v.txt",
                           cache_dir=DATA_PATH)

w2v_embedding = WordEmbedding(sample_w2v_path, task=kashgari.CLASSIFICATION)
w2v_embedding_variable_len = WordEmbedding(sample_w2v_path,
                                           task=kashgari.CLASSIFICATION,
                                           sequence_length='variable')

logging.basicConfig(level=logging.DEBUG)

sample_train_x = [
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学包含了几种分支领域。'),
    list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
]

sample_train_y = [['b', 'c'], ['a'], ['a', 'c'], ['a', 'b'], ['c']]
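Note that `sample_train_y` holds several labels per sample, i.e. a multi-label setup. As a simpler hedged sketch, the fixed-length embedding defined above can be trained on the single-label SMP2018 validation split that was already loaded; `BiLSTM_Model` is an assumption here, and any kashgari classification model should fit the same slot:

from kashgari.tasks.classification import BiLSTM_Model

model = BiLSTM_Model(embedding=w2v_embedding)
model.fit(valid_x, valid_y, valid_x, valid_y, epochs=1)
print(model.predict(valid_x[:3]))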
Example #8
        tensor = embed_model.output
        for layer in layer_stack:
            tensor = layer(tensor)

        self.tf_model: keras.Model = keras.Model(embed_model.inputs, tensor)


if __name__ == "__main__":
    import logging

    logging.basicConfig(level='DEBUG')

    from kashgari.embeddings import WordEmbedding

    w2v_path = '/Users/brikerman/Desktop/nlp/language_models/w2v/sgns.weibo.bigram-char'
    w2v = WordEmbedding(w2v_path, w2v_kwargs={'limit': 10000})

    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()

    model = BiLSTM_Model(embedding=w2v)
    model.fit(x, y)

    # Or integrate CorpusGenerator to implement your own data iterator
    # train_gen = CorpusGenerator()
    # model.fit_generator(train_gen=train_gen,
    #                     valid_gen=valid_gen,
    #                     batch_size=batch_size,
    #                     epochs=epochs)
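A hedged expansion of the commented generator path above, mirroring its keyword names; it assumes a kashgari version that ships `CorpusGenerator` (check your installed version's API before relying on this):

from kashgari.generators import CorpusGenerator

train_gen = CorpusGenerator(x, y)
valid_gen = CorpusGenerator(x, y)  # reusing training data purely for illustration
model.fit_generator(train_gen=train_gen,
                    valid_gen=valid_gen,
                    batch_size=64,
                    epochs=1)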
Example #9
import unittest

import numpy as np
import kashgari
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.embeddings import WordEmbedding
from kashgari.tasks.labeling import CNN_LSTM_Model
from kashgari.macros import DATA_PATH

from tensorflow.python.keras.utils import get_file

valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

sample_w2v_path = get_file('sample_w2v.txt',
                           "http://s3.bmio.net/kashgari/sample_w2v.txt",
                           cache_dir=DATA_PATH)

w2v_embedding = WordEmbedding(sample_w2v_path, task=kashgari.LABELING)
w2v_embedding_variable_len = WordEmbedding(sample_w2v_path,
                                           task=kashgari.LABELING,
                                           sequence_length='variable')


class TestCNN_LSTM_Model(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model_class = CNN_LSTM_Model

    def test_basic_use_build(self):
        model = self.model_class()
        model.fit(valid_x, valid_y, valid_x, valid_y, epochs=1)
        model.predict_entities(valid_x[:5])
        model.evaluate(valid_x[:100], valid_y[:100])
# print(f"train data count: {len(train_x)}")
# print(f"validate data count: {len(valid_x)}")
# print(f"test data count: {len(test_x)}")
from kashgari.embeddings import WordEmbedding
from kashgari.embeddings import BareEmbedding
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model_Attention
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model_WordSegmentation
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.tasks.labeling import BiLSTM_LSTMDecoder_Model
from kashgari.tasks.labeling import BiLSTM_CRF_Model_Position
from kashgari import callbacks_word


# bare_embed = BareEmbedding(task=kashgari.LABELING,sequence_length=500)
char_embed = WordEmbedding(w2v_path="/home/y182235017/law/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5",
                           task=kashgari.LABELING,
                           sequence_length=500)
# bert_embed = BERTEmbedding("/home/y182235017/law/chinese_L-12_H-768_A-12",task=kashgari.LABELING,sequence_length=500)
model = CNN_BiLSTM_CRF_Model_WordSegmentation(char_embed)
eval_callback = callbacks_word.EvalCallBack(model, test_x, test_y, batch_size=128,
                                            path="/home/y182235017/law/model/Word_CNN_BiLSTM_CRF_Model_seg/")
fit_kwargs = {"callbacks": [eval_callback]}
model.fit_without_generator_word(train_x,
                                 train_y,
                                 train_z,
                                 x_validate=test_x,
                                 y_validate=test_y,
                                 z_validate=test_z,
                                 epochs=20,
                                 batch_size=128,
                                 **fit_kwargs)
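After training, the stock kashgari flow is evaluate, save, and reload. Whether this fork's custom `*_WordSegmentation` classes keep these stock methods is an assumption, so treat this as a sketch:

model.evaluate(test_x, test_y)
model.save("/home/y182235017/law/model/Word_CNN_BiLSTM_CRF_Model_seg/final")

# Reload later with the stock kashgari loader
import kashgari
loaded_model = kashgari.utils.load_model("/home/y182235017/law/model/Word_CNN_BiLSTM_CRF_Model_seg/final")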