def test_init_with_processor(self):
        """Check that an embedding built around a pre-fitted processor embeds one sample.

        The 'valid' split of SMP2018ECDTCorpus is used to fit a
        ClassificationProcessor, which is handed to ``self.embedding_class``
        together with ``self.config``. The embedding then analyzes the same
        corpus, and a three-token sample must embed to an array of shape
        ``(20, self.embedding_size)``.
        """
        corpus_x, corpus_y = SMP2018ECDTCorpus.load_data('valid')

        fitted_processor = ClassificationProcessor()
        fitted_processor.analyze_corpus(corpus_x, corpus_y)

        embedding = self.embedding_class(
            sequence_length=20,
            processor=fitted_processor,
            **self.config)
        embedding.analyze_corpus(corpus_x, corpus_y)

        # First dimension mirrors the sequence_length=20 passed above.
        sample_tokens = ['我', '想', '看']
        embedded = embedding.embed_one(sample_tokens)
        assert embedded.shape == (20, self.embedding_size)
Exemple #2
0
    def __init__(self, component_config=None, model=None):
        """Configure the BERT embedding and tokenizer from the component config.

        Args:
            component_config: Forwarded to the parent initializer; every
                setting below is then read from ``self.component_config``.
            model: Stored unchanged as ``self.model``.
        """
        super(KashgariIntentClassifier, self).__init__(component_config)

        settings = self.component_config

        self.multi_label = settings.get('multi_label')
        self.split_symbol = settings.get('split_symbol')

        # This writes to kashgari's module-level config, so it affects more
        # than this instance.
        kashgari.config.use_cudnn_cell = settings.get('use_cudnn_cell')
        label_processor = ClassificationProcessor(multi_label=self.multi_label)

        self.classifier_model = settings.get('classifier_model')

        self.bert_embedding = BERTEmbedding(
            settings.get('bert_model_path'),
            task=kashgari.CLASSIFICATION,
            layer_nums=settings.get('layer_nums'),
            trainable=settings.get('trainable'),
            processor=label_processor,
            sequence_length=settings.get('sequence_length'),
        )

        # Expose the embedding's tokenizer directly on the component.
        self.tokenizer = self.bert_embedding.tokenizer

        self.model = model
Exemple #3
0
    def __init__(self,
                 task: str = None,
                 sequence_length: Union[int, str] = 'auto',
                 embedding_size: int = 100,
                 processor: Optional[BaseProcessor] = None,
                 from_saved_model: bool = False):
        """Store the embedding settings and pick a processor.

        Args:
            task: Task identifier; compared against ``kashgari.CLASSIFICATION``,
                ``kashgari.LABELING`` and ``kashgari.SCORING`` when no
                processor is supplied.
            sequence_length: Stored as ``self.sequence_length``.
            embedding_size: Stored as ``self.embedding_size``.
            processor: Processor to use as-is. When ``None``, a default
                processor is created from ``task``.
            from_saved_model: Accepted but not read in this initializer.

        Raises:
            ValueError: If ``processor`` is ``None`` and ``task`` matches none
                of the three known task constants.
        """
        self.task = task
        self.embedding_size = embedding_size

        # An explicit processor always wins; otherwise derive one from the task.
        if processor is not None:
            self.processor = processor
        elif task == kashgari.CLASSIFICATION:
            self.processor = ClassificationProcessor()
        elif task == kashgari.LABELING:
            self.processor = LabelingProcessor()
        elif task == kashgari.SCORING:
            self.processor = ScoringProcessor()
        else:
            raise ValueError(
                'Need to set the processor param, value: {labeling, classification, scoring}'
            )

        self.sequence_length: Union[int, str] = sequence_length
        # Both start out empty here.
        self.embed_model: Optional[keras.Model] = None
        self._tokenizer = None
Exemple #4
0
    def test_init_with_processor(self):
        """Check the embedded shape of one sample when a fitted processor is supplied.

        A ClassificationProcessor is fitted on the 'valid' split of
        SMP2018ECDTCorpus and passed to ``self.embedding_class`` together with
        ``self.config``. For BareEmbedding the config is first given an
        ``embedding_size`` of 55. The expected sequence dimension is 16 for
        BERTEmbedding and 20 otherwise.
        """
        corpus_x, corpus_y = SMP2018ECDTCorpus.load_data('valid')

        fitted_processor = ClassificationProcessor()
        fitted_processor.analyze_corpus(corpus_x, corpus_y)

        # NOTE: this writes into self.config itself, not into a copy.
        if self.embedding_class is BareEmbedding:
            self.config['embedding_size'] = 55

        embedding = self.embedding_class(
            sequence_length=20,
            processor=fitted_processor,
            **self.config)

        # NOTE(review): 16 is presumably a limit of the BERT model used in the
        # tests rather than the requested 20 - confirm.
        seq_len = 16 if self.embedding_class is BERTEmbedding else 20

        embedded = embedding.embed_one(['我', '想', '看'])
        assert embedded.shape == (seq_len, embedding.embedding_size)
Exemple #5
0
 def __init__(self, hyper_parameters):
     """Create a single-label BERT embedding from ``hyper_parameters['model']``.

     Args:
         hyper_parameters: Mapping whose ``'model'`` entry provides
             ``'bert_model_path'``, ``'layer_nums'`` and ``'trainable'``.
     """
     # This writes to kashgari's module-level config, so it affects more than
     # this instance.
     kashgari.config.use_cudnn_cell = False
     single_label_processor = ClassificationProcessor(multi_label=False)

     model_params = hyper_parameters['model']
     self.bert_embedding = BERTEmbedding(
         model_params['bert_model_path'],
         task=kashgari.CLASSIFICATION,
         layer_nums=model_params['layer_nums'],
         trainable=model_params['trainable'],
         processor=single_label_processor,
         sequence_length='auto',
     )

     # Prints the size of the tokenizer's inverse token dictionary.
     inverse_token_dict = self.bert_embedding._tokenizer._token_dict_inv
     print(len(inverse_token_dict))

     self.tokenizer = self.bert_embedding.tokenizer
Exemple #6
0
    def test_multi_label_processor(self):
        """Fit a multi-label processor on the sample corpus and expect three labels.

        The processed x and y datasets are printed; only the size of
        ``label2idx`` is asserted.
        """
        processor = ClassificationProcessor(multi_label=True)
        processor.analyze_corpus(sample_train_x, sample_train_y)
        assert len(processor.label2idx) == 3

        processed_x = processor.process_x_dataset(sample_train_x)
        print(processed_x)
        processed_y = processor.process_y_dataset(sample_train_y)
        print(processed_y)
Exemple #7
0
    def test_multi_label(self):
        """Train ``self.model_class`` for one epoch on multi-label data and check its outputs.

        Asserts that the processor ends up with three labels, that the first
        prediction is a tuple, and that ``evaluate(..., output_dict=True)``
        returns a dict.
        """
        processor = ClassificationProcessor(multi_label=True)
        bare_embedding = BareEmbedding(task='classification', processor=processor)
        model = self.model_class(bare_embedding)

        model.fit(sample_train_x, sample_train_y, epochs=1)
        assert len(processor.label2idx) == 3

        # Plain evaluation first; its return value is not inspected.
        model.evaluate(sample_eval_x, sample_eval_y)

        first_prediction = model.predict(sample_eval_x)[0]
        assert isinstance(first_prediction, tuple)

        report = model.evaluate(sample_eval_x, sample_eval_y, output_dict=True)
        assert isinstance(report, dict)
Exemple #8
0
    def test_processor(self):
        """Round-trip a ClassificationProcessor through ``to_dict`` / ``load_data_object``.

        The restored processor must transform labels 20..39 to the same
        indices as the original and must invert those indices back to the
        original labels.
        """
        x_set, y_set = TestMacros.load_classification_corpus()

        original = ClassificationProcessor()
        original.build_vocab(x_set, y_set)

        window = slice(20, 40)
        expected_idx = original.transform(y_set[window])

        restored: ClassificationProcessor = load_data_object(original.to_dict())

        restored_idx = restored.transform(y_set[window])
        assert (expected_idx == restored_idx).all()
        assert y_set[window] == restored.inverse_transform(expected_idx)
Exemple #9
0
    def test_multi_label(self):
        """Train a multi-label model, check its outputs, then verify a save/load round trip.

        After the fit/evaluate/predict checks, the model is saved under a
        timestamp-named path in the system temp directory. The reloaded model
        must predict the same results for the first 20 ``valid_x`` samples.
        """
        processor = ClassificationProcessor(multi_label=True)
        bare_embedding = BareEmbedding(task='classification', processor=processor)
        model = self.model_class(bare_embedding)

        model.fit(sample_train_x, sample_train_y, epochs=1)
        assert len(processor.label2idx) == 3

        # Plain evaluation first; its return value is not inspected.
        model.evaluate(sample_eval_x, sample_eval_y)

        first_prediction = model.predict(sample_eval_x)[0]
        assert isinstance(first_prediction, tuple)

        report = model.evaluate(sample_eval_x, sample_eval_y, output_dict=True)
        assert isinstance(report, dict)

        # Predictions from the in-memory model are the reference for the reload.
        expected = model.predict(valid_x[:20])

        tmp_root = tempfile.gettempdir()
        stamp = str(time.time())
        model_path = os.path.join(tmp_root, stamp)
        model.save(model_path)

        reloaded = kashgari.utils.load_model(model_path)
        assert expected == reloaded.predict(valid_x[:20])
Exemple #10
0
    def test_multi_label_processor(self):
        """Round-trip a multi-label processor built from the Jigsaw mini corpus.

        The vocabulary is built from a CorpusGenerator. The processor is then
        serialized with ``to_dict`` and restored with ``load_data_object``.
        The restored processor must transform labels 20..39 identically, and
        inverting the indices must give back the same labels per sample,
        ignoring order.
        """
        from kashgari.corpus import JigsawToxicCommentCorpus

        corpus = JigsawToxicCommentCorpus(TestMacros.jigsaw_mini_corpus_path)
        x_set, y_set = corpus.load_data()

        generator = CorpusGenerator(x_set, y_set)

        original = ClassificationProcessor(multi_label=True)
        original.build_vocab_generator([generator])

        window = slice(20, 40)
        expected_idx = original.transform(y_set[window])

        restored: ClassificationProcessor = load_data_object(original.to_dict())
        assert (expected_idx == restored.transform(y_set[window])).all()

        original_labels = y_set[window]
        recovered_labels = restored.inverse_transform(expected_idx)
        # Sort both sides so label order within a sample is not compared.
        for wanted, got in zip(original_labels, recovered_labels):
            assert sorted(wanted) == sorted(got)
Exemple #11
0
 def setUpClass(cls):
     """Create the class-level ClassificationProcessor and fit it on the training corpus."""
     # Bind to the class first, then analyze the corpus.
     cls.processor = fitted = ClassificationProcessor()
     fitted.analyze_corpus(class_train_x, class_train_y)