Example #1
    def build_token2id_label2id_dict(self,
                                     x_train: List[List[str]],
                                     y_train: List[str],
                                     x_validate: List[List[str]] = None,
                                     y_validate: List[str] = None):
        if x_validate:
            x_data = [*x_train, *x_validate]
            y_data = [*y_train, *y_validate]
        else:
            x_data = x_train
            y_data = y_train
        x_data_level = depth_count(x_data)
        if x_data_level > 2:
            # Flatten multi-input data down to a plain list of sentences.
            for _ in range(x_data_level - 2):
                x_data = list(chain(*x_data))

        # min_count=3: tokens seen fewer than 3 times are dropped.
        self.embedding.build_token2idx_dict(x_data, 3)

        if self.multi_label:
            # Each y item is itself a collection of labels.
            label_set = set()
            for i in y_data:
                label_set.update(i)
        else:
            label_set = set(y_data)

        if not self.label2idx:
            label2idx = {
                k.PAD: 0,
            }
            for idx, label in enumerate(label_set):
                label2idx[label] = idx + 1
            self._label2idx = label2idx
            self._idx2label = {val: key for key, val in label2idx.items()}
            self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))
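
All five examples rely on a depth_count helper (helper.depth_count in Example #2) that is not shown on this page. A minimal sketch of what it presumably does, assuming it reports how deeply lists are nested by probing the first element at each level:

from typing import Any

def depth_count(data: Any) -> int:
    # Hypothetical re-implementation: returns 1 for List[str],
    # 2 for List[List[str]], 3 for List[List[List[str]]], and so on.
    depth = 0
    while isinstance(data, list):
        depth += 1
        if not data:
            break
        data = data[0]
    return depth

assert depth_count(['a', 'b']) == 1
assert depth_count([['a', 'b'], ['c']]) == 2
assert depth_count([[['a']], [['b']]]) == 3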
Example #2
    def build_token2idx_dict(self,
                             x_data: List[TextSeqType],
                             min_count: int = 5):
        if self.token2idx is None:
            # Flatten any nesting down to a single stream of tokens.
            # (Starting from x_data also covers depth 1, where no
            # flattening is needed.)
            data_depth = helper.depth_count(x_data)
            x_items = x_data
            for _ in range(data_depth - 1):
                x_items = list(chain(*x_items))
            word_freq = Counter(x_items)
            # Most frequent tokens first, so they get the lowest indices.
            word2idx_list = sorted(word_freq.items(), key=lambda kv: -kv[1])

            word2idx = self.base_dict.copy()
            offset = len(word2idx)
            for idx, (word, freq) in enumerate(word2idx_list):
                if freq >= min_count:
                    word2idx[word] = idx + offset

            self.token2idx = word2idx
        self.build()
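
The frequency-threshold vocabulary logic above can be exercised on its own. A self-contained sketch; the special tokens below are stand-ins for whatever self.base_dict actually holds:

from collections import Counter
from itertools import chain

base_dict = {'<PAD>': 0, '<UNK>': 1}  # hypothetical special tokens
min_count = 2

corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['the', 'cat']]
word_freq = Counter(chain(*corpus))

# Most frequent words first; real indices start after the special tokens.
word2idx = base_dict.copy()
offset = len(word2idx)
for idx, (word, freq) in enumerate(sorted(word_freq.items(),
                                          key=lambda kv: -kv[1])):
    if freq >= min_count:
        word2idx[word] = idx + offset

print(word2idx)  # {'<PAD>': 0, '<UNK>': 1, 'the': 2, 'cat': 3, 'sat': 4}

Because the list is sorted by descending frequency, every word that clears min_count is assigned a contiguous index before the first rejected word is reached.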
Example #3
    def predict(self,
                sentence: Union[List[str], List[List[str]], List[List[List[str]]]],
                batch_size=None,
                output_dict=False,
                multi_label_threshold=0.6,
                debug_info=False) -> Union[List[str], str, List[Dict], Dict]:
        """
        predict with model
        :param sentence: single sentence as List[str] or list of sentence as List[List[str]]
        :param batch_size: predict batch_size
        :param output_dict: return dict with result with confidence
        :param multi_label_threshold:
        :param debug_info: print debug info using logging.debug when True
        :return:
        """
        # Normalize the input to depth 3: a list of input parts, each
        # holding a list of tokenized sentences.
        sentence_level = depth_count(sentence)
        if sentence_level == 2:
            sentence = [sentence]
        elif sentence_level == 1:
            sentence = [[sentence]]
        padded_tokens = []
        for i, sent_part in enumerate(sentence):
            tokens = self.embedding.tokenize(sent_part)
            # sequence_length is an int for single-input models and a
            # list with one entry per input part for multi-input models.
            if isinstance(self.embedding.sequence_length, list):
                seq_len = self.embedding.sequence_length[i]
            else:
                seq_len = self.embedding.sequence_length
            padded_tokens_part = sequence.pad_sequences(tokens,
                                                        maxlen=seq_len,
                                                        padding='post')
            padded_tokens.append(padded_tokens_part)
            if self.embedding.is_bert:
                # BERT models also expect an (all-zero) segment input.
                padded_tokens.append(np.zeros(shape=(len(padded_tokens_part),
                                                     seq_len)))

        x = padded_tokens
        res = self.model.predict(x, batch_size=batch_size)

        if self.multi_label:
            if debug_info:
                logging.info('raw output: {}'.format(res))
            res[res >= multi_label_threshold] = 1
            res[res < multi_label_threshold] = 0
            predict_result = res
        else:
            predict_result = res.argmax(-1)

        if debug_info:
            logging.info('input: {}'.format(x))
            logging.info('output: {}'.format(res))
            logging.info('output argmax: {}'.format(predict_result))

        if output_dict:
            words_list: List[List[str]] = sentence[0]
            results = []
            for index in range(len(words_list)):
                results.append(self._format_output_dic(words_list[index], res[index]))
            if sentence_level >= 2:
                return results
            elif sentence_level == 1:
                return results[0]
        else:
            if self.multi_label:
                results = self.multi_label_binarizer.inverse_transform(predict_result)
            else:
                results = self.convert_idx_to_label(predict_result)
            if sentence_level >= 2:
                return results
            elif sentence_level == 1:
                return results[0]
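
The multi-label thresholding inside predict is easy to check in isolation. A sketch with made-up sigmoid outputs:

import numpy as np

res = np.array([[0.9, 0.2, 0.7, 0.1],
                [0.3, 0.8, 0.5, 0.6]])  # fake scores: 2 samples x 4 labels
threshold = 0.6

# Same in-place binarization as predict: set scores that clear the
# threshold to 1 first, then zero out everything below it.
res[res >= threshold] = 1
res[res < threshold] = 0
print(res.astype(int))
# [[1 0 1 0]
#  [0 1 0 1]]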
Example #4
    def fit(self,
            x_train: Union[List[List[str]], List[List[List[str]]]],
            y_train: Union[List[str], List[List[str]], List[Tuple[str]]],
            x_validate: Union[List[List[str]], List[List[List[str]]]] = None,
            y_validate: Union[List[str], List[List[str]], List[Tuple[str]]] = None,
            batch_size: int = 64,
            epochs: int = 5,
            class_weight: bool = False,
            fit_kwargs: Dict = None,
            **kwargs):
        """

        :param x_train: list of training data.
        :param y_train: list of training target label data.
        :param x_validate: list of validation data.
        :param y_validate: list of validation target label data.
        :param batch_size: batch size for trainer model
        :param epochs: Number of epochs to train the model.
        :param class_weight: set class weights for imbalanced classes
        :param fit_kwargs: additional kwargs to be passed to
               :func:`~keras.models.Model.fit`
        :param kwargs:
        :return:
        """
        x_train_level = depth_count(x_train)
        if x_train_level == 2:
            assert len(x_train) == len(y_train)
        elif x_train_level > 2:
            # Multi-input: every input part must align with the labels.
            for x_part in x_train:
                assert len(x_part) == len(y_train)
        else:
            raise ValueError('x_train must be a list of token lists '
                             '(or a list of such parts for multi-input models)')

        if len(y_train) < batch_size:
            batch_size = len(y_train) // 2

        if not self.model:
            if isinstance(self.embedding.sequence_length, int):
                if self.embedding.sequence_length == 0:
                    # Pick the length at the 95th percentile of the sorted
                    # training lengths, so only ~5% of samples get truncated.
                    self.embedding.sequence_length = sorted([len(x) for x in x_train])[int(0.95 * len(x_train))]
                    logging.info('sequence length set to {}'.format(self.embedding.sequence_length))
            elif isinstance(self.embedding.sequence_length, list):
                seq_len = []
                for i, x_part in enumerate(x_train):
                    if self.embedding.sequence_length[i] == 0:
                        seq_len.append(max(sorted([len(x) for x in x_part])[int(0.95 * len(x_part))], 1))
                        logging.info(f'sequence_{i} length set to {seq_len[-1]}')
                    else:
                        seq_len.append(self.embedding.sequence_length[i])
                self.embedding.sequence_length = seq_len
            self.build_model(x_train, y_train, x_validate, y_validate)

        train_generator = self.get_data_generator(x_train,
                                                  y_train,
                                                  batch_size,
                                                  is_bert=self.embedding.is_bert)

        if fit_kwargs is None:
            fit_kwargs = {}

        if x_validate:
            validation_generator = self.get_data_generator(x_validate,
                                                           y_validate,
                                                           batch_size,
                                                           is_bert=self.embedding.is_bert)
            fit_kwargs['validation_data'] = validation_generator
            fit_kwargs['validation_steps'] = max(len(y_validate) // batch_size, 1)

        if class_weight:
            if self.multi_label:
                y_list = [self.convert_label_to_idx(y) for y in y_train]
                y_list = [y for ys in y_list for y in ys]
            else:
                y_list = self.convert_label_to_idx(y_train)
            class_weights = class_weight_calculte.compute_class_weight('balanced',
                                                                       np.unique(y_list),
                                                                       y_list)
        else:
            class_weights = None

        self.model.fit_generator(train_generator,
                                 steps_per_epoch=len(y_train) // batch_size,
                                 epochs=epochs,
                                 class_weight=class_weights,
                                 **fit_kwargs)
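
The sequence-length heuristic in fit (take the length found at the 95% position of the sorted training lengths) can be reproduced standalone:

import random

random.seed(0)
lengths = [random.randint(5, 120) for _ in range(100)]  # simulated sentence lengths

# Sorting and indexing at 0.95 * n picks a cutoff long enough that
# only about 5% of training samples will be truncated when padding.
seq_len = sorted(lengths)[int(0.95 * len(lengths))]
print(seq_len)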
Example #5
    def get_data_generator(self,
                           x_data: Union[List[List[str]], List[List[List[str]]]],
                           y_data: List[str],
                           batch_size: int = 64,
                           is_bert: bool = False):
        x_data_level = depth_count(x_data)
        if x_data_level == 2:
            x_data = [x_data]
        data_len = len(y_data)
        for x in x_data:
            assert len(x) == data_len
        while True:
            page_list = list(range((data_len // batch_size) + 1))
            random.shuffle(page_list)
            for page in page_list:
                start_index = page * batch_size
                end_index = start_index + batch_size
                target_x = []
                for x in x_data:
                    target_x.append(x[start_index: end_index])
                target_y = y_data[start_index: end_index]
                if len(target_x[0]) == 0:
                    # The trailing page is empty when data_len divides
                    # evenly by batch_size; rebuild target_x from scratch
                    # with the first batch instead of popping one part.
                    target_x = []
                    for x in x_data:
                        target_x.append(x[0: batch_size])
                    target_y = y_data[0: batch_size]

                padded_x = []
                for i, x in enumerate(target_x):
                    tokenized_x = self.embedding.tokenize(x)
                    # sequence_length is an int for single-input models and
                    # a list with one entry per part for multi-input models.
                    if isinstance(self.embedding.sequence_length, list):
                        seq_len = self.embedding.sequence_length[i]
                    else:
                        seq_len = self.embedding.sequence_length
                    padded_x.append(sequence.pad_sequences(tokenized_x,
                                                           maxlen=seq_len,
                                                           padding='post'))

                if self.multi_label:
                    # The binarizer was created with a fixed class list, so
                    # fitting on each batch still yields a stable encoding.
                    padded_y = self.multi_label_binarizer.fit_transform(target_y)
                else:
                    tokenized_y = self.convert_label_to_idx(target_y)
                    padded_y = to_categorical(tokenized_y,
                                              num_classes=len(self.label2idx),
                                              dtype=np.int32)  # np.int was removed from NumPy
                if is_bert:
                    if isinstance(self.embedding.sequence_length, int):
                        padded_x_seg = [np.zeros(shape=(len(padded_x_i),
                                                        self.embedding.sequence_length))
                                                            for padded_x_i in padded_x]
                    elif isinstance(self.embedding.sequence_length, list):
                        padded_x_seg = [np.zeros(shape=(len(padded_x_i),
                                                        self.embedding.sequence_length[i]))
                                                            for i, padded_x_i in enumerate(padded_x)]
                    x_input_data = list(chain(*[(x, x_seg)
                                    for x, x_seg in zip(padded_x, padded_x_seg)]))
                else:
                    x_input_data = padded_x[0] if x_data_level == 2 else padded_x
                yield (x_input_data, padded_y)
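
The paging pattern in get_data_generator (shuffle page indices each epoch, slice one batch per page, and fall back to the first batch when the trailing page is empty) can be sketched without the embedding machinery. batch_pages below is a hypothetical stripped-down version:

import random

def batch_pages(data, batch_size):
    data_len = len(data)
    while True:
        # One shuffled pass over all pages per epoch.
        pages = list(range(data_len // batch_size + 1))
        random.shuffle(pages)
        for page in pages:
            batch = data[page * batch_size:(page + 1) * batch_size]
            if not batch:
                # The trailing page is empty when data_len divides evenly
                # by batch_size; reuse the first batch, as the code above does.
                batch = data[:batch_size]
            yield batch

gen = batch_pages(list(range(8)), batch_size=4)
for _ in range(3):
    print(next(gen))  # the empty third page falls back to the first batch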