# Module-level imports these methods rely on (they belong at the top of the file):
import numpy as np
import pandas as pd
import tensorflow as tf
from pandas import read_csv


    def build_vocab(self, train_path):
        """
        Build the vocabulary and label table from the training set.
        :param train_path: path to the training file
        :return: token2id, id2token, label2id, id2label
        """
        df_train = read_csv(train_path,
                            names=['token', 'label'],
                            delimiter=self.configs.delimiter)
        token2id, id2token = {}, {}
        if not self.configs.use_bert:
            tokens = list(set(df_train['token'][df_train['token'].notnull()]))
            token2id = dict(zip(tokens, range(1, len(tokens) + 1)))
            id2token = dict(zip(range(1, len(tokens) + 1), tokens))
            id2token[0] = self.PADDING
            token2id[self.PADDING] = 0
            # add [UNK] to the generated vocabulary
            id2token[len(tokens) + 1] = self.UNKNOWN
            token2id[self.UNKNOWN] = len(tokens) + 1
            # save the vocabulary (the label table is saved separately below)
            with open(self.token2id_file, 'w', encoding='utf-8') as outfile:
                for idx in id2token:
                    outfile.write(id2token[idx] + '\t' + str(idx) + '\n')

        labels = list(set(df_train['label'][df_train['label'].notnull()]))
        label2id = dict(zip(labels, range(1, len(labels) + 1)))
        id2label = dict(zip(range(1, len(labels) + 1), labels))
        id2label[0] = self.PADDING
        label2id[self.PADDING] = 0
        with open(self.label2id_file, 'w', encoding='utf-8') as outfile:
            for idx in id2label:
                outfile.write(id2label[idx] + '\t' + str(idx) + '\n')
        return token2id, id2token, label2id, id2label
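    # A minimal sketch of reading back the vocabulary file that build_vocab writes
    # (one "token\tid" pair per line). This helper is an illustration and not part
    # of the original class; the name load_token2id is an assumption.
    @staticmethod
    def load_token2id(token2id_file):
        token2id = {}
        with open(token2id_file, encoding='utf-8') as infile:
            for line in infile:
                token, idx = line.rstrip('\n').split('\t')
                token2id[token] = int(idx)
        return token2id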
    def get_valid_set(self):
        """
        Load the validation set.
        :return: X_val, y_val (plus att_mask_val when BERT is used)
        """
        df_val = read_csv(self.dev_file,
                          names=['token', 'label'],
                          delimiter=self.configs.delimiter)
        if self.configs.use_bert:
            # BERT path: prepare_bert_embedding also returns the attention mask,
            # matching the three-value unpacking in get_training_set
            X_val, y_val, att_mask_val = self.prepare_bert_embedding(df_val)
            return X_val, y_val, att_mask_val
        df_val['token_id'] = df_val.token.map(
            lambda x: self.map_func(x, self.token2id))
        df_val['label_id'] = df_val.label.map(
            lambda x: -1 if pd.isna(x) else self.label2id[x])
        X_val, y_val = self.prepare(df_val['token_id'], df_val['label_id'])
        return X_val, y_val
    def build_labels(self, train_path):
        """
        Build the label table from the training set.
        :param train_path: path to the training file
        :return: label2id, id2label
        """
        df_train = read_csv(train_path,
                            names=['token', 'label'],
                            delimiter=self.configs.delimiter)
        labels = list(set(df_train['label'][df_train['label'].notnull()]))
        label2id = dict(zip(labels, range(1, len(labels) + 1)))
        id2label = dict(zip(range(1, len(labels) + 1), labels))
        # add [PAD] to the generated label table
        id2label[0] = self.PADDING
        label2id[self.PADDING] = 0
        # save the label table
        with open(self.label2id_file, 'w', encoding='utf-8') as outfile:
            for idx in id2label:
                outfile.write(id2label[idx] + '\t' + str(idx) + '\n')
        return label2id, id2label
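    # For illustration: with a typical BIO tagging scheme (the example labels are
    # assumptions) the tables built above look like the following; the exact ids
    # depend on set iteration order and vary between runs:
    #   label2id = {'[PAD]': 0, 'O': 1, 'B-PER': 2, 'I-PER': 3, ...}
    #   id2label = {0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', ...}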
    def get_training_set(self, train_val_ratio=0.9):
        """
        Build the training and validation datasets.
        :param train_val_ratio: fraction of samples used for training when no dev file is given
        :return: train_dataset, val_dataset
        """
        df_train = read_csv(self.train_file,
                            names=['token', 'label'],
                            delimiter=self.configs.delimiter)
        if self.configs.use_bert:
            X, y, att_mask = self.prepare_bert_embedding(df_train)
            # shuffle the samples
            num_samples = len(X)
            indices = np.arange(num_samples)
            np.random.shuffle(indices)
            X = X[indices]
            y = y[indices]
            att_mask = att_mask[indices]

            if self.dev_file is not None:
                X_train, y_train, att_mask_train = X, y, att_mask
                X_val, y_val, att_mask_val = self.get_valid_set()
            else:
                # no dev file: split the shuffled data into training and validation sets
                split = int(num_samples * train_val_ratio)
                X_train, y_train, att_mask_train = X[:split], y[:split], att_mask[:split]
                X_val, y_val, att_mask_val = X[split:], y[split:], att_mask[split:]
                self.logger.info(
                    'validation set does not exist, built one from the training data...')
            self.logger.info(
                'training set size: {}, validation set size: {}'.format(
                    len(X_train), len(X_val)))
            train_dataset = tf.data.Dataset.from_tensor_slices(
                (X_train, y_train, att_mask_train))
            val_dataset = tf.data.Dataset.from_tensor_slices(
                (X_val, y_val, att_mask_val))
        else:
            # map the tokens and labels to ids
            df_train['token_id'] = df_train.token.map(
                lambda x: self.map_func(x, self.token2id))
            df_train['label_id'] = df_train.label.map(
                lambda x: -1 if pd.isna(x) else self.label2id[x])
            # convert the data into padded matrices
            X, y = self.prepare(df_train['token_id'], df_train['label_id'])
            # shuffle the samples
            num_samples = len(X)
            indices = np.arange(num_samples)
            np.random.shuffle(indices)
            X = X[indices]
            y = y[indices]
            if self.dev_file is not None:
                X_train, y_train = X, y
                X_val, y_val = self.get_valid_set()
            else:
                # no dev file: split the shuffled data into training and validation sets
                split = int(num_samples * train_val_ratio)
                X_train, y_train = X[:split], y[:split]
                X_val, y_val = X[split:], y[split:]
                self.logger.info(
                    'validation set does not exist, built one from the training data...')
            self.logger.info(
                'training set size: {}, validation set size: {}'.format(
                    len(X_train), len(X_val)))
            train_dataset = tf.data.Dataset.from_tensor_slices(
                (X_train, y_train))
            val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
        return train_dataset, val_dataset
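    # A minimal usage sketch, assuming `manager` is an instance of this class and
    # that configs.batch_size exists (both names are assumptions for illustration).
    # The returned tf.data.Dataset objects are typically shuffled and batched
    # before being fed to the training loop:
    #
    #     train_dataset, val_dataset = manager.get_training_set(train_val_ratio=0.9)
    #     train_batches = train_dataset.shuffle(buffer_size=1024).batch(configs.batch_size)
    #     val_batches = val_dataset.batch(configs.batch_size)
    #     # each batch unpacks to (X, y) or (X, y, att_mask) depending on use_bert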