def build_vocab(self, train_path):
    """
    Build the vocabulary from the training set.
    :param train_path:
    :return:
    """
    df_train = read_csv(train_path, names=['token', 'label'], delimiter=self.configs.delimiter)
    token2id, id2token = {}, {}
    if not self.configs.use_bert:
        tokens = list(set(df_train['token'][df_train['token'].notnull()]))
        token2id = dict(zip(tokens, range(1, len(tokens) + 1)))
        id2token = dict(zip(range(1, len(tokens) + 1), tokens))
        id2token[0] = self.PADDING
        token2id[self.PADDING] = 0
        # add [UNK] to the generated vocabulary
        id2token[len(tokens) + 1] = self.UNKNOWN
        token2id[self.UNKNOWN] = len(tokens) + 1
        # save the vocabulary
        with open(self.token2id_file, 'w', encoding='utf-8') as outfile:
            for idx in id2token:
                outfile.write(id2token[idx] + '\t' + str(idx) + '\n')
    # build and save the label map
    labels = list(set(df_train['label'][df_train['label'].notnull()]))
    label2id = dict(zip(labels, range(1, len(labels) + 1)))
    id2label = dict(zip(range(1, len(labels) + 1), labels))
    id2label[0] = self.PADDING
    label2id[self.PADDING] = 0
    with open(self.label2id_file, 'w', encoding='utf-8') as outfile:
        for idx in id2label:
            outfile.write(id2label[idx] + '\t' + str(idx) + '\n')
    return token2id, id2token, label2id, id2label
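# The vocabulary file written above holds one "token\tid" pair per line.
# Below is a minimal, hypothetical sketch of a matching loader that
# rebuilds the two mappings from self.token2id_file; it is not part of
# the original class, just an illustration of the file format.
def load_vocab(self):
    token2id, id2token = {}, {}
    with open(self.token2id_file, 'r', encoding='utf-8') as infile:
        for line in infile:
            token, idx = line.rstrip('\n').split('\t')
            token2id[token] = int(idx)
            id2token[int(idx)] = token
    return token2id, id2token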
def get_valid_set(self):
    """
    Load the validation set.
    :return:
    """
    df_val = read_csv(self.dev_file, names=['token', 'label'], delimiter=self.configs.delimiter)
    X_val, y_val, att_mask_val = self.prepare(df_val)
    return X_val, y_val, att_mask_val
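# prepare(df_val) is defined outside this section. In the BERT
# configuration it has to return padded token-id sequences, padded label
# sequences, and an attention mask. The sketch below is one plausible
# shape for it, assuming a Hugging Face BertTokenizer, a max_len cut-off,
# and blank rows (NaN tokens) acting as sentence separators; the real
# method may differ (e.g. it may add [CLS]/[SEP] tokens).
import numpy as np
from transformers import BertTokenizer

def prepare_bert_sketch(df, label2id, max_len=128):
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # assumed checkpoint
    X, y, att_mask = [], [], []

    def flush(tokens, labels):
        ids = tokenizer.convert_tokens_to_ids(tokens)[:max_len]
        lab = [label2id[l] for l in labels][:max_len]
        pad = max_len - len(ids)
        X.append(ids + [0] * pad)
        y.append(lab + [0] * pad)
        att_mask.append([1] * len(ids) + [0] * pad)

    tokens, labels = [], []
    for token, label in zip(df['token'], df['label']):
        if str(token) == str(np.nan):  # blank row = sentence boundary
            if tokens:
                flush(tokens, labels)
            tokens, labels = [], []
        else:
            tokens.append(str(token))
            labels.append(label)
    if tokens:  # flush a trailing sentence with no final blank row
        flush(tokens, labels)
    return np.array(X), np.array(y), np.array(att_mask)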
def get_valid_set(self):
    """
    Load the validation set.
    :return:
    """
    df_val = read_csv(self.dev_file, names=['token', 'label'], delimiter=self.configs.delimiter)
    # map tokens and labels to ids; blank separator rows (NaN) become -1
    df_val['token_id'] = df_val.token.map(lambda x: self.map_func(x, self.token2id))
    df_val['label_id'] = df_val.label.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    X_val, y_val = self.prepare(df_val['token_id'], df_val['label_id'])
    return X_val, y_val
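# map_func is referenced above but defined elsewhere. A plausible minimal
# version is sketched below (an assumption, not the original code): blank
# separator rows arrive as NaN and keep the -1 boundary marker, while
# tokens missing from the vocabulary fall back to the [UNK] id.
def map_func(self, x, token2id):
    if str(x) == str(np.nan):
        return -1
    if x not in token2id:
        return token2id[self.UNKNOWN]
    return token2id[x]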
def build_labels(self, train_path):
    """
    Build the label map from the training set.
    :param train_path:
    :return:
    """
    df_train = read_csv(train_path, names=['token', 'label'], delimiter=self.configs.delimiter)
    labels = list(set(df_train['label'][df_train['label'].notnull()]))
    label2id = dict(zip(labels, range(1, len(labels) + 1)))
    id2label = dict(zip(range(1, len(labels) + 1), labels))
    # add [PAD] to the generated label map
    id2label[0] = self.PADDING
    label2id[self.PADDING] = 0
    # save the label map
    with open(self.label2id_file, 'w', encoding='utf-8') as outfile:
        for idx in id2label:
            outfile.write(id2label[idx] + '\t' + str(idx) + '\n')
    return label2id, id2label
def get_training_set(self, train_val_ratio=0.9):
    """
    Load the training and validation sets.
    :param train_val_ratio:
    :return:
    """
    df_train = read_csv(self.train_file, names=['token', 'label'], delimiter=self.configs.delimiter)
    # map tokens and labels to ids; blank separator rows (NaN) become -1
    df_train['token_id'] = df_train.token.map(lambda x: -1 if str(x) == str(np.nan) else self.token2id[x])
    df_train['label_id'] = df_train.label.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
    # convert the data into matrices
    X, y = self.prepare(df_train['token_id'], df_train['label_id'])
    # shuffle the samples
    num_samples = len(X)
    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    if self.dev_file is not None:
        X_train = X
        y_train = y
        X_val, y_val = self.get_valid_set()
    else:
        # split the data into training and validation sets
        X_train = X[:int(num_samples * train_val_ratio)]
        y_train = y[:int(num_samples * train_val_ratio)]
        X_val = X[int(num_samples * train_val_ratio):]
        y_val = y[int(num_samples * train_val_ratio):]
        self.logger.info('validation set does not exist, built one from the training set...')
    self.logger.info('training set size: {}, validating set size: {}'.format(len(X_train), len(X_val)))
    return X_train, y_train, X_val, y_val
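# The non-BERT prepare(token_ids, label_ids) is also defined outside this
# section. A minimal sketch under two assumptions: -1 marks sentence
# boundaries (see the map_func sketch above), and self.max_sequence_length
# is the configured padding length (the attribute name is a guess).
def prepare_ids_sketch(self, token_ids, label_ids):
    max_len = self.max_sequence_length  # assumed attribute
    X, y = [], []
    sentence, labels = [], []
    for t, l in zip(token_ids, label_ids):
        if t == -1:  # sentence boundary
            if sentence:
                pad = max_len - len(sentence[:max_len])
                X.append(sentence[:max_len] + [0] * pad)  # 0 is the [PAD] id
                y.append(labels[:max_len] + [0] * pad)
            sentence, labels = [], []
        else:
            sentence.append(t)
            labels.append(l)
    if sentence:  # trailing sentence without a final separator
        pad = max_len - len(sentence[:max_len])
        X.append(sentence[:max_len] + [0] * pad)
        y.append(labels[:max_len] + [0] * pad)
    return np.array(X), np.array(y)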
def get_training_set(self, train_val_ratio=0.9):
    """
    Load the training and validation sets.
    :param train_val_ratio:
    :return:
    """
    df_train = read_csv(self.train_file, names=['token', 'label'], delimiter=self.configs.delimiter)
    X, y, att_mask = self.prepare(df_train)
    # shuffle the samples
    num_samples = len(X)
    indices = np.arange(num_samples)
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    att_mask = att_mask[indices]
    if self.dev_file is not None:
        X_train = X
        y_train = y
        att_mask_train = att_mask
        X_val, y_val, att_mask_val = self.get_valid_set()
    else:
        # split the data into training and validation sets
        X_train = X[:int(num_samples * train_val_ratio)]
        y_train = y[:int(num_samples * train_val_ratio)]
        att_mask_train = att_mask[:int(num_samples * train_val_ratio)]
        X_val = X[int(num_samples * train_val_ratio):]
        y_val = y[int(num_samples * train_val_ratio):]
        att_mask_val = att_mask[int(num_samples * train_val_ratio):]
        self.logger.info('validation set does not exist, built one from the training set...')
    self.logger.info('training set size: {}, validating set size: {}'.format(len(X_train), len(X_val)))
    return X_train, y_train, att_mask_train, X_val, y_val, att_mask_val
def get_training_set(self, train_val_ratio=0.9):
    """
    Load the training and validation sets.
    :param train_val_ratio:
    :return:
    """
    df_train = read_csv(self.train_file, names=['token', 'label'], delimiter=self.configs.delimiter)
    if self.configs.use_bert:
        X, y, att_mask = self.prepare_bert_embedding(df_train)
        # shuffle the samples
        num_samples = len(X)
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]
        att_mask = att_mask[indices]
        if self.dev_file is not None:
            X_train = X
            y_train = y
            att_mask_train = att_mask
            X_val, y_val, att_mask_val = self.get_valid_set()
        else:
            # split the data into training and validation sets
            X_train = X[:int(num_samples * train_val_ratio)]
            y_train = y[:int(num_samples * train_val_ratio)]
            att_mask_train = att_mask[:int(num_samples * train_val_ratio)]
            X_val = X[int(num_samples * train_val_ratio):]
            y_val = y[int(num_samples * train_val_ratio):]
            att_mask_val = att_mask[int(num_samples * train_val_ratio):]
            self.logger.info('validation set does not exist, built one from the training set...')
        self.logger.info('training set size: {}, validating set size: {}'.format(len(X_train), len(X_val)))
        train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train, att_mask_train))
        val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val, att_mask_val))
    else:
        # map tokens and labels to ids; blank separator rows (NaN) become -1
        df_train['token_id'] = df_train.token.map(lambda x: self.map_func(x, self.token2id))
        df_train['label_id'] = df_train.label.map(lambda x: -1 if str(x) == str(np.nan) else self.label2id[x])
        # convert the data into matrices
        X, y = self.prepare(df_train['token_id'], df_train['label_id'])
        # shuffle the samples
        num_samples = len(X)
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]
        if self.dev_file is not None:
            X_train = X
            y_train = y
            X_val, y_val = self.get_valid_set()
        else:
            # split the data into training and validation sets
            X_train = X[:int(num_samples * train_val_ratio)]
            y_train = y[:int(num_samples * train_val_ratio)]
            X_val = X[int(num_samples * train_val_ratio):]
            y_val = y[int(num_samples * train_val_ratio):]
            self.logger.info('validation set does not exist, built one from the training set...')
        self.logger.info('training set size: {}, validating set size: {}'.format(len(X_train), len(X_val)))
        train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
        val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
    return train_dataset, val_dataset
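# Usage sketch: how the returned tf.data.Dataset objects would typically
# be consumed. The data_manager name, batch size, and training-loop shape
# are illustrative assumptions, not part of the original code.
train_dataset, val_dataset = data_manager.get_training_set(train_val_ratio=0.9)
train_batches = train_dataset.shuffle(buffer_size=10000).batch(32)
val_batches = val_dataset.batch(32)
for batch in train_batches:
    if data_manager.configs.use_bert:
        X_batch, y_batch, att_mask_batch = batch
    else:
        X_batch, y_batch = batch
    # ... forward pass, loss, and gradient update go here ...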