def _wrong_message(_idx, ts):
    # `verbose`, `token_list`, `tags` and `start` come from the enclosing
    # scope; this helper is meant to be nested inside a tag-checking routine.
    if verbose:
        logging.info(token_list)
        logging.info(tags)
    logging.warning('wrong tag: {}'.format(
        ts[start if start is not None else max(0, _idx - 2): _idx + 2]))

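
# `_wrong_message` above reads `verbose`, `token_list`, `tags` and `start` from
# its enclosing scope, so it only makes sense as a nested helper. The sketch
# below shows the assumed closure layout; the enclosing name
# `_check_tag_sequence` and its loop body are illustrative only, not this
# module's actual code.
def _check_tag_sequence(token_list, tags, verbose=False):
    # assumed closure variable: `start` is the opening index of the entity
    # currently being parsed, or None when no entity is open
    start = None

    def _wrong_message(_idx, ts):
        if verbose:
            logging.info(token_list)
            logging.info(tags)
        logging.warning('wrong tag: {}'.format(
            ts[start if start is not None else max(0, _idx - 2): _idx + 2]))

    for _idx, _tag in enumerate(tags):
        # ... validate tag transitions here and call _wrong_message(_idx, tags)
        # whenever an illegal transition is found ...
        pass
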
def analyse_dataset(dataset_x, dataset_y, ratio=[0.8, 0.05, 0.15],
                    shuffle=True, multi_label=False):
    """ Split a text-classification dataset into train, validation and test
    subsets, report the count and proportion of every class in each subset,
    and compute the relative entropy (KL divergence) of each subset against
    the full dataset to judge whether the split is reasonable. The lower the
    `info dismatch` ratio, the closer the class distribution of a subset is
    to that of the full dataset.

    Args:
        dataset_x: the input samples of the dataset
        dataset_y: the corresponding output labels
        ratio: proportions of the train, validation and test subsets
        shuffle: whether to shuffle the dataset before splitting
        multi_label: whether the dataset is multi-label

    Return:
        train_x, train_y, valid_x, valid_y, test_x, test_y, stats(dict):
            stats holds the statistics of the split (counts, proportions,
            KL divergence).

    Examples:
        >>> import jionlp as jio
        >>> dataset_x = ['美股大涨...', '金融市场开放...', '小米无屏电视...', ...]
        >>> dataset_y = ['财经', '财经', '科技', ...]
        >>> train_x, train_y, valid_x, valid_y, test_x, test_y, stats = \
        ...     jio.text_classification.analyse_dataset(dataset_x, dataset_y)
        >>> print(stats)
            whole dataset:
            财经        32,268     84.52%
            科技         5,910     15.48%
            total       38,178    100.00%
            train dataset: 80.00%
            财经        25,848     84.63%
            科技         4,694     15.37%
            total       30,542    100.00%
            valid dataset: 5.00%
            财经         1,579     82.76%
            科技           329     17.24%
            total        1,908    100.00%
            test dataset: 15.00%
            财经         4,840     84.53%
            科技           886     15.47%
            total        5,726    100.00%
            train KL divergence: 0.000007, info dismatch: 0.00%
            valid KL divergence: 0.001616, info dismatch: 0.26%
            test KL divergence: 0.000000, info dismatch: 0.00%

    """
    dataset = [[sample_x, sample_y]
               for sample_x, sample_y in zip(dataset_x, dataset_y)]
    if shuffle:
        random.shuffle(dataset)

    has_kl = False
    for _ in range(3):
        # To obtain a good split, the dataset is re-split whenever the current
        # split is poor (high KL divergence, or a class missing from some
        # subset). If three attempts all fail, the last split is returned.

        # count and proportion of each class over the whole dataset
        stats = {'train': None, 'valid': None, 'test': None, 'total': None}
        dataset_stat = _stat_class(dataset_y, multi_label=multi_label)
        stats['total'] = dataset_stat

        tmp_ds = list()
        current = 0
        for s in ratio:
            num = int(len(dataset) * s)
            tmp_ds.append(dataset[current: current + num])
            current += num

        train_x = [item[0] for item in tmp_ds[0]]
        train_y = [item[1] for item in tmp_ds[0]]
        valid_x = [item[0] for item in tmp_ds[1]]
        valid_y = [item[1] for item in tmp_ds[1]]
        test_x = [item[0] for item in tmp_ds[2]]
        test_y = [item[1] for item in tmp_ds[2]]

        # class statistics of each subset
        train_stat = _stat_class(train_y, multi_label=multi_label)
        stats['train'] = train_stat
        valid_stat = _stat_class(valid_y, multi_label=multi_label)
        stats['valid'] = valid_stat
        test_stat = _stat_class(test_y, multi_label=multi_label)
        stats['test'] = test_stat

        if not (len(train_stat) == len(valid_stat) == len(test_stat)):
            # the subsets do not cover the same set of classes; re-split
            continue

        # compute the KL divergence of each subset against the full dataset
        has_kl = True
        train_kl_value, train_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(train_stat.items())]))
        valid_kl_value, valid_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(valid_stat.items())]))
        test_kl_value, test_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(test_stat.items())]))

        if (train_ratio > 0.05) or (valid_ratio > 0.05) or (test_ratio > 0.05):
            # the info dismatch ratio is too high, i.e. the class distribution
            # of some subset deviates from the full dataset; re-split
            continue

        break

    # log the statistics
    stats_fmt = '{0:<20s}\t{1:>8,d}\t{2:>2.2%}'
    total_fmt = stats_fmt + '\n'
    logging.info('whole dataset:')
    for _class, info in stats['total'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['total'].values()])
    logging.info(total_fmt.format('total', len(dataset_y), sum_res))

    logging.info('train dataset: {:.2%}'.format(ratio[0]))
    for _class, info in stats['train'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['train'].values()])
    logging.info(total_fmt.format('total', len(train_y), sum_res))

    logging.info('valid dataset: {:.2%}'.format(ratio[1]))
    for _class, info in stats['valid'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['valid'].values()])
    logging.info(total_fmt.format('total', len(valid_y), sum_res))

    logging.info('test dataset: {:.2%}'.format(ratio[2]))
    for _class, info in stats['test'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['test'].values()])
    logging.info(total_fmt.format('total', len(test_y), sum_res))

    if has_kl:
        kl_fmt = 'KL divergence: {0:.6f}, info dismatch: {1:.2%}'
        logging.info('train ' + kl_fmt.format(train_kl_value, train_ratio))
        logging.info('valid ' + kl_fmt.format(valid_kl_value, valid_ratio))
        logging.info('test ' + kl_fmt.format(test_kl_value, test_ratio))

    return train_x, train_y, valid_x, valid_y, test_x, test_y, stats

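
# `analyse_dataset` relies on two private helpers, `_stat_class` and
# `_compute_kl_divergence`, defined elsewhere in this module and not shown in
# this section. From the call sites, `_stat_class` must return a
# {class: [count, proportion]} mapping and `_compute_kl_divergence(p, q)` must
# return the divergence together with the `info dismatch` ratio. The sketch
# below is an assumption-based reconstruction, not the module's actual code;
# a base-2 logarithm and normalisation by the entropy of the full-dataset
# distribution reproduce the KL and dismatch figures shown in the docstring
# examples. It reuses the module-level numpy/collections imports.
def _stat_class_sketch(labels, multi_label=False):
    # assumed behaviour: map each class to [count, proportion], ordered by count
    if multi_label:
        labels = [label for sample in labels for label in sample]
    counter = collections.Counter(labels)
    total = sum(counter.values())
    return {cls: [num, num / total] for cls, num in counter.most_common()}


def _compute_kl_divergence_sketch(p, q):
    # assumed behaviour: KL(p || q) in bits plus the normalised dismatch ratio;
    # p and q are class-proportion arrays and must be strictly positive
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    kl_value = float(np.sum(p * np.log2(p / q)))
    entropy = float(-np.sum(p * np.log2(p)))
    return kl_value, kl_value / entropy
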
def analyse_dataset(dataset_x, dataset_y, ratio=[0.8, 0.05, 0.15],
                    shuffle=True):
    ''' Split an NER dataset into train, validation and test subsets, report
    the count and proportion of every entity type in each subset, and compute
    the relative entropy (KL divergence) of each subset against the full
    dataset to judge whether the split is reasonable. The lower the
    `info dismatch` ratio, the closer the entity-type distribution of a
    subset is to that of the full dataset.

    Args:
        dataset_x: the input samples of the dataset
        dataset_y: the corresponding output labels
        ratio: proportions of the train, validation and test subsets
        shuffle: whether to shuffle the dataset before splitting

    Return:
        train_x, train_y, valid_x, valid_y, test_x, test_y, stats(dict):
            stats holds the statistics of the split (counts, proportions,
            KL divergence).

    Examples:
        >>> import jionlp as jio
        >>> dataset_x = ['马成宇在...', '金融国力教育公司...', '延平区人民法院曾经...', ...]
        >>> dataset_y = [[{'type': 'Person', 'text': '马成宇', 'offset': (0, 3)}],
        ...              [{'type': 'Company', 'text': '国力教育公司', 'offset': (2, 8)}],
        ...              [{'type': 'Organization', 'text': '延平区人民法院', 'offset': (0, 7)}],
        ...              ...]
        >>> train_x, train_y, valid_x, valid_y, test_x, test_y, stats = \
        ...     jio.ner.analyse_dataset(dataset_x, dataset_y)
        >>> print(stats)
            whole dataset:
            Company            573     39.68%
            Person             495     34.28%
            Organization       376     26.04%
            total            1,444    100.00%
            train dataset: 80.00%
            Company            464     40.38%
            Person             379     32.99%
            Organization       306     26.63%
            total            1,149    100.00%
            valid dataset: 5.00%
            Person              32     47.06%
            Company             22     32.35%
            Organization        14     20.59%
            total               68    100.00%
            test dataset: 15.00%
            Company             87     38.33%
            Person              84     37.00%
            Organization        56     24.67%
            total              227    100.00%
            train KL divergence: 0.000546, info dismatch: 0.03%
            valid KL divergence: 0.048423, info dismatch: 3.10%
            test KL divergence: 0.002364, info dismatch: 0.15%

    '''
    dataset = [[sample_x, sample_y]
               for sample_x, sample_y in zip(dataset_x, dataset_y)]
    if shuffle:
        random.shuffle(dataset)

    has_kl = False
    for _ in range(3):
        # To obtain a good split, the dataset is re-split whenever the current
        # split is poor (high KL divergence, or an entity type missing from
        # some subset). If three attempts all fail, the last split is returned.

        # count and proportion of each entity type over the whole dataset
        stats = {'train': None, 'valid': None, 'test': None, 'total': None}
        dataset_stat = _stat_class(dataset_y)
        stats['total'] = dataset_stat

        tmp_ds = list()
        current = 0
        for s in ratio:
            num = int(len(dataset) * s)
            tmp_ds.append(dataset[current: current + num])
            current += num

        train_x = [item[0] for item in tmp_ds[0]]
        train_y = [item[1] for item in tmp_ds[0]]
        valid_x = [item[0] for item in tmp_ds[1]]
        valid_y = [item[1] for item in tmp_ds[1]]
        test_x = [item[0] for item in tmp_ds[2]]
        test_y = [item[1] for item in tmp_ds[2]]

        # entity-type statistics of each subset
        train_stat = _stat_class(train_y)
        stats['train'] = train_stat
        valid_stat = _stat_class(valid_y)
        stats['valid'] = valid_stat
        test_stat = _stat_class(test_y)
        stats['test'] = test_stat

        if not (len(train_stat) == len(valid_stat) == len(test_stat)):
            # the subsets do not cover the same set of entity types; re-split
            continue

        # compute the KL divergence of each subset against the full dataset
        has_kl = True
        train_kl_value, train_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(train_stat.items())]))
        valid_kl_value, valid_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(valid_stat.items())]))
        test_kl_value, test_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(test_stat.items())]))

        if (train_ratio > 0.05) or (valid_ratio > 0.05) or (test_ratio > 0.05):
            # the info dismatch ratio is too high, i.e. the entity-type
            # distribution of some subset deviates from the full dataset;
            # re-split
            continue

        break

    # log the statistics
    stats_fmt = '{0:<20s}\t{1:>8,d}\t{2:>2.2%}'
    total_fmt = stats_fmt + '\n'
    logging.info('whole dataset:')
    for _class, info in stats['total'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['total'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))

    logging.info('train dataset: {:.2%}'.format(ratio[0]))
    for _class, info in stats['train'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['train'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))

    logging.info('valid dataset: {:.2%}'.format(ratio[1]))
    for _class, info in stats['valid'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['valid'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))

    logging.info('test dataset: {:.2%}'.format(ratio[2]))
    for _class, info in stats['test'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['test'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))

    if has_kl:
        kl_fmt = 'KL divergence: {0:.6f}, info dismatch: {1:.2%}'
        logging.info('train ' + kl_fmt.format(train_kl_value, train_ratio))
        logging.info('valid ' + kl_fmt.format(valid_kl_value, valid_ratio))
        logging.info('test ' + kl_fmt.format(test_kl_value, test_ratio))

    return train_x, train_y, valid_x, valid_y, test_x, test_y, stats

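
# In the NER variant, `_stat_class` is applied to lists of entity dicts, so the
# proportions fed into the KL check are proportions of entity *types* (e.g.
# counting entity['type'] over every sample) rather than of sample labels.
# Using the entity counts from the docstring example and the same base-2,
# entropy-normalised reading as the sketch above (an assumption about the
# helper, not its actual code), the valid-set figures can be reproduced:
def _demo_ner_info_dismatch():
    # entity counts from the docstring example, ordered as sorted() would order
    # the class names: Company, Organization, Person
    whole = np.array([573, 376, 495], dtype=float)   # full dataset
    valid = np.array([22, 14, 32], dtype=float)      # valid subset
    p = whole / whole.sum()
    q = valid / valid.sum()
    kl_value = float(np.sum(p * np.log2(p / q)))           # ≈ 0.0484
    dismatch = kl_value / float(-np.sum(p * np.log2(p)))   # ≈ 3.10%
    return kl_value, dismatch
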
def analyse_freq_words(dataset_x: List[List[str]], dataset_y: List[Any],
                       min_word_freq=10, min_word_threshold=0.8):
    """ Following the class-conditional word statistics of a naive Bayes
    model, analyse the high-frequency feature words of every class in a
    text-classification corpus. The resulting per-class word lists can be
    compiled into a rule dictionary and combined with a statistical model,
    improving the model's stability and F1 score. Concretely, for every word
    x the function computes the conditional probability of each class y given
    that word, i.e. p(y|x), and keeps the words whose probability exceeds
    min_word_threshold as the feature words of that class. For an ordinary
    discriminative neural model, such word-level evidence is hard to learn
    when the corpus is small or the model has too many parameters, so feeding
    these words into the model in one form or another usually raises the F1
    score; empirically the gain is often 2% ~ 8%.

    Args:
        dataset_x: word lists after tokenisation and stop-word removal
        dataset_y: the label of each text
        min_word_freq: minimum word frequency; words rarer than this in the
            corpus are ignored
        min_word_threshold: minimum probability for a word to be returned as
            a feature word of a class

    Return:
        Dict[str, Dict[str, List[int, float]]]: the high-frequency feature
            words of each class, together with their corpus frequency and
            probability.

    Examples:
        >>> import jieba
        >>> import jionlp as jio
        >>> dataset_x = ['房间比较差,挺糟糕的,尤其是洗手间。',
        ...              '真糟糕!连热水都没有。',
        ...              '价格比比较不错的酒店。']
        >>> dataset_y = ['负', '负', '正']
        >>> dataset_x = [jieba.lcut(text) for text in dataset_x]  # any tokeniser works
        >>> dataset_x = [jio.remove_stopwords(text_segs) for text_segs in dataset_x]  # remove stop words
        >>> result = jio.text_classification.analyse_freq_words(
        ...     dataset_x, dataset_y, min_word_freq=1)
        {
            '负': {'糟糕': [2, 1.0], '没有': [1, 1.0], '差': [1, 1.0]},
            '正': {'不错': [1, 1.0]}
        }

    """
    # collect the class labels
    class_list = list(set(dataset_y))
    logging.info('classes found in the corpus: {}'.format(class_list))

    # count word frequencies and drop words rarer than min_word_freq
    word_list = list()
    for item in dataset_x:
        word_list.extend(item)
    word_dict = dict([
        item for item in collections.Counter(word_list).most_common()
        if item[1] >= min_word_freq])

    # count, for every class, how often each word occurs in texts of that class
    tmp_word_dict = {word: [0, 0] for word in word_dict}
    class_words_statistics = dict()
    for _class in class_list:
        class_words_statistics.update({_class: copy.deepcopy(tmp_word_dict)})

    for text_segs, label in zip(dataset_x, dataset_y):
        for word in text_segs:
            if word in word_dict:
                class_words_statistics[label][word][0] += 1

    # convert counts into p(class | word) and keep words above the threshold
    result = dict()
    for label, words_statistics in class_words_statistics.items():
        for word, stats in words_statistics.items():
            stats[1] = stats[0] / word_dict[word]
        sorted_result = sorted(
            [item for item in words_statistics.items()
             if item[1][1] > min_word_threshold],
            key=lambda i: i[1][1], reverse=True)
        result.update({label: dict(sorted_result)})

    return result

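
# The docstring above suggests combining the returned feature-word dictionary
# with a statistical model as a rule layer. The helper below is a hypothetical
# illustration of one way to do that (its name, parameters and the `boost`
# heuristic are assumptions, not a jionlp API): every hit on a class's feature
# words bumps that class's score before renormalising.
def _apply_feature_words_sketch(text_segs, model_probs, freq_words, boost=0.3):
    # `model_probs` is assumed to be a {class: probability} dict from some
    # classifier (summing to 1); `freq_words` is the dict returned by
    # analyse_freq_words; `boost` is an arbitrary illustrative weight.
    scores = dict(model_probs)
    for label, words in freq_words.items():
        hits = sum(1 for word in text_segs if word in words)
        if hits:
            scores[label] = scores.get(label, 0.0) + boost * hits
    total = sum(scores.values())
    return {label: score / total for label, score in scores.items()}
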