def text_split(text, limited=True):
    """Split a long sentence into multiple clauses at punctuation marks."""
    texts = text_segmentate(text, 1, u'\n。;:,')
    if limited:
        # Keep at most the last `maxlen` clauses (relies on a module-level `maxlen`).
        texts = texts[-maxlen:]
    return texts
def text_segmentate(text, maxlen, seps='\n', strips=None):
    """Split the text into short pieces of at most `maxlen` characters,
    trying the separators in `seps` in order of priority.
    """
    text = text.strip().strip(strips)
    if seps and len(text) > maxlen:
        pieces = text.split(seps[0])
        text, texts = '', []
        for i, p in enumerate(pieces):
            # Flush the buffer before adding the next piece would exceed
            # maxlen, recursing with the remaining, lower-priority separators.
            if text and p and len(text) + len(p) > maxlen - 1:
                texts.extend(text_segmentate(text, maxlen, seps[1:], strips))
                text = ''
            if i + 1 == len(pieces):
                text = text + p
            else:
                text = text + p + seps[0]
        if text:
            texts.extend(text_segmentate(text, maxlen, seps[1:], strips))
        return texts
    else:
        return [text]
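# A minimal usage sketch of text_segmentate. The sample sentence and the
# maxlen of 10 are invented for illustration, not taken from the original
# code: the text is split on '\n' first, then on '。', then on ',',
# recursing until every piece fits within maxlen characters.
def _demo_text_segmentate():
    sample = u'今天天气很好。我们去公园散步,然后吃午饭。'
    return text_segmentate(sample, 10, u'\n。,')
    # -> [u'今天天气很好。', u'我们去公园散步,', u'然后吃午饭。']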
def load_data(filenames):
    """Load the data, splitting each text into sentences of at most `maxlen` characters where possible."""
    D = []
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                for t in text_segmentate(text, maxlen - 2, seps, strips):
                    D.append((t, int(label)))
    return D
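# Hedged sketch of the input format load_data expects: a UTF-8 text file with
# one example per line, "text<TAB>label". The file name, sentences and labels
# below are invented; `maxlen` is assumed to be a module-level constant
# defined elsewhere.
def _demo_load_data(path='demo_train.tsv'):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(u'这家酒店的服务很好,下次还会再来。\t1\n')
        f.write(u'房间太小,隔音也差。\t0\n')
    return load_data([path])  # -> [(sub_sentence, label), ...]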
def text_process(text):
    """Split the text into chunks of roughly maxlen * 1.3 characters."""
    texts = text_segmentate(text, 32, u'\n。')
    result = ''
    for text in texts:
        # Emit the accumulated chunk before it grows past maxlen * 1.3.
        if result and len(result) + len(text) > maxlen * 1.3:
            yield result
            result = ''
        result += text
    if result:
        yield result
def text_process(text):
    """Split the text into groups of sub-sentences."""
    texts = text_segmentate(text, 32, u'\n。')
    result, length = [], 0
    for text in texts:
        # Emit a group once it would exceed maxlen * 1.5 characters,
        # but only if it already holds at least 3 sub-sentences.
        if length + len(text) > maxlen * 1.5 and len(result) >= 3:
            yield result
            result, length = [], 0
        result.append(text)
        length += len(text)
    if result and len(result) >= 3:
        yield result
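# Hedged usage sketch for the list-yielding text_process above (it shadows the
# string-yielding variant if both are defined in the same module). Each yielded
# group is a list of at least 3 sub-sentences whose combined length stays
# around maxlen * 1.5; the document text passed in is up to the caller.
def _demo_text_process(document):
    for group in text_process(document):
        print(len(group), u''.join(group))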
# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

# Labelled data
webqa_data = json.load(open('/root/qa_datasets/WebQA.json'))
sogou_data = json.load(open('/root/qa_datasets/SogouQA.json'))

# Filter the data: keep only passage segments that contain the answer
seps, strips = u'\n。!?!?;;,, ', u';;,, '
data = []
for d in webqa_data + sogou_data:
    for p in d['passages']:
        if p['answer']:
            for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips):
                if p['answer'] in t:
                    data.append((t, d['question'], p['answer']))
del webqa_data
del sogou_data

# Save a fixed random order (used to split off the validation set)
if not os.path.exists('../random_order.json'):
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    json.dump(random_order, open('../random_order.json', 'w'), indent=4)
else:
    random_order = json.load(open('../random_order.json'))

# Split off the validation set
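# The actual split is not included in this excerpt. As a hedged sketch only,
# a holdout following the same idiom as the later block would look like the
# commented lines below (the 1-in-10 ratio is an assumption, not taken from
# the original code):
#
#   train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
#   valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]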
# max_a_len = 16
# batch_size = 32
# epochs = 100

# BERT configuration
config_path = '/data/xyang/NLP/Bert_model/tf/chinese_roberta_wwm_ext/bert_config.json'
checkpoint_path = '/data/xyang/NLP/Bert_model/tf/chinese_roberta_wwm_ext/bert_model.ckpt'
dict_path = '/data/xyang/NLP/Bert_model/tf/chinese_roberta_wwm_ext/vocab.txt'

# Filter the data: keep only passage segments that contain the answer
seps, strips = u'\n。!?!?;;,, ', u';;,, '
data = []
for idx in range(train_data.shape[0]):
    if train_data['answer'][idx]:
        for t in text_segmentate(train_data['text'][idx], max_p_len - 2, seps, strips):
            if train_data['answer'][idx] in t:
                data.append((t, train_data['question'][idx], train_data['answer'][idx]))

random_order = list(range(len(data)))
np.random.shuffle(random_order)
json.dump(random_order, open('../random_order.json', 'w'), indent=4)

# Split off the validation set: hold out every 10th example
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
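# The load_vocab(...) call above is cut off in this excerpt. As a hedged
# sketch only (the arguments are assumptions, not the original code): in
# bert4keras, load_vocab with simplified=True returns (token_dict,
# keep_tokens), and a tokenizer is then typically built from the reduced
# vocabulary, e.g.:
#
#   from bert4keras.tokenizers import Tokenizer, load_vocab
#   token_dict, keep_tokens = load_vocab(
#       dict_path=dict_path,
#       simplified=True,
#       startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
#   )
#   tokenizer = Tokenizer(token_dict, do_lower_case=True)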
def truncate(text):
    """Truncate a sentence to its first segment of at most maxlen - 2 characters."""
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]
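# Hedged usage sketch for truncate: it keeps only the first segment returned
# by text_segmentate, i.e. roughly the first maxlen - 2 characters cut at a
# sentence boundary. The sample text is invented; `maxlen` is assumed to be a
# module-level constant defined elsewhere (with maxlen = 8 the result would be
# u'第一句话。'; with a large maxlen the whole text fits and is returned unchanged).
def _demo_truncate():
    long_text = u'第一句话。第二句话比较长,可能会被切掉。第三句话。'
    return truncate(long_text)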