Example #1
def text_split(text, limited=True):
    """Split a long sentence into sub-clauses at punctuation marks.
    """
    texts = text_segmentate(text, 1, u'\n。;:,')
    if limited:
        # keep only the last `maxlen` clauses; maxlen is a module-level hyperparameter
        texts = texts[-maxlen:]
    return texts
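text_split relies on text_segmentate (defined in Example #2) and on a module-level maxlen. A minimal sketch of a call under those assumptions (the sample sentence and the maxlen value are illustrative, not taken from the original project):

maxlen = 32  # assumed module-level hyperparameter
clauses = text_split(u'第一句。第二句,第三句;第四句。')
# -> [u'第一句。', u'第二句,', u'第三句;', u'第四句。']
# clauses are cut at \n / 。 / ; / : / , in that priority order,
# and only the last `maxlen` clauses are kept when limited=True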
Example #2
def text_segmentate(text, maxlen, seps='\n', strips=None):
    """Split the text into short clauses at punctuation marks, keeping each
    piece within maxlen characters where possible.
    """
    text = text.strip().strip(strips)
    if seps and len(text) > maxlen:
        # split on the highest-priority separator, then recurse on each
        # accumulated piece with the remaining separators
        pieces = text.split(seps[0])
        text, texts = '', []
        for i, p in enumerate(pieces):
            if text and p and len(text) + len(p) > maxlen - 1:
                texts.extend(text_segmentate(text, maxlen, seps[1:], strips))
                text = ''
            if i + 1 == len(pieces):
                text = text + p
            else:
                text = text + p + seps[0]
        if text:
            texts.extend(text_segmentate(text, maxlen, seps[1:], strips))
        return texts
    else:
        return [text]
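A minimal sketch of how text_segmentate behaves (the sample string and maxlen value below are illustrative, not taken from the original project):

sample = u'今天天气很好。我们去公园散步。然后一起吃晚饭。'
for piece in text_segmentate(sample, maxlen=10, seps=u'\n。'):
    print(piece)
# prints each clause on its own line:
#   今天天气很好。
#   我们去公园散步。
#   然后一起吃晚饭。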
def load_data(filenames):
    """加载数据,并尽量划分为不超过maxlen的句子
    """
    D = []
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                for t in text_segmentate(text, maxlen - 2, seps, strips):
                    D.append((t, int(label)))
    return D
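load_data expects UTF-8 files in which each line holds a text and an integer label separated by a tab, and it relies on the module-level maxlen as well as on text_segmentate above. A minimal illustration (the file name, its contents and the maxlen value are hypothetical):

# Hypothetical input file 'train.tsv', one example per line: <text>\t<label>
#   这家餐厅的菜很好吃,服务也不错。\t1
#   等了一个小时才上菜,体验很差。\t0
maxlen = 128  # assumed module-level hyperparameter
D = load_data(['train.tsv'])
# -> list of (clause, label) pairs, each clause at most maxlen - 2 characters long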
Example #4
def text_process(text):
    """分割文本
    """
    texts = text_segmentate(text, 32, u'\n。')
    result, length = '', 0
    for text in texts:
        if result and len(result) + len(text) > maxlen * 1.3:
            yield result
            result, length = '', 0
        result += text
    if result:
        yield result
Example #5
def text_process(text):
    """分割文本
    """
    texts = text_segmentate(text, 32, u'\n。')
    result, length = [], 0
    for text in texts:
        if length + len(text) > maxlen * 1.5 and len(result) >= 3:
            yield result
            result, length = [], 0
        result.append(text)
        length += len(text)
    if result and len(result) >= 3:
        yield result
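Unlike Example #4, which joins clauses into a single string, this variant yields lists of clauses and drops chunks with fewer than three clauses. A small illustrative driver, assuming a module-level maxlen and a long document string:

maxlen = 256       # assumed module-level hyperparameter
document = u'...'  # placeholder for a long piece of text
for chunk in text_process(document):
    # chunk is a list of at least 3 clauses whose total length stays near maxlen * 1.5
    print(u''.join(chunk))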
Example #6
import json
import os
import numpy as np

# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

# Labeled data
webqa_data = json.load(open('/root/qa_datasets/WebQA.json'))
sogou_data = json.load(open('/root/qa_datasets/SogouQA.json'))

# Filter the data
seps, strips = u'\n。!?!?;;,, ', u';;,, '
data = []
for d in webqa_data + sogou_data:
    for p in d['passages']:
        if p['answer']:
            for t in text_segmentate(p['passage'], max_p_len - 2, seps,
                                     strips):
                if p['answer'] in t:
                    data.append((t, d['question'], p['answer']))

del webqa_data
del sogou_data

# Save a random order (used for splitting off the validation set)
if not os.path.exists('../random_order.json'):
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    json.dump(random_order, open('../random_order.json', 'w'), indent=4)
else:
    random_order = json.load(open('../random_order.json'))

# Split off the validation set
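The snippet is cut off at this point; a plausible continuation, mirroring the 90/10 split used in Example #7 below (illustrative, not the original code):

train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]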
Example #7
# max_a_len = 16
# batch_size = 32
# epochs = 100

import json
import numpy as np

# BERT configuration
config_path = '/data/xyang/NLP/Bert_model/tf/chinese_roberta_wwm_ext/bert_config.json'
checkpoint_path = '/data/xyang/NLP/Bert_model/tf/chinese_roberta_wwm_ext/bert_model.ckpt'
dict_path = '/data/xyang/NLP/Bert_model/tf/chinese_roberta_wwm_ext/vocab.txt'

# Filter the data
seps, strips = u'\n。!?!?;;,, ', u';;,, '
data = []

for idx in range(train_data.shape[0]):
    if train_data['answer'][idx]:
        for t in text_segmentate(train_data['text'][idx], max_p_len - 2, seps,
                                 strips):
            if train_data['answer'][idx] in t:
                data.append((t, train_data['question'][idx],
                             train_data['answer'][idx]))

random_order = list(range(len(data)))
np.random.shuffle(random_order)
json.dump(random_order, open('../random_order.json', 'w'), indent=4)

# Split off the validation set
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

# Load and prune the vocabulary, and build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
Example #8
def truncate(text):
    """截断句子
    """
    seps, strips = u'\n。!?!?;;,, ', u';;,, '
    return text_segmentate(text, maxlen - 2, seps, strips)[0]
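truncate keeps only the first chunk returned by text_segmentate, i.e. the leading clauses that fit within maxlen - 2 characters. A small illustration, assuming maxlen is defined at module level:

maxlen = 32  # assumed module-level hyperparameter
print(truncate(u'句子一。句子二。句子三。句子四。句子五。句子六。句子七。句子八。'))
# -> 句子一。句子二。句子三。句子四。句子五。句子六。句子七。
# only the leading clauses that fit within maxlen - 2 characters are kept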