Example #1
    def __init__(self,
                 raw_path,
                 punc_conf_path,
                 seg_conf_path,
                 seed=12,
                 max_num=200000):
        """

        :param raw_path:
        :param punc_conf_path:
        :param seg_conf_path:
        :param seed:
        :param max_num: maximum paragraph number in one file
        """

        random.seed(seed)

        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        self.dir = raw_path
        self.logger.info("  root = %s", self.dir)

        self.book_list = load_file_name(self.dir)[2]
        self.logger.info("  training books = %s", ' '.join(self.book_list))

        self.sentence_end_symbol = list("。!??!")

        self.max_num = max_num

        self.max_sentence_in_paragraph = 8
        self.max_char_in_sentence = 128
        self.min_char_in_paragraph = 64
        self.max_char_in_paragraph = 128
        self.max_char_in_batch = 512

        self.puncs = set(load_txt_data(punc_conf_path))  # set for O(1) membership checks
        self.puncs_pattern = ['\\' + x for x in self.puncs]

        seg_conf = load_txt_data(seg_conf_path)
        self.segment_labels = [x.split('\t')[0] for x in seg_conf]
        self.segment_symbol = [x.split('\t')[1] for x in seg_conf]
        self.segment_label_symbol_dict = dict(
            zip(self.segment_labels, self.segment_symbol))

        self.place_holder = 'b'

        self.raw_data = self.read_file(self.book_list)

        self.raw_data = self.format_data()
        self.punc_data = self.tag_punc_data()
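
All of these snippets call load_txt_data and save_txt_file from utils.dataio, plus load_file_name, but that module is not reproduced on this page. The sketch below is only a plausible minimal reconstruction inferred from the call sites (the origin flag, one string per line, load_file_name(...)[2] used as the file-name list); treat the signatures and behavior as assumptions rather than the project's actual implementation.

import os


def load_txt_data(path, origin=False):
    # Read a UTF-8 text file into a list of lines; origin=True keeps the raw
    # lines (trailing newlines included), otherwise surrounding whitespace is stripped.
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    return lines if origin else [line.strip() for line in lines]


def save_txt_file(data, path):
    # Write a list of strings to a UTF-8 text file, one item per line.
    with open(path, 'w', encoding='utf-8') as f:
        for line in data:
            f.write(line if line.endswith('\n') else line + '\n')


def load_file_name(directory):
    # Return (dirpath, dirnames, filenames) for the top level of `directory`;
    # the callers above take index [2] to get the list of file names.
    return next(os.walk(directory))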
def split_doc2(data_path, out_path):
    """Turn each "document,abstract" line into one .story file (the @highlight format)."""
    import re
    data = load_txt_data(data_path)
    doc_index = 0
    for i in tqdm(range(len(data))):
        try:
            line = data[i].split(',')
            if len(line[0]) < 100:  # skip documents shorter than 100 characters
                continue
            abstract = re.sub("[\" ]", "", line[1])
            abstract = ' '.join(abstract)  # space-separate the abstract characters
            tmp = re.sub("[\" ]", "", line[0])
            tmp = tmp.split('。')  # split the document on the Chinese full stop
            document = [' '.join(x) for x in tmp]
        except IndexError:
            continue

        for j in range(len(document)):
            document[j] = document[j] + '\n'
        new_doc = document + ['@highlight\n'] + [abstract]
        save_txt_file(new_doc, out_path + str(doc_index) + '.story')
        doc_index += 1
Example #3
def tokenize_format_lines(args):
    import re
    from pyparsing import oneOf
    input_path = args.raw_path
    output_path = args.json_path + args.data_name + '.json'
    input_data = load_txt_data(input_path)
    # Punctuation used to split each raw line into sentence chunks.
    pun = oneOf(list("。,,;:()()!?\\—、丨/"))
    json_data_set = []
    for i in tqdm(range(len(input_data))):
        json_dict = {}
        raw = input_data[i].split(',', 4)
        if args.vy_predict:
            abstract = [list('NONE')]
            sentence = pun.split(re.sub("\"", '', raw[4]))
        else:
            abstract = [list(raw[0])]
            sentence = pun.split(raw[1])
        split_sentence = []
        for split_content in sentence:
            split_content = list(split_content)
            if split_content:
                split_sentence.append(split_content)

        json_dict['src'] = split_sentence
        json_dict['tgt'] = abstract
        if args.vy_predict:
            json_dict['doc_id'] = raw[0]
        json_data_set.append(json_dict)
    # with open(output_path, 'w', encoding='utf-8') as save:
    #     save.write(json.dumps(json_data_set, ensure_ascii=False))
    return json_data_set
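
A hypothetical way to invoke tokenize_format_lines: the attribute names mirror exactly the fields the function reads (raw_path, json_path, data_name, vy_predict), the paths are placeholders, and the final write simply reinstates the JSON dump that is commented out above.

import json
from argparse import Namespace

args = Namespace(raw_path='data/raw.csv',   # placeholder input path
                 json_path='data/json/',    # placeholder output directory
                 data_name='train',
                 vy_predict=False)
records = tokenize_format_lines(args)
with open(args.json_path + args.data_name + '.json', 'w', encoding='utf-8') as save:
    save.write(json.dumps(records, ensure_ascii=False))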
Example #4
    def get_labels(self):
        # return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"]
        # TODO: define label
        # puncs = [',', '。', '?', ',', '.', '\?', "、", '》', '《', ';', ':', ';', ':', '‘', '“', "\"", "'",
        #          '【', '】', '{', '}', '…', '\|', '~', '·', '!', '\!', '/', '「', '」', '(', ')', '〔', '〕',
        #          '\(', '\)', '\[', '\]', '#other#']
        from utils.dataio import load_txt_data
        puncs = [x for x in load_txt_data('./utils/config/punctuation.dat')]
        segs = [
            x.split('\t')[1]
            for x in load_txt_data('./utils/config/segmentation.dat')
        ]
        # puncs = puncs + [x + '#end#' for x in puncs] + ["[CLS]", "[SEP]"]
        puncs += ["[CLS]", "[SEP]"]
        segs += ["[CLS]", "[SEP]"]
        return puncs
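
The returned list is typically turned into a label/id mapping for a BERT-style token classifier. A minimal sketch follows; `processor` is a placeholder instance of whatever class defines get_labels above, and the id offset is only one common convention.

# Hypothetical use of get_labels(): build label <-> id maps for token classification.
labels = processor.get_labels()            # `processor` is a placeholder instance
label_map = {label: i for i, label in enumerate(labels, 1)}  # ids often start at 1, 0 reserved for padding
id_map = {i: label for label, i in label_map.items()}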
def filter_data(path):
    """Keep only "abstract,document" rows whose document field has at least 100 characters; overwrites path in place."""
    data = load_txt_data(path)
    res = []
    for item in tqdm(data, desc='Filter'):
        raw = item.split(',')
        doc = raw[1]
        abst = raw[0]
        if len(doc) >= 100:
            res.append('{},{}'.format(abst, doc))
    save_txt_file(res, path)
def revers_index(path):
    """Swap "document,abstract" rows into "abstract,document" order, overwriting path in place."""
    data = load_txt_data(path)
    res = []
    for item in data:
        raw = item.split(',')
        doc = raw[0]
        try:
            abst = raw[1]
        except IndexError:
            continue
        res.append('{},{}'.format(abst, doc))
    save_txt_file(res, path)
def split_doc(data_path, out_path):
    from pyparsing import oneOf
    data = load_txt_data(data_path)
    punc = oneOf(list("。,;;!?"))  # sentence-splitting punctuation
    doc_index = 0
    for i in tqdm(range(len(data)), desc='split_doc'):
        line = data[i].split(',')
        abstract = " ".join(line[0])  # space-separate the abstract characters
        document = [' '.join(x) for x in punc.split(line[1])]
        for j in range(len(document)):
            document[j] = document[j] + '\n'
        new_doc = document + ['@highlight\n'] + [abstract]
        _doc_index = str(doc_index).zfill(9)  # zero-pad the file index to 9 digits
        save_txt_file(new_doc, out_path + _doc_index + '.story')
        doc_index += 1
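
For reference, a hypothetical call to split_doc and the layout of one output .story file; the input is assumed to already be in "abstract,document" order (the order revers_index produces), and the paths are placeholders.

split_doc('data/abstract_document.csv', 'data/stories/')

# Each output file, e.g. data/stories/000000000.story, ends up looking like:
#     <document sentence 1, characters space-separated>
#     <document sentence 2, characters space-separated>
#     ...
#     @highlight
#     <abstract, characters space-separated>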
Example #8
    def read_file(self, file_names):
        """

        :param file_names:
        :return:

        load data: 按文本自然段读取
                [
                    ["para1"],
                    ["para2"],
                    ]
        """
        res = []
        count = 0
        for file_name in tqdm(file_names, desc='Load Data'):
            raw = load_txt_data(self.data_dir + file_name, origin=True)
            count += len(raw)
            for item in tqdm(raw,
                             desc='adding data from {}'.format(file_name)):
                res.append(item)
        self.logger.info("  Num paragraph = %d", count)
        return res
Example #9
    def read_file(self, file_names):
        """
        读取文档并筛选优质数据
        :param file_names:
        :return:

        load data: [
                    ["sentence1", "sentence2", "sentence3"],
                    ["sentence1", "sentence2", "sentence3"],
                    ]
        """
        res = []

        for file_name in file_names:
            raw = load_txt_data(self.dir + file_name,
                                origin=True)[:self.max_num]
            # raw = load_txt_data(self.dir + file_name, origin=True)[-50:]

            for i in trange(len(raw), desc='Load {}'.format(file_name)):
                paragraph: str = raw[i].strip()
                paragraph_char_num = self.count_info(paragraph)
                if self.min_char_in_paragraph < paragraph_char_num < self.max_char_in_paragraph:  # keep paragraphs within the character-count range
                    paragraph: str = self.remove_none_puncs_paragraph(
                        paragraph)
                    if paragraph:
                        paragraph: str = re.sub('\\u3000', ':', paragraph)
                        paragraph: str = self.format_puncs_space(paragraph)
                        paragraph: list = self.split_paragraph(paragraph)
                        paragraph: list = [
                            x for x in paragraph
                            if len(x) <= self.max_char_in_sentence
                        ]
                        res.append(paragraph)
        import random
        random.seed(1024)
        random.shuffle(res)
        self.logger.info("  Num paragraph = %d", len(res))
        return res
def merge_files(path_list, merge_path):
    """Concatenate the contents of every file in path_list into merge_path."""
    data = []
    for path in path_list:
        data += load_txt_data(path)
    save_txt_file(data, merge_path)
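
Hypothetical usage, concatenating several shards into a single file (placeholder paths):

merge_files(['data/part_0.txt', 'data/part_1.txt'], 'data/merged.txt')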