def do_check(self, data_path_list=None):
    """Validate question/answer YAML files and report malformed entries.

    Args:
        data_path_list: Paths to check. When empty or ``None`` the
            ``data_paths`` option of the ``local`` config section is used
            instead. The value is passed through ``common.expand_path`` and
            ``common.get_yml_path_list`` before any file is opened.

    Each problem found is reported through ``self._error``. Any exception
    raised while checking is logged with ``LOG.error`` and is not propagated
    to the caller.
    """
    try:
        if not data_path_list:
            data_path_list = common.Cfg().get('local', 'data_paths')
        data_path_list = common.expand_path(data_path_list)
        data_path_list = common.get_yml_path_list(data_path_list)

        for data_path in data_path_list:
            print('')
            LOG.info('Checking %s' % data_path)
            with open(data_path, 'r') as f:
                # NOTE: plain yaml.load() without a Loader raises TypeError on
                # PyYAML >= 6 and can construct arbitrary objects on older
                # versions. safe_load only builds plain Python data.
                qas = yaml.safe_load(f)

            # An empty document loads as None; report it instead of failing
            # on the iteration below and aborting the remaining files.
            if not qas:
                self._error(qas, 'qa is none')
                continue

            for qa in qas:
                if not qa:
                    self._error(qa, 'qa is none')
                    # Nothing to inspect; calling .values() on an empty or
                    # None entry would raise.
                    continue
                for value in qa.values():
                    if not value:
                        self._error(qa, 'value is none')
                        continue
                    for item in value:
                        if isinstance(item, dict):
                            self._error(qa, 'item is dict')
                        if not item:
                            self._error(qa, 'item is none')

        # NOTE(review): this is logged whenever no exception interrupted the
        # run; whether self._error aborts the check is not visible here.
        LOG.info('Check Passed!')
    except Exception as e:
        # Deliberate catch-all at the command boundary: log and return.
        LOG.error(e)
def get_path_list():
    """Return the path list derived from the configured data paths.

    Reads the ``data_paths`` option of the ``local`` config section, then
    feeds it through ``common.expand_path`` and ``common.get_md_path_list``
    and returns the result of the latter.
    """
    configured_paths = common.Cfg().get('local', 'data_paths')
    expanded_paths = common.expand_path(configured_paths)
    return common.get_md_path_list(expanded_paths)
def do_transform(self, file_path_list=None):
    """Convert raw text files into question/answer YAML files.

    Each file is read, scrubbed by a fixed sequence of regex substitutions,
    split into question/answer groups and written to ``<file_path>.yml``
    as a list of ``{que: [...], ans: [...]}`` entries.

    Args:
        file_path_list: Paths handed to ``common.expand_path``; the paths it
            returns are the files that get transformed. ``None`` is treated
            as an empty list.

    Raises:
        ValueError: If an input file is empty.
    """
    if file_path_list is None:
        file_path_list = []
    file_path_list = common.expand_path(file_path_list)

    # Ordered (pattern, replacement) pairs. Order matters: later rules rely
    # on the output of earlier ones.
    cleanup_rules = (
        # Keep only characters from LF (0x0A) through '~' (0x7E).
        (r'[^\u000A-\u007E]', ''),
        (r'#*', ''),
        # Colons become commas, so no separate colon-collapsing rule is
        # needed afterwards.
        (r':', ','),
        # Collapse runs of repeated punctuation to a single character.
        (r'//*', r'/'),
        (r'!!*', '!'),
        (r',,*', ','),
        (r';;*', ';'),
        (r'\?\?*', '?'),
        # Collapse repeated brackets, then drop bracketed spans. '.*' is
        # greedy and does not cross newlines, so everything from the first
        # opener to the last closer on a line is removed.
        (r'\(\(*', '('),
        (r'\)\)*', ')'),
        (r'\(.*\)', ''),
        (r'\[\[*', '['),
        (r'\]\]*', ']'),
        (r'\[.*\]', ''),
        (r'\{\{*', '{'),
        (r'\}\}*', '}'),
        (r'{.*}', ''),
        (r'\<\<*', '<'),
        (r'\>\>*', '>'),
        (r'<.*>', ''),
        # Collapse runs of spaces. The pattern must require at least one
        # space: ' *' also matches the empty string and would insert a
        # space between every pair of characters.
        (r' +', ' '),
        (r'^ *', ''),
        (r' *$', ''),
        # Break the text after sentence-ending punctuation.
        (r'\? ', r'?\n'),
        (r'! ', r'!\n'),
        (r'\. ', r'.\n'),
        # Drop spans from '&' to the last ';' on a line.
        (r'&.*;', ''),
        # Collapse blank lines.
        (r'\n\n*\n', '\n'),
        # Drop lines of 140 characters or more.
        (r'.{140,9999}\n', ''),
        # Drop stretches between two newlines that contain no ASCII letter,
        # then strip leading non-alphanumerics from each line.
        (r'\n[^a-zA-Z]*\n', r'\n'),
        (r'\n[^a-zA-Z0-9]*', r'\n'),
        # One blank line between a question line and a following
        # non-question line; two blank lines between a non-question line and
        # a following question line. The split below relies on these.
        (r'(.*\?\n)(.*[^\?]\n)', r'\1\n\2'),
        (r'(.*[^\?]\n)(.*\?\n)', r'\1\n\n\2'),
    )

    for file_path in file_path_list:
        LOG.info('Transforming %s' % file_path)
        with open(file_path, 'r') as fp:
            text = fp.read()
        if not text:
            raise ValueError('empty file!')

        for pattern, replacement in cleanup_rules:
            text = re.sub(pattern, replacement, text)

        # Groups are separated by two blank lines; inside a group the first
        # and last blank-line-separated parts are taken as questions and
        # answers. A group with a single part yields the same lines for both.
        groups = []
        for group in text.split('\n\n\n'):
            parts = group.split('\n\n')
            groups.append((parts[0].split('\n'), parts[-1].split('\n')))

        yml_path = file_path + '.yml'
        with open(yml_path, 'w') as fp:
            for questions, answers in groups:
                # 'que'/'ans' are aligned at column 2 under the '- ' marker
                # and their entries nested below, so the file parses as a
                # list of mappings.
                fp.write('\n- que:\n')
                for que in questions:
                    if que:
                        fp.write('    - %s\n' % que)
                fp.write('  ans:\n')
                for ans in answers:
                    if ans:
                        fp.write('    - %s\n' % ans)
def do_transform(self, file_path_list=None):
    """Copy raw text files to ``self.out_path`` and convert them to YAML.

    Runs three passes:

    1. Copy each input file into ``self.out_path`` unless a file with the
       same base name already exists there.
    2. Overwrite every copy with its text scrubbed by a fixed sequence of
       regex substitutions.
    3. Split each scrubbed copy into question/answer groups and write them
       to a ``.yml`` file next to the copy, under a top-level ``qas`` key.

    Args:
        file_path_list: Paths handed to ``common.expand_path``; the paths it
            returns are the files that get processed. ``None`` is treated as
            an empty list.

    Raises:
        ValueError: If a copied file is empty.
    """
    if file_path_list is None:
        file_path_list = []
    file_path_list = common.expand_path(file_path_list)

    # Ordered (pattern, replacement) pairs. Order matters: later rules rely
    # on the output of earlier ones.
    cleanup_rules = (
        # Keep only characters from LF (0x0A) through '~' (0x7E).
        (r'[^\u000A-\u007E]', ''),
        (r'#*', ''),
        # Colons become commas, so no separate colon-collapsing rule is
        # needed afterwards.
        (r':', ','),
        # Collapse runs of repeated punctuation to a single character.
        (r'//*', r'/'),
        (r'!!*', '!'),
        (r',,*', ','),
        (r';;*', ';'),
        (r'\?\?*', '?'),
        # Collapse repeated brackets, then drop bracketed spans. '.*' is
        # greedy and does not cross newlines, so everything from the first
        # opener to the last closer on a line is removed.
        (r'\(\(*', '('),
        (r'\)\)*', ')'),
        (r'\(.*\)', ''),
        (r'\[\[*', '['),
        (r'\]\]*', ']'),
        (r'\[.*\]', ''),
        (r'\{\{*', '{'),
        (r'\}\}*', '}'),
        (r'{.*}', ''),
        (r'\<\<*', '<'),
        (r'\>\>*', '>'),
        (r'<.*>', ''),
        # Collapse runs of spaces. The pattern must require at least one
        # space: ' *' also matches the empty string and would insert a
        # space between every pair of characters.
        (r' +', ' '),
        (r'^ *', ''),
        (r' *$', ''),
        # Break the text after sentence-ending punctuation.
        (r'\? ', r'?\n'),
        (r'! ', r'!\n'),
        (r'\. ', r'.\n'),
        # Drop spans from '&' to the last ';' on a line.
        (r'&.*;', ''),
        # Collapse blank lines.
        (r'\n\n*\n', '\n'),
        # Drop lines of 140 characters or more.
        (r'.{140,9999}\n', ''),
        # Drop stretches between two newlines that contain no ASCII letter,
        # then strip leading non-alphanumerics from each line.
        (r'\n[^a-zA-Z]*\n', r'\n'),
        (r'\n[^a-zA-Z0-9]*', r'\n'),
        # One blank line between a question line and a following
        # non-question line; two blank lines between a non-question line and
        # a following question line. The split below relies on these.
        (r'(.*\?\n)(.*[^\?]\n)', r'\1\n\2'),
        (r'(.*[^\?]\n)(.*\?\n)', r'\1\n\n\2'),
    )

    # Copy file
    new_file_path_list = []
    for file_path in file_path_list:
        file_name = os.path.basename(file_path)
        new_file_path = os.path.join(self.out_path, file_name)
        # An existing copy is left untouched but still processed below.
        if not os.path.exists(new_file_path):
            LOG.info('Copy %s' % file_path)
            shutil.copyfile(file_path, new_file_path)
        new_file_path_list.append(new_file_path)

    # Initialization: scrub each copy in place.
    for file_path in new_file_path_list:
        LOG.info('Initializing %s' % file_path)
        with open(file_path, 'r') as fr:
            text = fr.read()
        if not text:
            raise ValueError('empty file!')
        for pattern, replacement in cleanup_rules:
            text = re.sub(pattern, replacement, text)
        with open(file_path, 'w') as fw:
            fw.write(text)

    # Transform
    for file_path in new_file_path_list:
        LOG.info('Transforming %s' % file_path)
        with open(file_path, 'r') as fr:
            text = fr.read()

        # Groups are separated by two blank lines; inside a group the first
        # and last blank-line-separated parts are taken as questions and
        # answers. A group with a single part yields the same lines for both.
        groups = []
        for group in text.split('\n\n\n'):
            parts = group.split('\n\n')
            groups.append((parts[0].split('\n'), parts[-1].split('\n')))

        # file_path already lives in self.out_path; joining it with
        # self.out_path again would duplicate the directory whenever
        # out_path is relative.
        yml_path = os.path.splitext(file_path)[0] + '.yml'
        with open(yml_path, 'w') as fw:
            fw.write('qas:\n')
            for questions, answers in groups:
                # 'que'/'ans' are aligned at column 2 under the '- ' marker
                # and their entries nested below, so the file parses as a
                # list of mappings under 'qas'.
                fw.write('\n- que:\n')
                for que in questions:
                    if que:
                        fw.write('    - %s\n' % que)
                fw.write('  ans:\n')
                for ans in answers:
                    if ans:
                        fw.write('    - %s\n' % ans)