def final_check_space_in_solutions(self):
    """Flag solutions that contain a run of three or more inner spaces.

    Leading and trailing spaces are not counted (the pattern requires a
    non-space character on both sides).  Subjects '数学' (math) and '化学'
    (chemistry) are skipped entirely.  LaTeX spans of the form ``${...}$``
    are masked with ``#`` first so spaces inside formulas are ignored.

    When at least one run is found, a single ``zh_format_error`` is
    registered for the ``solutions`` field via ``self.adderrorList``.

    :return: None
    """
    log_server.logging('>>>>>>Checking extra space in solution<<<<<<')
    start = time.time()
    result_list = []
    # Nothing can be reported for the excluded subjects, so skip the scan.
    if self.subject not in ['数学', '化学']:
        # Loop-invariant patterns: compile once instead of once per solution.
        html_pattern = re.compile(r'(html\s*{.*}\s*)')
        latex_pattern = re.compile(r"(\${.*?}\$)")
        space_pattern = re.compile(r'(?<=\S) {3,}(?=\S)')
        for i, text in enumerate(self.solution):
            if html_pattern.search(text):
                text = html_pattern.sub('', text).strip()
            # Mask formulas with same-length filler so offsets stay valid.
            for latex in latex_pattern.findall(text):
                text = text.replace(latex, len(latex) * '#')
            spans = [(m.start(), m.end())
                     for m in space_pattern.finditer(text)]
            _, sorted_spans = sortList(spans, None)
            result_list += [[i, span] for span in sorted_spans]
    if len(result_list) > 0:
        # NOTE(review): the collected positions are not forwarded to
        # adderrorList (no source/pos arguments) -- kept as in the original.
        self.adderrorList(
            'zh_format_error',
            'solutions',
            replace=0,
            description=zh_check_config['description']['zh_format_error'])
    log_server.logging(
        ">>> Finished space checking! Total time is {}.<<< ".format(
            time.time() - start))
def summarize(self, query, symbol_error_dict, spell_error_dict,
              unmatch_error_dict):
    """Merge the per-checker error dicts into one dict keyed by question id.

    :param query: raw question payload; ``parse_data(query)`` supplies the
        question id as its first element.
    :param symbol_error_dict: errors from the symbol/format checker.
    :param spell_error_dict: errors from the spelling checker.
    :param unmatch_error_dict: errors from the matching checker.
    :return: ``{tid: [all errors, in the order symbol, spell, unmatch]}``
    """
    log_server.logging('>>>>>Start summarize all error type<<<<<')
    tid, *_ = parse_data(query)

    all_error_list = []
    for error_dict in (symbol_error_dict, spell_error_dict,
                       unmatch_error_dict):
        if not error_dict:
            continue
        # A checker may hold entries for other ids only; do not raise
        # KeyError in that case, just contribute nothing.
        all_error_list.extend(error_dict.get(tid, []))

    all_error_dict = {tid: all_error_list}
    print(all_error_dict)
    print('Finish summarizing! All error dict saved.')
    return all_error_dict
def __init__(self, mode):
    """Create the English checker and the helper tools it delegates to.

    :param mode: selected checking mode; stored as ``self.mode``.
    """
    self.mode = mode
    self.mc = MatchChecker()
    self.et = EnchantTool()
    nltk_tool = NltkTool()
    self.nt = nltk_tool
    # The multi-word list is fetched once from the NLTK helper.
    self.multi_word_list = nltk_tool.get_multi_word_list()
    self.st = SequenceTagger()
    self.fc = FormatChecker()
    log_server.logging("Init en_checker over!")
def delTab(self, text):
    """Strip whitespace, then drop runs of two or more ASCII letters.

    All whitespace (tabs, newlines, non-breaking spaces, ...) is removed
    first, so letters that were separated only by whitespace are joined
    before the letter-run removal is applied.

    :param text: input string.
    :return: the cleaned string.
    """
    log_server.logging(">>> Checking zh deltab error !")
    began = time.time()
    without_blanks = re.sub(r'[\t\n\xa0\s]', '', text)
    cleaned = re.sub(r'[a-zA-Z]{2,}', '', without_blanks)
    elapsed_message = "Deltab finish! Total time is {} >>>".format(
        time.time() - began)
    log_server.logging(elapsed_message)
    return cleaned
def check_spell(self, query):
    """Run the English spelling check and log how long it took.

    :param query: payload passed unchanged to ``self.et.spell_checker``.
    :return: whatever ``self.et.spell_checker(query)`` returns.
    """
    log_server.logging('>>> checking spelling error ')
    began = time.time()
    found = self.et.spell_checker(query)
    elapsed = time.time() - began
    log_server.logging(
        'Spelling check finish! Total time is {} >>>'.format(elapsed))
    return found
def final_check_confusion_symbol(self):
    """Report confused/stacked punctuation in description, stems, solutions.

    Each field is either a non-empty string (checked as a whole with index
    -1) or a list (each element checked with its own index; dict elements
    contribute their ``'stem'`` value).  Anything else is ignored.  When
    ``self.process.specialSymbol`` yields both errors and positions, one
    ``zh_symbol_error_1`` is registered for that field.

    :return: None
    """
    log_server.logging(">>> Check confusion symbol !! <<<<")
    began = time.time()
    targets = [(self.description, 'description'), (self.stems, 'stems'),
               (self.solution, 'solutions')]
    for content, field in targets:
        if isinstance(content, str) and len(content) > 0:
            found, positions = self.process.specialSymbol(content, -1)
        elif isinstance(content, list):
            found, positions = [], []
            for idx, item in enumerate(content):
                piece = item['stem'] if isinstance(item, dict) else item
                item_found, item_positions = self.process.specialSymbol(
                    piece, idx)
                found += item_found
                positions += item_positions
        else:
            continue
        if len(found) > 0 and len(positions) > 0:
            self.adderrorList(
                'zh_symbol_error_1',
                field,
                replace=0,
                description=zh_check_config['description']
                ['zh_symbol_error_1'],
                source=found,
                pos=positions)
    log_server.logging(
        ">>> Finished confusion symbol !! Total time is {} <<<<".format(
            time.time() - began))
def check_symbol_format(self, query):
    """Run the symbol / format / full-width check and log its duration.

    :param query: payload passed unchanged to ``self.fc.format_checker``.
    :return: whatever ``self.fc.format_checker(query)`` returns.
    """
    log_server.logging(">>> checking symbol and full-width error ")
    began = time.time()
    found = self.fc.format_checker(query)
    elapsed = time.time() - began
    log_server.logging(
        'Symbol and full-width check finish and saved! Total time is {} >>>'
        .format(elapsed))
    return found
def check_match(self, query, grade_subject):
    """Run the answer/solution matching check and log its duration.

    :param query: payload passed to ``self.mc.parser``.
    :param grade_subject: second argument passed to ``self.mc.parser``.
    :return: whatever ``self.mc.parser(query, grade_subject)`` returns.
    """
    log_server.logging(">>> Checking matching error")
    began = time.time()
    found = self.mc.parser(query, grade_subject)
    elapsed = time.time() - began
    log_server.logging(
        'Matching check finish and saved! Total time is {} >>>'.format(
            elapsed))
    return found
def parse(self, data):
    """Dispatch a request to the English or the Chinese checker.

    ``data`` must provide ``'grade'``, ``'subject'`` and ``'query'``.  The
    query is passed through ``self.text_filter`` first.  English requests
    are handed to ``self.en_checker.checker``; every other subject builds a
    ``ZhCheck`` (stored on ``self.zh_checker``) with the spell checker that
    belongs to the subject group, or ``None`` when no group matches.

    :param data: request dict.
    :return: the English checker's result, or a list holding the Chinese
        checker's errors (empty when there are none).
    """
    log_server.logging('====== Allocation Checker! ======')
    grade, subject, query = data['grade'], data['subject'], data['query']
    query = self.text_filter(query)

    # Pick the spell checker by subject group.
    if subject in ('history', 'politics', 'chinese'):
        spellchecker = self.com_spellchecker
    elif subject in ('biology',):
        spellchecker = self.bio_spellchecker
    elif subject in ('physics', 'math', 'chemistry'):
        spellchecker = self.science_spellchecker
    else:
        spellchecker = None

    if subject == 'english':
        return self.en_checker.checker(grade, subject, query)

    self.zh_checker = ZhCheck(dictionary=query,
                              subject=config['trans_subject'][subject],
                              lac=self.lac_mode,
                              faspell=spellchecker,
                              ancient=self.ancient)
    positions, errors = self.zh_checker(
        tradWordDetect=self.tradWordDetect,
        wrongWordDetect=self.wrongWordDetect,
        keywordMatch=self.keywordMatch,
        contentMatch=self.contentMatch,
        symbolCheck=self.symbolCheck,
        typeMatch=self.typeMatch,
        contTypeMatch=self.contTypeMatch,
        enSymbolDetect=self.enSymbolDetect,
        serialCheck=self.serialCheck)
    # Positions are collected like the errors but only the errors are
    # returned to the caller.
    collected_positions, collected_errors = [], []
    if len(positions) > 0:
        collected_positions.append(positions)
    if len(errors) > 0:
        collected_errors.append(errors)
    return collected_errors
def matchbracket(self):
    """Report unmatched brackets / quotes in description, stems, solutions.

    List fields are checked element by element (dict elements contribute
    their ``'stem'`` value) with the element index; any other non-empty
    field is checked as a whole with index -1.  When
    ``self.process.bracketMatch`` yields both symbols and positions, one
    ``zh_symbol_error`` is registered for that field.

    :return: None
    """
    log_server.logging(">>> Checking Bracket <<<")
    began = time.time()
    targets = [(self.description, 'description'), (self.stems, 'stems'),
               (self.solution, 'solutions')]
    for content, field in targets:
        if isinstance(content, list):
            symbols, positions = [], []
            for idx, item in enumerate(content):
                piece = item['stem'] if isinstance(item, dict) else item
                item_symbols, item_positions = self.process.bracketMatch(
                    piece, idx)
                symbols += item_symbols
                positions += item_positions
        elif len(content) > 0:
            symbols, positions = self.process.bracketMatch(content, -1)
        else:
            continue
        if len(symbols) > 0 and len(positions) > 0:
            self.adderrorList(
                'zh_symbol_error',
                sent=field,
                replace=0,
                description=zh_check_config['description']
                ['zh_symbol_error'],
                source=symbols,
                pos=positions)
    log_server.logging(
        '>>> Finished bracket checking!! Total time is {}<<<'.format(
            time.time() - began))
def get_glove_vocab(filename):
    """Load the vocabulary (first token of every line) from a GloVe file.

    The file is read as UTF-8 regardless of the platform default encoding,
    and blank lines are skipped so the empty string never enters the vocab.

    Args:
        filename: path to the glove vectors

    Returns:
        vocab: set() of strings
    """
    log_server.logging("Building vocab...")
    vocab = set()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            word = line.strip().split(' ')[0]
            if word:
                vocab.add(word)
    log_server.logging("- done. {} tokens".format(len(vocab)))
    return vocab
def get_vocabs(datasets):
    """Collect every distinct word and tag found in the given datasets.

    Args:
        datasets: iterable of datasets; each dataset yields
            ``(words, tags)`` pairs of iterables.

    Returns:
        A ``(word_set, tag_set)`` tuple.
    """
    log_server.logging("Building vocab...")
    word_vocab, tag_vocab = set(), set()
    for dataset in datasets:
        for sentence in dataset:
            words, tags = sentence
            word_vocab.update(words)
            tag_vocab.update(tags)
    log_server.logging("- done. {} tokens".format(len(word_vocab)))
    return word_vocab, tag_vocab
def extractLatex(self, text):
    """Record where LaTeX formulas occur in ``text`` and mask them out.

    Every ``$...$`` span is a formula, except bare option letters such as
    ``${A}$`` / ``${ABCD}$`` and the bare ``${\\times}$`` sign, which are
    left untouched.  For each occurrence of a formula an entry
    ``{'id': self.id, 'sources': <text at that moment>, 'index': <offset>}``
    is appended to ``self.position[formula]``.  After a formula has been
    recorded, all its occurrences are replaced by ``&`` characters of the
    same length, so later offsets stay valid.

    :param text: sentence to scan.
    :return: ``text`` with the recorded formulas masked by ``&``.
    """
    log_server.logging(">>> Extrace Latex !")
    start = time.time()
    formulas = re.findall(r'\$.+?\$', text)
    # Patterns carry no capture groups so findall returns plain strings;
    # with nested groups it returns tuples, which would never compare
    # equal to the formula strings and the exclusion would silently fail.
    option_letters = re.findall(r'\$\{\s*[A-F]{1,4}\s*\}\$', text)
    times_signs = re.findall(r'\$\{\s*\\times\s*\}\$', text)
    excluded = set(option_letters + times_signs)

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # the recorded 'sources' snapshots are deterministic.
    for value in dict.fromkeys(formulas):
        if value in excluded:
            continue
        index = text.find(value)
        while index != -1:
            # Each formula owns its own list; one list shared across
            # formulas would mix the positions of different formulas.
            self.position.setdefault(value, []).append({
                'id': self.id,
                'sources': text,
                'index': index,
            })
            index = text.find(value, index + 1)
        # Mask with same-length filler so remaining offsets do not shift.
        text = text.replace(value, len(value) * '&')
    log_server.logging(
        "Extrace Latex finish! Total time is {} >>>".format(time.time() -
                                                            start))
    return text
def parse_spellDetect(self, data):
    """Run wrong-character detection on every sentence of a request.

    ``data`` must provide ``'subject'`` and ``'query'``.  The query is
    passed through ``self.text_filter``; each question is split into
    sentences, sentences flagged by ``self.ancient.detect`` are skipped,
    and only pre-processed sentences longer than 10 and shorter than 100
    characters are sent to the subject's spell checker.

    :param data: request dict.
    :return: ``{'error_detail': [post-processed detection results]}``
    """
    log_server.logging('====== Allocation spell Detect! ======')
    zh_error_set = []
    subject, query = data['subject'], data['query']
    query = self.text_filter(query)
    if subject in ['history', 'politics', 'chinese']:
        faspell = self.com_spellchecker
    elif subject in ['biology']:
        faspell = self.bio_spellchecker
    elif subject in ['physics', 'math', 'chemistry']:
        faspell = self.science_spellchecker
    else:
        faspell = None
    subject = config['trans_subject'][subject]

    if faspell is None:
        # No model is wired up for this subject: report nothing instead of
        # failing with AttributeError on the first eligible sentence.
        log_server.logging(
            'No spell checker available for subject: {}'.format(subject))
        query = []

    for ques in query:
        sentences = split_sentence(ques)
        process = Data_Process(subject)
        for que in sentences:
            if self.ancient.detect(que):
                continue
            sentence = process.numProcess(preprocess(que))
            if 10 < len(sentence) < 100:
                result = faspell.make_corrections([sentence])
                # Post-process the raw detection result.
                detail = process.wordProcess(result)
                if len(detail) > 0:
                    zh_error_set.append(process.ocr_spell_process(detail))
    log_server.logging('====== Spell Detect Finished! ======')
    # Always return a fresh dict: writing the key into the last
    # wordProcess() result would leak that result into the response (or
    # fail outright if it is not a dict).
    return {'error_detail': zh_error_set}
def write_vocab(vocab, filename):
    """Write a vocab to a file, one word per line, no trailing newline.

    Accepts any iterable (including generators and sets); every item is
    rendered with ``"{}".format(item)``.

    Args:
        vocab: iterable that yields word
        filename: path to vocab file

    Returns:
        None; the file at ``filename`` is (over)written.
    """
    log_server.logging("Writing vocab...")
    # Materialise once: generators have no len() and can be read only once.
    words = ["{}".format(word) for word in vocab]
    with open(filename, "w") as f:
        f.write("\n".join(words))
    log_server.logging("- done. {} tokens".format(len(words)))
def __init__(self, config, model_path, max_length):
    """Create a TensorFlow session, load the model and build the processor.

    :param config: model configuration object; its ``init_checkpoint`` and
        ``max_seq_length`` attributes are overwritten with ``model_path``
        and ``max_length``, and ``vocab_file`` is read from it.
    :param model_path: checkpoint path.
    :param max_length: maximum sequence length.
    """
    self.config = config
    self.config.init_checkpoint = model_path
    self.config.max_seq_length = max_length
    log_server.logging(self.config.init_checkpoint)
    log_server.logging(self.config.max_seq_length)

    # reset_default_graph() returns None, so fetch the fresh default graph
    # explicitly; otherwise self.graph would always be None.
    tf.reset_default_graph()
    self.graph = tf.get_default_graph()

    # create session
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False,
                                  intra_op_parallelism_threads=4,
                                  inter_op_parallelism_threads=4)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9
    self.session = tf.Session(config=session_conf)

    # load model, then run the variable initialisers in the new session
    self.model = self.load_model(config)
    self.session.run(tf.global_variables_initializer())
    log_server.logging(
        "-----------load {} sucess------------".format(model_path))
    self.processor = Processor(config.vocab_file, config.max_seq_length)
def ansContMatch(self):
    """Match (stem + chosen option) against the solution.

    Case 1: any empty option -> nothing to check, return None.
    Case 2: as many options as answers -> each answer (or each element of
        a list answer) is checked against the option/solution with the
        same index via ``self.optSolu``.
    Case 3: exactly one option and one solution but several answers ->
        every answer is checked against index 0.
    Case 4: any other length combination is only logged (special case,
        not handled yet).

    ``self.optSolu`` threads a running counter through the calls.

    :return: None when the counter ends up positive or None; otherwise
        ``self.ErrorSet`` holding one ``zh_qos_unmatch_error`` entry.
    """
    self.ErrorSet = []
    count = 0
    # Case 1
    if not all(self.option):
        return None

    def _as_items(ans):
        # A list answer is checked element by element, a plain one as is.
        return ans if isinstance(ans, list) else [ans]

    if len(self.option) == len(self.answer):
        # Case 2
        for ii, ans in enumerate(self.answer):
            for item in _as_items(ans):
                count = self.optSolu(self.removeSub(item), ii, count)
    elif (len(self.option) == 1 and len(self.solution) == 1
          and len(self.answer) > 1):
        # Case 3
        for ans in self.answer:
            for item in _as_items(ans):
                count = self.optSolu(self.removeSub(item), 0, count)
    else:
        # Case 4: one formatted message, like every other log call here.
        log_server.logging('specialCase {} {} {}'.format(
            len(self.option), len(self.answer), len(self.solution)))

    if count is None or count > 0:
        return None
    # Stem + option content does not match the solution.
    self.ErrorSet.append(['(题干加选项内容)与解答不匹配', 'zh_qos_unmatch_error'])
    return self.ErrorSet
def __init__(self):
    """Load feature switches and build the Chinese / English checkers.

    Reads the feature switches from
    ``config["zh_check_config"]['chinese_textQ_config']``, creates the LAC
    segmenter with its customisation file, the ancient-text classifier and
    three spell checkers (literal, biology, science), then the English
    checker in 'all functions' mode and the formula-checker settings.
    """
    log_server.logging('===============Init Checker!===============')
    zh_conf = config["zh_check_config"]
    switches = zh_conf['chinese_textQ_config']
    for switch_name in ("tradWordDetect", "wrongWordDetect", "keywordMatch",
                        "contentMatch", "symbolCheck", "typeMatch",
                        "contTypeMatch", "enSymbolDetect", "serialCheck"):
        setattr(self, switch_name, switches[switch_name])

    log_server.logging('>>> 1.Zh_checker Initializing !')
    lac_conf = zh_conf['lac_segment']
    self.lac_mode = LAC(mode=lac_conf['lac_mode'])
    self.lac_mode.load_customization(lac_conf['word_loc'])
    self.ancient = AncientClassifier()
    faspell_conf = config['faspell_config']
    literal_conf = faspell_conf["literal"]
    self.com_spellchecker = SpellChecker(literal_conf['model'],
                                         literal_conf['max_seq_length'])
    biology_conf = faspell_conf["biology"]
    self.bio_spellchecker = SpellChecker(biology_conf['model'],
                                         biology_conf['max_seq_length'])
    science_conf = faspell_conf["science"]
    self.science_spellchecker = SpellChecker(science_conf['model'],
                                             science_conf['max_seq_length'])
    log_server.logging(self.com_spellchecker)
    log_server.logging('Zh_checker Initialization over ! >>>')

    log_server.logging('>>> 2.En_checker Initializing !')
    self.en_selection_mode = [
        'check symbol and full-width', 'check spell', 'check match',
        'check grammar', 'all functions'
    ]
    # Index 4 selects 'all functions'.
    self.en_checker = EnCheck(self.en_selection_mode[4])
    log_server.logging('En_checker Initialization over ! >>>')

    log_server.logging('>>> 3.Fm_checker Initializing !')
    self.fm_mode = [
        'symbol_repeat', 'illegal_symbol', 'func_name', 'brackets',
        'latex2mathml', 'texcheck', 'textidote', 'LaTeXEqChecker', 'mathJax'
    ]
    self.fm_select_mode = [
        'func_name', 'symbol_repeat', 'latex2mathml', 'textidote',
        'illegal_symbol'
    ]
    # The formula checker itself is currently not instantiated.
    self.re_ = re.compile(r'(\\rm)|</?[^<>]+>')
    log_server.logging('Fm_checker Initialization over ! >>>')
    log_server.logging('=============Init Checker Over!=============')
def text_quality_check():
    """Main interface route: run the full quality check on a posted question.

    Reads the raw request body, decodes and JSON-parses it, validates it
    with ``check_param``, runs ``controller.parse`` and returns the result
    through ``jsonify``.  Each failure stage sets its own code/message pair
    from ``config['RETURN_CODE']`` / ``config['RETURN_MSG']``:
    DATA_FORMAT_ERROR (decode/parse), PARAMETERS_ERROR (validation) and
    HANDLE_ERROR (any exception during checking or usage bookkeeping).

    After a successful check the shared ``counter`` is incremented and a
    usage summary is written to ``<cwd>/access_date/<YYYYMMDD>.txt``, where
    the date comes from the module-level ``start_time`` string.

    NOTE(review): the source layout was whitespace-collapsed; everything
    from the check to the file write is placed under ``counter.get_lock()``
    here -- confirm against the repository.
    """
    log_server.logging('============Enter Checker System !===========')
    result = {
        'code': config['RETURN_CODE']['OK'],
        'message': config['RETURN_MSG']['OK'],
        'en_data': dict()
    }
    try:
        post_data = request.data
        # step1: validate the payload encoding
        if isinstance(post_data, bytes):
            post_data = post_data.decode()
        query_json = json.loads(post_data)
    except Exception as e:
        log_server.logging('Please check input en_data: {}'.format(e))
        result['code'] = config['RETURN_CODE']['DATA_FORMAT_ERROR']
        result['message'] = config['RETURN_MSG']['DATA_FORMAT_ERROR']
        return jsonify(result)
    # step2: validate the payload structure
    if not check_param(query_json):
        log_server.logging('Data Format Error: {}'.format(query_json))
        result['code'] = config['RETURN_CODE']['PARAMETERS_ERROR']
        result['message'] = config['RETURN_MSG']['PARAMETERS_ERROR']
        return jsonify(result)
    # step3: run the check
    pre_start = time.time()
    try:
        with counter.get_lock():
            ### entry point of the actual check ###
            check_result = controller.parse(query_json)  # main processing
            counter.value += 1
            # The usage file is named after the date part of start_time.
            current_file_path = os.path.join(
                os.getcwd() + '/access_date',
                ''.join(start_time[0:10].split('-')) + '.txt')
            # NOTE(review): a failure here (e.g. missing directory) is
            # caught below and turns a successful check into HANDLE_ERROR.
            with open(current_file_path, 'w', encoding='utf-8') as f:
                localtime = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.localtime())
                st, end = start_time[0:10].split('-'), localtime[0:10].split(
                    '-')
                # Whole days between the start date and today.
                spend_time = (
                    datetime.datetime(int(end[0]), int(end[1]), int(end[2])) -
                    datetime.datetime(int(st[0]), int(st[1]), int(st[2]))).days
                f.write("开始时间: {}, 当前时间: {}, 共 {} 天, 使用量 {} 次".format(
                    start_time, localtime, spend_time, str(counter.value)))
    except Exception as e:
        log_server.logging('Predict Error: {}'.format(e))
        result['code'] = config['RETURN_CODE']['HANDLE_ERROR']
        result['message'] = config['RETURN_MSG']['HANDLE_ERROR']
        return jsonify(result)
    pre_end = time.time()
    result['data'] = check_result
    log_server.logging(
        '>>>>>>>> Time of the whole process: {:.6f}'.format(pre_end -
                                                            pre_start))
    log_server.logging('============Exit Checker System !===========')
    return jsonify(result)
def tqc_spell_check():
    """Interface route: run only the spelling detection on a posted question.

    Reads the raw request body, decodes and JSON-parses it, validates it
    with ``check_param_1`` and runs ``controller.parse_spellDetect``.  Each
    failure stage answers with its own code/message pair from ``config``
    (DATA_FORMAT_ERROR, PARAMETERS_ERROR, HANDLE_ERROR); on success the
    detection result is returned under ``'data'``.
    """
    log_server.logging('============Enter Spell Checker System !===========')
    result = {
        'code': config['RETURN_CODE']['OK'],
        'message': config['RETURN_MSG']['OK'],
        'en_data': dict()
    }

    def _fail(kind):
        # Fill in the error code/message for ``kind`` and build the reply.
        result['code'] = config['RETURN_CODE'][kind]
        result['message'] = config['RETURN_MSG'][kind]
        return jsonify(result)

    # Step 1: decode and parse the payload.
    try:
        raw_body = request.data
        if isinstance(raw_body, bytes):
            raw_body = raw_body.decode()
        query_json = json.loads(raw_body)
    except Exception as e:
        log_server.logging('Please check input en_data: {}'.format(e))
        return _fail('DATA_FORMAT_ERROR')

    # Step 2: validate the payload structure.
    if not check_param_1(query_json):
        log_server.logging('Data Format Error: {}'.format(query_json))
        return _fail('PARAMETERS_ERROR')

    # Step 3: run the detection.
    began = time.time()
    try:
        check_result = controller.parse_spellDetect(query_json)
    except Exception as e:
        log_server.logging('Predict Error: {}'.format(e))
        return _fail('HANDLE_ERROR')
    finished = time.time()

    result['data'] = check_result
    log_server.logging(
        '>>>>>>>> Time of the whole process: {:.6f}'.format(finished -
                                                            began))
    log_server.logging('============Exit Checker System !===========')
    return jsonify(result)
def contentMatch(self, keywordMatch, contentMatch):
    """Run the answer / option / solution consistency checks by question type.

    Dispatch on ``self.type``:

    * choice questions ('选择题', '多选题', '单选题'): with ``keywordMatch`` run
      ``self.multiChoice()`` and, if it returns None and ``contentMatch``
      is set, ``self.ansContMatch()``; without ``keywordMatch`` run
      ``self.ansContMatch()`` when ``contentMatch`` is set.
    * material-analysis, short-answer, experiment, essay, choice-fill,
      solve and fill-in questions: with ``contentMatch`` run
      ``self.shortAns()``, except for subject '数学' (math).
    * true/false questions ('判断题'): with ``keywordMatch`` compare answer
      and solution via ``qa_matching``, except for subject '地理'
      (geography); a result is reported unless its description is
      'solution匹配不到关键字'.
    * discrimination questions ('辨析题'): if ``checkSymbol(self.answer)``
      holds, same ``qa_matching`` check; otherwise, with ``contentMatch``,
      report every entry returned by ``self.aqc.ansMatch``.

    NOTE(review): the source layout was whitespace-collapsed; the pairing
    of the ``elif``/``else`` branches below is a reconstruction -- confirm.

    :param keywordMatch: enables the keyword-based checks.
    :param contentMatch: enables the content-based checks.
    :return: None; findings are registered through ``self.adderrorList``.
    """
    log_server.logging(">>> Checking zh keywords match error !")
    start = time.time()
    # NOTE: `|` / `&` are bitwise operators applied to booleans here.
    if (self.type == '选择题') | (self.type == '多选题') | (self.type == '单选题'):
        # Original note (bare string below): for choice questions, math,
        # politics, history and Chinese only get the "answer A vs. 'so
        # choose A'" check; other subjects also get stem+option vs. solution.
        """ 选择题或多选题, 数学,政治,历史与语文只做答案A 与故选A 校验 其他科目则做答案A 与故选A校验 和 题干加选项与解答校验。 """
        if keywordMatch:
            errorAns = self.multiChoice()
            if (errorAns is None) & contentMatch:
                self.ansContMatch()
        elif contentMatch:
            self.ansContMatch()
    elif (self.type == '材料分析题') | (self.type == '简答题') | (
            self.type == '实验探究题') | (self.type == '论述题') | (
                self.type == '选择填空题') | (self.type == '解答题') | (self.type
                                                               == '填空题'):
        # Original note (bare string below): math is skipped; other
        # subjects get answer-vs-solution and stem-vs-solution checks.
        """ 解答题,填空题,等, 数学跳过 其他科目则做答案与解答校验, 和题干与解答校验。 """
        if contentMatch:
            if (self.subject == '数学'):
                pass
            else:
                self.shortAns()
    elif (self.type == '判断题'):
        if keywordMatch:
            # Original note (bare string below): geography solutions
            # rarely state right/wrong explicitly, so geography is skipped;
            # politics solutions are mostly open-ended as well.
            """ 因为地理学科的解答多没有明确提及判断正确或错误字眼, 因此地理学科跳过。 因为政治的判断题中的解答多为开放性回答, 并没有具体提及判断正确或错误的字眼。 """
            if (self.subject == '地理'):  #& (self.subject == '政治')
                pass
            else:
                result = qa_matching(self.answer,
                                     self.solution,
                                     subject=self.subject)
                if result is not None:
                    if len(result) > 0:
                        # "keyword not found in solution" is not reported.
                        if result['description'] != 'solution匹配不到关键字':
                            self.adderrorList(
                                result['errorType'],
                                sent='answers-solutions',
                                replace=0,
                                description=result['description'])
    elif (self.type == '辨析题'):
        if checkSymbol(self.answer):
            if keywordMatch:
                result = qa_matching(self.answer,
                                     self.solution,
                                     subject=self.subject)
                if (result is not None):
                    if len(result) > 0:
                        if result['description'] != 'solution匹配不到关键字':
                            self.adderrorList(
                                result['errorType'],
                                sent='answers-solutions',
                                replace=0,
                                description=result['description'])
        else:
            if contentMatch:
                errorAns = self.aqc.ansMatch(self.answer, self.question)
                if (errorAns is not None):
                    # Each entry is used as (description, error type).
                    for errA in errorAns:
                        self.adderrorList(errA[1],
                                          sent='answers-solutions',
                                          replace=0,
                                          description=errA[0])
    log_server.logging(
        "Keywords match finish! Total time is {} >>>".format(time.time() -
                                                             start))
def spellDetect(self, tradWord, wrongWord):
    """Detect wrong characters and traditional characters in the question.

    Checks description, stems and -- unless the subject is '物理', '化学'
    or '数学' -- solutions.  With ``wrongWord`` set,
    ``self.process.spell_process`` is used (it also receives ``tradWord``
    and returns traditional-character hits together with source / target /
    position lists); with only ``tradWord`` set,
    ``self.process.tradProcess`` is used.  Exceptions raised by either
    helper are logged and swallowed.

    Findings are registered as ``zh_spell_error`` and ``zh_font_error``
    through ``self.adderrorList``.

    NOTE(review): the source layout was whitespace-collapsed; the nesting
    below is a reconstruction -- confirm against the repository.

    :param tradWord: enables traditional-character detection.
    :param wrongWord: enables wrong-character detection.
    :return: None
    """
    log_server.logging(">>> Detecting wrong words! <<<")
    start = time.time()
    # Original note (bare string below): science solutions are mostly
    # formulas, so they are not spell-checked.
    """理科的解答多为公式, 因此不对理科的解答做错词检测"""
    if self.subject not in ['物理', '化学', '数学']:
        totallist = [(self.description, 'description'),
                     (self.stems, 'stems'), (self.solution, 'solutions')]
    else:
        totallist = [(self.description, 'description'),
                     (self.stems, 'stems')]
    if tradWord | wrongWord:
        for value in totallist:
            source = []
            target = []
            position = []
            trad_list = []
            # all() over a str is True for any string, including ''.
            if isinstance(value[0], str) and all(value[0]):
                if wrongWord:
                    try:
                        trad_list, source, target, position = self.process.spell_process(
                            value[0], -1, self.ancient, self.spellchecker,
                            self.description, tradWord)
                    except Exception as e:
                        log_server.logging(
                            '>>>Checking spell function exception: {}'.
                            format(e))
                elif tradWord:
                    try:
                        trad_list += self.process.tradProcess(value[0], -1)
                    except Exception as e:
                        log_server.logging(
                            '>>>Checking trad function exception: {}'.
                            format(e))
            else:
                # Non-string field: only processed when every element is
                # truthy.
                if all(value[0]):
                    for i, text in enumerate(value[0]):
                        if isinstance(text, dict):
                            # Dict elements carry their text under 'stem'.
                            if wrongWord:
                                try:
                                    tradl, s, t, p = self.process.spell_process(
                                        text['stem'], i, self.ancient,
                                        self.spellchecker,
                                        self.stems[i]['stem'], tradWord)
                                    source += s
                                    target += t
                                    position += p
                                    trad_list += tradl
                                except Exception as e:
                                    log_server.logging(
                                        '>>>Checking spell function exception: {}'
                                        .format(e))
                            elif tradWord:
                                try:
                                    trad_list += self.process.tradProcess(
                                        text['stem'], i)
                                except Exception as e:
                                    log_server.logging(
                                        '>>>Checking trad function exception: {}'
                                        .format(e))
                        else:
                            if wrongWord:
                                try:
                                    # NOTE(review): always passes
                                    # self.solution[i] as the original
                                    # text, whichever field is iterated.
                                    tradl, s, t, p = self.process.spell_process(
                                        text, i, self.ancient,
                                        self.spellchecker, self.solution[i],
                                        tradWord)
                                    source += s
                                    target += t
                                    position += p
                                    trad_list += tradl
                                except Exception as e:
                                    log_server.logging(
                                        '>>>Checking spell function exception: {}'
                                        .format(e))
                            else:
                                try:
                                    trad_list += self.process.tradProcess(
                                        text, i)
                                except Exception as e:
                                    log_server.logging(
                                        '>>>Checking trad function exception: {}'
                                        .format(e))
                else:
                    pass
            # Wrong-character findings for this field.
            if source and target and position:
                self.adderrorList(
                    'zh_spell_error',
                    sent=value[1],
                    replace=1,
                    description=zh_check_config['description']
                    ['zh_spell_error'].format(value[0]),
                    source=source,
                    target=target,
                    pos=position)
            # Traditional-character findings for this field.
            if tradWord and (trad_list is not None):
                if len(trad_list) > 0:
                    source, target, position = [], [], []
                    # Each hit is read as [0] source, [1] target,
                    # [2] offset, [3] element index.
                    for trad in trad_list:
                        source.append(trad[0])
                        target.append(trad[1])
                        position.append([trad[3], (trad[2], trad[2] + 1)])
                    self.adderrorList(
                        'zh_font_error',
                        sent=value[1],
                        replace=1,
                        description=zh_check_config['description']
                        ['zh_font_error'].format(value[0]),
                        source=source,
                        target=target,
                        pos=position)
    log_server.logging(
        ">>> Wrong words detection finish! Total time is {} <<<".format(
            time.time() - start))
def contTypeCheck(self, contTypeMatch):
    """Check that the answer is written in the style its question type needs.

    Only runs when the joined answer does not match
    ``simple_serial_matching``; otherwise True is returned immediately.

    * Any empty solution entry is reported (or False is returned when
      ``contTypeMatch`` is off).
    * True/false questions ('判断题'): the answer must pass
      ``checkSymbol(..., special=True)``; otherwise the subject-specific
      fallback ``judgeCheck`` decides.
    * Choice questions: the answer must contain at least one ASCII letter.
    * All other types return True.

    Branches that register an error through ``self.adderrorList`` have no
    explicit return value.

    NOTE(review): the source layout was whitespace-collapsed; the nesting
    below is a reconstruction -- confirm against the repository.

    :param contTypeMatch: when truthy, mismatches are registered as
        ``zh_type_style_unmatch_error``; when falsy, False is returned.
    :return: True, False or None as described above.
    """

    def judgeCheck(answer, contTypeMatch):
        # Subject-specific fallback for true/false answers: politics is
        # always accepted, geography accepts '对'/'错', biology accepts
        # 'T'/'F'; everything else is a style mismatch.
        if self.subject == '政治':
            return True
        elif (self.subject == '地理'):
            if answer.strip(PUNCTUATION) in ['对', '错']:
                return True
            else:
                if contTypeMatch:
                    self.adderrorList(
                        'zh_type_style_unmatch_error',
                        sent='type',
                        replace=0,
                        description=zh_check_config['description']
                        ['zh_type_style_unmatch_error']['判断题'].format(
                            self.type))
                else:
                    return False
        elif (self.subject == '生物'):
            if answer.strip(PUNCTUATION) in ['T', 'F']:
                return True
            else:
                if contTypeMatch:
                    self.adderrorList(
                        'zh_type_style_unmatch_error',
                        sent='type',
                        replace=0,
                        description=zh_check_config['description']
                        ['zh_type_style_unmatch_error']['判断题'].format(
                            self.type))
                else:
                    return False
        else:
            if contTypeMatch:
                self.adderrorList(
                    'zh_type_style_unmatch_error',
                    sent='type',
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_type_style_unmatch_error']['判断题'].format(
                        self.type))
            else:
                return False

    log_server.logging(">>> Checking the format of the question! <<<")
    start = time.time()
    pattern = re.compile(r'[A-Za-z]')
    answer = pump_list_to_str(self.answer)
    if not simple_serial_matching(answer):
        if not all(self.solution):
            if contTypeMatch:
                # NOTE(review): source is formatted with the literal
                # string "text", not with the solution content.
                self.adderrorList(
                    'zh_type_style_unmatch_error',
                    sent='type',
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_type_style_unmatch_error']['选择题']['解答'],
                    source='解答:{}'.format("text"))
            else:
                return False
        if self.type == '判断题':
            if len(answer) > 1:
                # NOTE(review): both branches return, so only the first
                # element of `answer` is ever examined.
                for ans in answer:
                    if isinstance(ans, list):
                        ans = ''.join(ans)
                    ans = latex_process(ans)
                    if checkSymbol(ans.strip(PUNCTUATION), special=True):
                        return True
                    else:
                        return judgeCheck(answer, contTypeMatch)
            else:
                answer = latex_process(answer)
                if checkSymbol(answer.strip(PUNCTUATION), special=True):
                    return True
                else:
                    return judgeCheck(answer, contTypeMatch)
        elif (self.type == '选择题') | (self.type == '多选题') | (self.type
                                                            == '单选题'):
            if len(re.findall(pattern, answer)) > 0:
                return True
            else:
                if contTypeMatch:
                    self.adderrorList(
                        'zh_type_style_unmatch_error',
                        sent='type',
                        replace=0,
                        description=zh_check_config['description']
                        ['zh_type_style_unmatch_error']['选择题']
                        ['答案'].format(self.type))
                else:
                    return False
        else:
            return True
    else:
        return True
    # Only reached on paths that registered an error and did not return.
    log_server.logging(
        ">>> Format checking finished! Total time is {} <<<".format(
            time.time() - start))
def typeCheck(self, typeMatch):
    """Check that the declared question type matches the question content.

    * Choice questions ('选择题', '多选题', '单选题') need at least one stem
      whose ``'options'`` are all non-empty.
    * True/false questions ('判断题') need every (sub-)answer to pass
      ``checkSymbol`` with the true/false keyword list; subject '政治'
      (politics) is always accepted.
    * '辨析题' is always accepted.
    * Any other type is a mismatch when a stem carries ``'options'`` or
      when every (sub-)answer looks like a true/false answer; subject
      '语文' (Chinese) is exempt from both.

    Branches that register an error through ``self.adderrorList`` have no
    explicit return value.

    NOTE(review): the source layout was whitespace-collapsed; the nesting
    below is a reconstruction -- confirm against the repository.

    :param typeMatch: when truthy, mismatches are registered as
        ``zh_type_unmatch_error``; when falsy, False is returned instead.
    :return: True, False or None as described above.
    """
    log_server.logging(">>> Checking the type match !!!<<<")
    start = time.time()
    answer = pump_list_to_str(self.answer)
    # Geography additionally accepts '对' / '错' as true/false keywords.
    if self.subject == '地理':
        value_list = ['对', '错']
    else:
        value_list = None
    if (self.type == '选择题') | (self.type == '多选题') | (self.type == '单选题'):
        flag = False
        for stem in self.stems:
            if "options" in stem.keys():
                if all(stem['options']):
                    flag = True
        if flag:
            return True
        else:
            if typeMatch:
                self.adderrorList(
                    'zh_type_unmatch_error',
                    sent='type',
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_type_unmatch_error']['选择题'].format(self.type))
            else:
                return False
    elif self.type == '判断题':
        # Original note (bare string below): biology true/false answers
        # may use T/F or A/B; with sub-question numbers, every
        # sub-question must carry a true/false keyword.
        """ 生物较为特殊, 判断题中是有存在T/F,A/B这种格式的判断。 如果答案中存在小题号, 那么每个小题都必须有判断题的关键字, 否则提醒不匹配 """
        if self.subject == '政治':
            return True
        else:
            if value_list is None:
                value_list = ['T', 'F', 'A', 'B', '对', '错']
            else:
                value_list += ['T', 'F', 'A', 'B', '对', '错']
            # Sub-question number such as "(1)".
            p1 = re.compile(r'[(\(]\d+[)\)]')
            # Original note (bare string below): check whether the answer
            # contains sub-question numbers.
            """检查答案中是否存在小题号"""
            if len(self.answer) > 1:
                count = 0
                record = []  # 1-based indices of failing sub-answers
                for i, ans in enumerate(self.answer):
                    ans = pump_list_to_str(ans)
                    if simple_serial_matching(answer):
                        ans = re.sub(p1, '', ans)
                    ans = latex_process(ans)
                    if checkSymbol(ans.strip(PUNCTUATION),
                                   value_list=value_list):
                        count += 1
                    else:
                        record.append(i + 1)
                if count == len(self.answer):
                    return True
                else:
                    if typeMatch:
                        self.adderrorList(
                            'zh_type_unmatch_error',
                            sent='type',
                            replace=0,
                            description=zh_check_config['description']
                            ['zh_type_unmatch_error']['判断题'].format(
                                self.type, record))
                    else:
                        return False
            else:
                if simple_serial_matching(answer):
                    answer = re.sub(p1, '', answer)
                answer = latex_process(answer)
                if checkSymbol(answer.strip(PUNCTUATION),
                               value_list=value_list):
                    return True
                else:
                    if typeMatch:
                        self.adderrorList(
                            'zh_type_unmatch_error',
                            sent='type',
                            replace=0,
                            description=zh_check_config['description']
                            ['zh_type_unmatch_error']['判断题'].format(
                                self.type, [1]))
                    else:
                        return False
    else:
        # Original note (bare string below): in non-true/false questions
        # A/B may be choice answers, so A/B are not added as keywords.
        """ 非判断题的答案中存在A/B可能是为选择题等情况。 因此不添加A/B关键字 """
        if not (self.subject == '生物'):
            if value_list is None:
                value_list = ['T', 'F']
            else:
                value_list += ['T', 'F']
        if self.type == '辨析题':
            return True
        else:
            flag = False
            for stem in self.stems:
                if "options" in stem.keys():
                    flag = True
            if flag:
                # Original note (bare string below): Chinese reading /
                # writing questions mix choice and open parts and cannot
                # be told apart reliably, so Chinese is skipped.
                """ 语文中的现代文阅读, 文言文阅读, 综合读写等题目中都是选择题+解答题混合的格式, 无法进行准确分辨,因此跳过语文科目。 """
                if (self.subject == '语文'):
                    return True
                else:
                    if typeMatch:
                        self.adderrorList(
                            'zh_type_unmatch_error',
                            sent='type',
                            replace=0,
                            description=zh_check_config['description']
                            ['zh_type_unmatch_error']['非选择题'].format(
                                self.type))
                    else:
                        return False
            else:
                p1 = re.compile(r'[(\(]\d+[)\)]')
                if len(self.answer) > 1:
                    # Original note (bare string below): with sub-question
                    # numbers, it only counts as true/false when every
                    # sub-answer is a true/false answer.
                    """ 当答案中存在小题号的情况, 我们需要判断是否所有小题的内容都为判断题内容。 否则它并非判断题,因此没有题型不匹配错误。 """
                    count = 0
                    for ans in self.answer:
                        if isinstance(ans, list):
                            ans = ''.join(ans)
                        if simple_serial_matching(answer):
                            ans = re.sub(p1, '', ans)
                        ans = latex_process(ans)
                        if checkSymbol(ans.strip(PUNCTUATION),
                                       value_list=value_list):
                            count += 1
                    if count == len(self.answer):
                        if self.subject == '语文':
                            return True
                        else:
                            if typeMatch:
                                self.adderrorList(
                                    'zh_type_unmatch_error',
                                    sent='type',
                                    replace=0,
                                    description=zh_check_config[
                                        'description']
                                    ['zh_type_unmatch_error']
                                    ['非判断题'].format(self.type))
                            else:
                                return False
                    else:
                        return True
                else:
                    if simple_serial_matching(answer):
                        answer = re.sub(p1, '', answer)
                    answer = latex_process(answer)
                    if checkSymbol(answer.strip(PUNCTUATION),
                                   value_list=value_list):
                        if self.subject == '语文':
                            return True
                        else:
                            if typeMatch:
                                self.adderrorList(
                                    'zh_type_unmatch_error',
                                    sent='type',
                                    replace=0,
                                    description=zh_check_config[
                                        'description']
                                    ['zh_type_unmatch_error']
                                    ['非判断题'].format(self.type))
                            else:
                                return False
                    else:
                        return True
    # Only reached on paths that registered an error and did not return.
    log_server.logging(
        ">>> Type Match finish! Total time is {} <<<".format(time.time() -
                                                             start))
def serial_check(self, serialCheck):
    """
    Report sub-questions that lack a "(1)"-style serial number.

    The stems, the answers and the solutions are inspected in turn. A
    section is examined only when ``simple_serial_matching`` accepts its
    joined text, it uses none of the other numbering styles (bracketed
    Chinese numerals, "一、", Roman numerals, "【...】" headings), and it
    has more than one item. Within such a section an item is reported
    through ``self.adderrorList`` when it contains neither a bracketed
    one- or two-digit number nor a leading "1."-style number, unless it is
    at most 10 characters long and contains a letter A-D (treated as a
    choice answer). Items starting with "【拓展】" are skipped.

    Nothing is checked when ``self.type`` is '选择题'.

    :param serialCheck: the check runs only when this is truthy.
    :return: None.
    """
    if not serialCheck:
        return
    log_server.logging('>>> Checking serial sub question !!! <<<')
    start = time.time()
    checkList = [(self.stems, 'stems'), (self.answer, 'answers'),
                 (self.solution, 'solutions')]

    # Numbering styles that are deliberately not validated; a section that
    # uses any of them is left alone.
    other_numbering = [
        re.compile(r'([\(\(][一二三四五六七八九十][\)\)]{1,2})'),
        re.compile(r'([一二三四五六七八九十]{1,2}、)'),
        re.compile(r'([ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]{1,3})'),
        re.compile(r'(【[\u4e00-\u9fff]+】)'),
    ]
    extension_prefix = re.compile(r'^(【拓展】)')
    solution_prefix = re.compile(r'^(解[::])')
    solution_marker = re.compile(r'(解[::])')
    # Serial numbers wrapped in LaTeX delimiters, e.g. "(${1}$)" / "${(1)}$".
    latex_serial_inner = re.compile(r'^([\(\(](\${\d+\}\$)[\)\)])')
    latex_serial_outer = re.compile(r'^(\$\{[\(\(]\d+[\)\)]\}\$)')
    latex_delimiters = re.compile(r'[\$\{\}\$]')
    bracket_serial = re.compile(r'([\(\(](\d){1,2}[\)\)])')
    dotted_serial = re.compile(r'^((\d){1,2}\.)')
    choice_letters = re.compile('[A-D]{1,4}')

    # NOTE(review): only '选择题' is exempt here, not '单选题'/'多选题' -
    # confirm that this is intended.
    if self.type != '选择题':
        for parts, label in checkList:
            text = pump_list_to_str(parts)
            if not simple_serial_matching(text):
                continue
            if any(p.search(text) is not None for p in other_numbering):
                continue
            if len(parts) <= 1:
                continue
            for i, value in enumerate(parts):
                if isinstance(value, dict):
                    value = pump_list_to_str(value['stem'])
                else:
                    value = pump_list_to_str(value).strip()
                if extension_prefix.search(value) is not None:
                    continue
                if solution_prefix.search(value) is not None:
                    value = solution_marker.sub('', value).strip()
                if (latex_serial_inner.search(value) is not None
                        or latex_serial_outer.search(value) is not None):
                    value = latex_delimiters.sub('', value).strip()
                # A bracketed number anywhere in the item, or a leading
                # "1."-style number, counts as a serial number.
                if (bracket_serial.search(value) is not None
                        or dotted_serial.search(value) is not None):
                    continue
                # A short item containing A-D is taken to be a choice answer.
                if (len(value) <= 10
                        and choice_letters.search(value) is not None):
                    continue
                self.adderrorList(
                    'zh_serial_unmatch_error',
                    label,
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_serial_unmatch_error'].format(label, i + 1, i + 1))
    log_server.logging(
        '>>>Finished serial sub question checking!!! Total time is {}'
        .format(time.time() - start))
def enSymbolCheck(self):
    """
    Detect mixed full-width / half-width English symbols.

    ``self.description`` and ``self.solution`` are each passed through
    ``self.process.enSymbolProcess``. A list is processed item by item
    with the item index (dict items contribute their ``'stem'`` value);
    any other value is processed as a whole with index -1. When a section
    yields source fragments, replacements and positions, one
    'zh_enSymbol_unmatch_error' is recorded through ``self.adderrorList``
    carrying them as ``source`` / ``target`` / ``pos``.

    A section is skipped entirely when ``all(section)`` is false, i.e.
    when it contains an empty item.

    :return: True when no section produced an error, otherwise False.
    """
    log_server.logging(">>> Checking english symbol in sentence!!! <<<")
    start = time.time()
    sections = [(self.description, "description"),
                (self.solution, "solutions")]
    flagged_sections = 0
    for content, label in sections:
        if not all(content):
            continue
        if isinstance(content, list):
            indexed_items = enumerate(content)
        else:
            # A single non-list value is reported under index -1.
            indexed_items = [(-1, content)]

        records, replacements, positions = [], [], []
        for index, item in indexed_items:
            if isinstance(content, list) and isinstance(item, dict):
                item = item['stem']
            flag, found, repl, pos = self.process.enSymbolProcess(
                index, item)
            if flag:
                records.extend(found)
                replacements.extend(repl)
                positions.extend(pos)

        if records and replacements and positions:
            flagged_sections += 1
            self.adderrorList(
                'zh_enSymbol_unmatch_error',
                sent=label,
                replace=1,
                description=zh_check_config['description']
                ['zh_enSymbol_unmatch_error'].format(label),
                source=records,
                target=replacements,
                pos=positions)
    log_server.logging(
        ">>> Finished English symbol checking!!! Total time is {}<<<".
        format(time.time() - start))
    return flagged_sections == 0
def checker(self, grade, subject, query):
    """
    Entry point of the English checker.

    ``self.mode`` selects what is run:

    * 'all functions' runs the symbol, spelling, answer/solution match and
      grammar checks in that order. A check that raises is logged and
      contributes an empty dict; the four results are combined with
      ``merge_dicts``.
    * 'check symbol and full-width', 'check spell', 'check match' and
      'check grammar' run that single check and return its result
      directly; exceptions propagate to the caller.
    * Any other mode returns an empty dict.

    :param grade: joined with ``subject`` as ``grade + '_' + subject`` and
        handed to ``self.check_match``.
    :param subject: see ``grade``.
    :param query: the raw item; passed to ``parse_data`` and to every check.
    :return: the error dict produced for the selected mode.
    """
    log_server.logging(">>> Enter en_checker !")
    all_error_dict = dict()
    grade_subject = grade + '_' + subject

    # Step 1: parse the item; only its id is needed here.
    tid, _, _, _, _, _ = parse_data(query)

    # (single-check mode name, failure log template, deferred check call),
    # listed in the order used by the 'all functions' mode.
    checks = [
        ('check symbol and full-width', 'Symbol check exception: {}',
         lambda: self.check_symbol_format(query)),
        ('check spell', 'Spell check exception: {}',
         lambda: self.check_spell(query)),
        ('check match', 'Check match exception: {}',
         lambda: self.check_match(query, grade_subject)),
        ('check grammar', 'Grammar check exception: {}',
         lambda: self.check_grammar(query)),
    ]

    if self.mode == 'all functions':
        # Steps 2-5: run every check; a failing one is logged and skipped.
        partial_results = []
        for _, failure_template, run_check in checks:
            try:
                outcome = run_check()
            except Exception as e:
                log_server.logging(failure_template.format(e))
                outcome = dict()
            partial_results.append(outcome)
        # Final step: merge the per-check results under the item id.
        all_error_dict = merge_dicts(tid, *partial_results)
        log_server.logging('Finish en checker! All error dict saved.')
        return all_error_dict

    for mode_name, _, run_check in checks:
        if self.mode == mode_name:
            return run_check()
    return all_error_dict
def __call__(self,
             tradWordDetect=False,
             wrongWordDetect=False,
             keywordMatch=False,
             contentMatch=False,
             symbolCheck=False,
             typeMatch=False,
             contTypeMatch=False,
             enSymbolDetect=False,
             serialCheck=False):
    """
    Run the Chinese checker pipeline and return what it collected.

    Stages run in a fixed order. Each stage is isolated: an exception is
    logged and the pipeline continues with the next stage. Between the
    serial-number stage and the bracket stage, ``self.description`` and
    ``self.solution`` are rewritten with ``delTab`` / ``extractLatex``;
    that rewrite is not guarded.

    :param tradWordDetect: forwarded to ``self.spellDetect``.
    :param wrongWordDetect: forwarded to ``self.spellDetect``.
    :param keywordMatch: forwarded to ``self.contentMatch``.
    :param contentMatch: forwarded to ``self.contentMatch``.
    :param symbolCheck: enables the symbol, space and bracket checks.
    :param typeMatch: forwarded to ``self.typeCheck``.
    :param contTypeMatch: forwarded to ``self.contTypeCheck``.
    :param enSymbolDetect: enables ``self.enSymbolCheck``.
    :param serialCheck: enables ``self.serial_check``.
    :return: the tuple ``(self.position, self.errorList)``.
    """
    log_server.logging(">>> Enter zh_checker ! ")

    def guarded(failure_template, stage):
        # Run one stage; log instead of propagating whatever it raises.
        try:
            stage()
        except Exception as e:
            log_server.logging(failure_template.format(e))

    def special_symbol_stage():
        if symbolCheck:
            self.final_check_confusion_symbol()
            self.final_check_space_in_solutions()

    def serial_stage():
        if serialCheck:
            self.serial_check(serialCheck)

    def bracket_stage():
        if symbolCheck:
            self.matchbracket()

    def en_symbol_stage():
        if enSymbolDetect:
            self.enSymbolCheck()

    def type_stage():
        # Content matching only runs when both type checks pass.
        if self.typeCheck(typeMatch) and self.contTypeCheck(contTypeMatch):
            self.contentMatch(keywordMatch, contentMatch)

    def spelling_stage():
        # Wrong-word detection: for Chinese only a few question types are
        # checked; chemistry, maths and geography are never checked.
        if self.subject == '语文':
            wanted = self.type in ['现代文阅读', '写作', '名著阅读']
        else:
            wanted = self.subject not in ['化学', '数学', '地理']
        if wanted:
            self.spellDetect(tradWordDetect, wrongWordDetect)

    guarded('>>>Checking special symbol function exception: {}',
            special_symbol_stage)
    guarded('>>>Checking serial function exception: {}', serial_stage)

    # Normalise the texts before the remaining checks.
    self.description = self.delTab(self.description)
    self.description = self.extractLatex(self.description)
    self.solution = [self.extractLatex(solu) for solu in self.solution]

    guarded('>>>Checking symbol function exception: {}', bracket_stage)
    guarded('>>>Checking enSymbol function exception: {}', en_symbol_stage)
    guarded('>>>Checking type function exception: {}', type_stage)
    guarded('>>>Checking spell and trad function exception: {}',
            spelling_stage)

    log_server.logging("Zh_checker finish ! >>>")
    return self.position, self.errorList