Beispiel #1
0
    def final_check_space_in_solutions(self):
        """
        Look for runs of three or more spaces inside each solution text.

        A run only counts when it sits between two non-space characters, so
        leading and trailing whitespace is ignored. The subjects '数学' and
        '化学' are formula-heavy and are not checked. Spans of the form
        ``${...}$`` are masked with '#' first so that spaces inside formulas
        do not count.

        :return: None. When ``sortList`` yields at least one span, a single
            'zh_format_error' entry is reported through ``self.adderrorList``.
        """
        log_server.logging('>>>>>>Checking extra space in solution<<<<<<')
        began = time.time()
        html_re = re.compile(r'(html\s*{.*}\s*)')
        latex_re = re.compile(r"(\${.*?}\$)")
        gap_re = re.compile(r'(?<=\S) {3,}(?=\S)')

        findings = []
        for idx, text in enumerate(self.solution):
            # Drop an embedded "html {...}" fragment before measuring gaps.
            if html_re.search(text) is not None:
                text = html_re.sub('', text).strip()

            if self.subject in ['数学', '化学']:
                continue

            # Mask every formula with same-length filler so offsets stay valid.
            for formula in latex_re.findall(text):
                text = text.replace(formula, len(formula) * '#')

            spans = [(hit.start(), hit.end()) for hit in gap_re.finditer(text)]
            _, merged = sortList(spans, None)
            findings += [[idx, span] for span in merged]

        if len(findings) > 0:
            self.adderrorList(
                'zh_format_error',
                'solutions',
                replace=0,
                description=zh_check_config['description']['zh_format_error'])
        log_server.logging(
            ">>> Finished space checking! Total time is {}.<<< ".format(
                time.time() - began))
Beispiel #2
0
 def summarize(self, query, symbol_error_dict, spell_error_dict,
               unmatch_error_dict):
     """
     Merge the per-checker error lists of one question into a single dict.

     :param query: raw question payload; ``parse_data`` extracts the
         question id (``tid``) from it
     :param symbol_error_dict: symbol/format errors, indexed by question id
     :param spell_error_dict: spelling errors, indexed by question id
     :param unmatch_error_dict: matching errors, indexed by question id
     :return: ``{tid: [errors...]}`` with the three lists concatenated in
         the order symbol, spell, unmatch
     """
     log_server.logging('>>>>>Start summarize all error type<<<<<')
     # Only the question id is needed from the parsed payload.
     tid, *_ = parse_data(query)

     all_error_list = []
     for error_dict in (symbol_error_dict, spell_error_dict,
                        unmatch_error_dict):
         if not error_dict:
             continue
         # A checker may have results for other ids but none for this one;
         # indexing directly would raise KeyError in that case.
         all_error_list.extend(error_dict.get(tid) or [])

     all_error_dict = {tid: all_error_list}
     print(all_error_dict)
     print('Finish summarizing! All error dict saved.')
     return all_error_dict
Beispiel #3
0
 def __init__(self, mode):
     """
     Build the helper tools used by the English checker.

     :param mode: stored unchanged on ``self.mode``
     """
     self.mode = mode
     self.mc = MatchChecker()
     self.et = EnchantTool()
     nltk_tool = NltkTool()
     self.nt = nltk_tool
     # The multi-word list is fetched once from the NLTK tool at start-up.
     self.multi_word_list = nltk_tool.get_multi_word_list()
     self.st = SequenceTagger()
     self.fc = FormatChecker()
     log_server.logging("Init en_checker over!")
Beispiel #4
0
 def delTab(self, text):
     """
     Remove whitespace, then remove runs of two or more ASCII letters.

     The two passes run in that order, so letters that only become adjacent
     after the whitespace is gone are removed as well.

     :param text: string to clean
     :return: the cleaned string
     """
     log_server.logging(">>> Checking zh deltab error !")
     began = time.time()
     without_blanks = re.compile(r'[\t\n\xa0\s]').sub('', text)
     cleaned = re.compile(r'[a-zA-Z]{2,}').sub('', without_blanks)
     log_server.logging(
         "Deltab finish! Total time is {} >>>".format(time.time() - began))
     return cleaned
Beispiel #5
0
 def check_spell(self, query):
     """
     Run the English spelling check and log how long it took.

     :param query: forwarded unchanged to ``self.et.spell_checker``
     :return: the value returned by ``self.et.spell_checker``
     """
     log_server.logging('>>> checking spelling error ')
     began = time.time()
     found = self.et.spell_checker(query)
     elapsed = time.time() - began
     log_server.logging(
         'Spelling check finish! Total time is {} >>>'.format(elapsed))
     return found
Beispiel #6
0
    def final_check_confusion_symbol(self):
        """
        Final-check pass for mixed-up consecutive punctuation marks.

        Example from the original notes: 。,|。,|!。

        The description, stems and solutions are each scanned with
        ``self.process.specialSymbol``. A plain string is scanned with index
        -1; a list is scanned item by item with the item's position as index
        (dict items contribute their 'stem' value). One 'zh_symbol_error_1'
        entry is reported per field that produced findings.

        :return: None
        """
        log_server.logging(">>> Check confusion symbol !! <<<<")
        began = time.time()
        targets = [(self.description, 'description'), (self.stems, 'stems'),
                   (self.solution, 'solutions')]

        for content, field in targets:
            if isinstance(content, str) and len(content) > 0:
                found, spots = self.process.specialSymbol(content, -1)
            elif isinstance(content, list):
                found, spots = [], []
                for idx, item in enumerate(content):
                    piece = item['stem'] if isinstance(item, dict) else item
                    part_found, part_spots = self.process.specialSymbol(
                        piece, idx)
                    found += part_found
                    spots += part_spots
            else:
                # Empty strings and any other type are not scanned.
                continue

            if len(found) > 0 and len(spots) > 0:
                self.adderrorList(
                    'zh_symbol_error_1',
                    field,
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_symbol_error_1'],
                    source=found,
                    pos=spots)

        log_server.logging(
            ">>> Finished confusion symbol !! Total time is {} <<<<".format(
                time.time() - began))
Beispiel #7
0
 def check_symbol_format(self, query):
     """
     Run the symbol, format and full-/half-width check and log its duration.

     :param query: forwarded unchanged to ``self.fc.format_checker``
     :return: the value returned by ``self.fc.format_checker``
     """
     log_server.logging(">>> checking symbol and full-width error ")
     began = time.time()
     found = self.fc.format_checker(query)
     elapsed = time.time() - began
     log_server.logging(
         'Symbol and full-width check finish and saved! Total time is {} >>>'
         .format(elapsed))
     return found
Beispiel #8
0
 def check_match(self, query, grade_subject):
     """
     Check that answers, solutions and related parts correspond.

     :param query: forwarded unchanged to ``self.mc.parser``
     :param grade_subject: forwarded unchanged to ``self.mc.parser``
     :return: the value returned by ``self.mc.parser``
     """
     log_server.logging(">>> Checking matching error")
     began = time.time()
     found = self.mc.parser(query, grade_subject)
     elapsed = time.time() - began
     log_server.logging(
         'Matching check finish and saved! Total time is {} >>>'.format(
             elapsed))
     return found
    def parse(self, data):
        """
        Dispatch one request to the English or the Chinese checker.

        :param data: mapping with the keys 'grade', 'subject' and 'query'
        :return: for subject 'english', whatever ``self.en_checker.checker``
            returns; otherwise a list holding the Chinese checker's error
            collection, or an empty list when that collection is empty
        """
        log_server.logging('====== Allocation Checker! ======')
        grade, subject, query = data['grade'], data['subject'], data['query']
        query = self.text_filter(query)

        # English never touches the Chinese spell-check models, so return
        # before selecting one.
        if subject == 'english':
            return self.en_checker.checker(grade, subject, query)

        # Choose the spell-check model by subject family; subjects outside
        # these groups run without one.
        if subject in ['history', 'politics', 'chinese']:
            faspell = self.com_spellchecker
        elif subject in ['biology']:
            faspell = self.bio_spellchecker
        elif subject in ['physics', 'math', 'chemistry']:
            faspell = self.science_spellchecker
        else:
            faspell = None

        self.zh_checker = ZhCheck(dictionary=query,
                                  subject=config['trans_subject'][subject],
                                  lac=self.lac_mode,
                                  faspell=faspell,
                                  ancient=self.ancient)
        # The checker also returns positions, but nothing here consumes them.
        _, errors = self.zh_checker(tradWordDetect=self.tradWordDetect,
                                    wrongWordDetect=self.wrongWordDetect,
                                    keywordMatch=self.keywordMatch,
                                    contentMatch=self.contentMatch,
                                    symbolCheck=self.symbolCheck,
                                    typeMatch=self.typeMatch,
                                    contTypeMatch=self.contTypeMatch,
                                    enSymbolDetect=self.enSymbolDetect,
                                    serialCheck=self.serialCheck)

        return [errors] if len(errors) > 0 else []
Beispiel #10
0
    def matchbracket(self):
        """
        Check that brackets and quotation marks are properly paired.

        The description, stems and solutions are each passed through
        ``self.process.bracketMatch``. List fields are checked item by item
        with the item's position as index (dict items contribute their
        'stem' value); any other non-empty field is checked as a whole with
        index -1. One 'zh_symbol_error' entry is reported per field that
        produced findings.

        :return: None
        """
        log_server.logging(">>> Checking Bracket <<<")
        began = time.time()
        targets = [(self.description, 'description'), (self.stems, 'stems'),
                   (self.solution, 'solutions')]

        for content, field in targets:
            if isinstance(content, list):
                symbol, position = [], []
                for idx, item in enumerate(content):
                    piece = item['stem'] if isinstance(item, dict) else item
                    part_symbol, part_position = self.process.bracketMatch(
                        piece, idx)
                    symbol += part_symbol
                    position += part_position
            elif len(content) > 0:
                symbol, position = self.process.bracketMatch(content, -1)
            else:
                continue

            if len(symbol) > 0 and len(position) > 0:
                self.adderrorList(
                    'zh_symbol_error',
                    sent=field,
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_symbol_error'],
                    source=symbol,
                    pos=position)

        log_server.logging(
            '>>> Finished bracket checking!! Total time is {}<<<'.format(
                time.time() - began))
Beispiel #11
0
def get_glove_vocab(filename):
    """Load the vocabulary of a GloVe vectors file.

    The first space-separated field of every line is taken as the word.

    Args:
        filename: path to the glove vectors

    Returns:
        vocab: set() of strings
    """
    log_server.logging("Building vocab...")
    # Read as UTF-8 explicitly: without an encoding the result depends on
    # the platform locale and non-ASCII tokens may fail to decode.
    with open(filename, encoding="utf-8") as f:
        vocab = {line.strip().split(' ')[0] for line in f}
    log_server.logging("- done. {} tokens".format(len(vocab)))
    return vocab
Beispiel #12
0
def get_vocabs(datasets):
    """Collect every word and every tag that occurs in the given datasets.

    Args:
        datasets: iterable of datasets; each dataset yields
            ``(words, tags)`` pairs

    Returns:
        a ``(vocab_words, vocab_tags)`` tuple of two sets

    """
    log_server.logging("Building vocab...")
    word_set, tag_set = set(), set()
    for corpus in datasets:
        for tokens, labels in corpus:
            word_set.update(tokens)
            tag_set.update(labels)
    # Only the word count is logged, matching the historical message.
    log_server.logging("- done. {} tokens".format(len(word_set)))
    return word_set, tag_set
Beispiel #13
0
    def extractLatex(self, text):
        """
        Record where each LaTeX formula occurs in ``text`` and mask it out.

        Every ``$...$`` span is treated as a formula, except spans that only
        hold option letters (``${A}$`` .. ``${ABCD}$`` style) or a lone
        ``\\times``. For each remaining formula every occurrence is appended
        to ``self.position[formula]`` as a dict with the keys 'id',
        'sources' and 'index', and the formula is then replaced by '&'
        characters of the same length so later offsets stay valid.

        :param text: the sentence to scan
        :return: ``text`` with the recorded formulas masked by '&'
        """

        log_server.logging(">>> Extrace Latex !")
        start = time.time()
        formulas = re.findall(r'\$.+?\$', text)
        # These patterns carry no capture group so findall returns the whole
        # match as a string. With nested groups it returns tuples, which
        # never compare equal to the strings in `formulas`.
        option_letters = re.findall(r'\$\{\s*[A-F]{1,4}\s*\}\$', text)
        times_signs = re.findall(r'\$\{\s*\\times\s*\}\$', text)
        remaining = set(formulas).difference(option_letters, times_signs)

        for formula in remaining:
            index = text.find(formula)
            while index != -1:
                occurrence = {
                    'id': self.id,
                    "sources": text,
                    "index": index,
                }
                # Each formula gets its own list; sharing one list between
                # formulas would mix their occurrences together.
                self.position.setdefault(formula, []).append(occurrence)

                # Continue with the next occurrence of the same formula.
                index = text.find(formula, index + 1)

            # Mask the formula with same-length '&' filler.
            text = text.replace(formula, len(formula) * '&')

        log_server.logging(
            "Extrace Latex finish! Total time is {} >>>".format(time.time() -
                                                                start))
        return text
    def parse_spellDetect(self, data):
        """
        Run wrong-character detection on every sentence of the request.

        Each entry of the query is split into sentences. A sentence is
        checked only when the ancient-text classifier does not flag it and
        its preprocessed length lies strictly between 10 and 100.

        :param data: mapping with the keys 'subject' and 'query'
        :return: ``{'error_detail': [...]}`` with one post-processed entry
            per sentence that produced findings
        """
        log_server.logging('====== Allocation spell Detect! ======')
        zh_error_set = []
        subject, query = data['subject'], data['query']
        query = self.text_filter(query)

        # NOTE(review): for subjects outside these groups `faspell` stays
        # None and the first qualifying sentence raises AttributeError;
        # left as is because callers may rely on that failure.
        if subject in ['history', 'politics', 'chinese']:
            faspell = self.com_spellchecker
        elif subject in ['biology']:
            faspell = self.bio_spellchecker
        elif subject in ['physics', 'math', 'chemistry']:
            faspell = self.science_spellchecker
        else:
            faspell = None

        subject = config['trans_subject'][subject]

        for ques in query:
            sentences = split_sentence(ques)
            process = Data_Process(subject)
            for sentence in sentences:
                if self.ancient.detect(sentence):
                    continue
                cleaned = process.numProcess(preprocess(sentence))
                if not 10 < len(cleaned) < 100:
                    continue
                result = faspell.make_corrections([cleaned])
                # Post-process the raw wrong-character detection result.
                detail = process.wordProcess(result)
                if len(detail) > 0:
                    zh_error_set.append(process.ocr_spell_process(detail))

        log_server.logging('====== Spell Detect Finished! ======')
        # Build a fresh result dict instead of writing into the last
        # per-sentence detail object.
        return {'error_detail': zh_error_set}
Beispiel #15
0
def write_vocab(vocab, filename):
    """Writes a vocab to a file

    Writes one word per line, with no trailing newline after the last word.

    Args:
        vocab: iterable that yields word
        filename: path to vocab file

    Returns:
        None; the words are written to ``filename``

    """
    log_server.logging("Writing vocab...")
    # Materialise once so generators (which have no len()) work too, and
    # format every word the same way, including the last one.
    lines = ["{}".format(word) for word in vocab]
    with open(filename, "w") as f:
        f.write("\n".join(lines))
    log_server.logging("- done. {} tokens".format(len(lines)))
Beispiel #16
0
    def __init__(self, config, model_path, max_length):
        """
        Configure a TensorFlow session, load the model and build the
        input processor.

        :param config: configuration object; ``init_checkpoint`` and
            ``max_seq_length`` are overwritten on it, and ``vocab_file`` is
            read from it
        :param model_path: checkpoint path stored as
            ``config.init_checkpoint``
        :param max_length: stored as ``config.max_seq_length``
        """
        # The caller's config object is mutated in place, not copied.
        self.config = config
        self.config.init_checkpoint = model_path
        self.config.max_seq_length = max_length

        log_server.logging(self.config.init_checkpoint)
        log_server.logging(self.config.max_seq_length)
        # NOTE(review): tf.reset_default_graph() is documented as returning
        # None, so self.graph presumably ends up None - confirm whether
        # anything reads it.
        self.graph = tf.reset_default_graph()
        # create session
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False,
                                      intra_op_parallelism_threads=4,
                                      inter_op_parallelism_threads=4)
        # GPU memory: grow on demand, with a 0.9 per-process fraction set.
        session_conf.gpu_options.allow_growth = True
        session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9
        self.session = tf.Session(config=session_conf)
        # set_session(self.session)
        # load model
        # tf.compat.v1.disable_eager_execution()
        self.model = self.load_model(config)
        # Variables are initialised only after the model has been built.
        self.session.run(tf.global_variables_initializer())
        log_server.logging(
            "-----------load {} sucess------------".format(model_path))
        self.processor = Processor(config.vocab_file, config.max_seq_length)
Beispiel #17
0
    def ansContMatch(self):
        """
        Match stem-plus-option content against the solution text.

        Case 1: if any option is empty the check is skipped (None).
        Case 2: options and answers have the same length - answer ``i`` is
            checked against option/solution ``i``. An answer that is a list
            is checked element by element.
        Case 3: a single option and a single solution but several answers -
            every answer is checked against index 0.
        Case 4: any other length combination is only logged.

        Each check goes through ``self.optSolu``, which receives and returns
        the running ``count``.

        :return: None when the options are empty, when ``count`` ended up
            None, or when ``count`` is positive; otherwise ``self.ErrorSet``
            holding one 'zh_qos_unmatch_error' entry
        """
        self.ErrorSet = []
        count = 0

        # Case 1: empty option content, nothing to compare.
        if not all(self.option):
            return None

        if len(self.option) == len(self.answer):
            # Case 2: answers line up with options one to one.
            pairs = enumerate(self.answer)
        elif (len(self.option) < len(self.answer)
              and len(self.option) == 1
              and len(self.solution) < len(self.answer)
              and len(self.solution) == 1):
            # Case 3: every answer refers to the single option/solution.
            pairs = ((0, ans) for ans in self.answer)
        else:
            # Case 4: unsupported shape, record it and check nothing.
            log_server.logging('specialCase', len(self.option),
                               len(self.answer), len(self.solution))
            pairs = ()

        for opt_index, ans in pairs:
            candidates = ans if type(ans) is list else [ans]
            for candidate in candidates:
                count = self.optSolu(self.removeSub(candidate), opt_index,
                                     count)

        if count is None:
            return None
        if count > 0:
            return None

        # Nothing matched: report a stem/option vs. solution mismatch.
        self.ErrorSet.append(['(题干加选项内容)与解答不匹配', 'zh_qos_unmatch_error'])
        return self.ErrorSet
    def __init__(self):
        """
        Read the check switches from ``config`` and build the sub-checkers.

        Sets up, in order: the Chinese text-check switches, the LAC segmenter
        with its custom dictionary, the ancient-text classifier, three
        ``SpellChecker`` instances (literal, biology, science), the English
        checker, and the formula-check mode lists.
        """
        log_server.logging('===============Init Checker!===============')
        zh_conf = config["zh_check_config"]
        switches = zh_conf['chinese_textQ_config']
        self.tradWordDetect = switches["tradWordDetect"]
        self.wrongWordDetect = switches["wrongWordDetect"]
        self.keywordMatch = switches["keywordMatch"]
        self.contentMatch = switches["contentMatch"]
        self.symbolCheck = switches["symbolCheck"]
        self.typeMatch = switches["typeMatch"]
        self.contTypeMatch = switches["contTypeMatch"]
        self.enSymbolDetect = switches["enSymbolDetect"]
        self.serialCheck = switches["serialCheck"]

        log_server.logging('>>> 1.Zh_checker Initializing !')
        lac_conf = zh_conf['lac_segment']
        self.lac_mode = LAC(mode=lac_conf['lac_mode'])
        self.lac_mode.load_customization(lac_conf['word_loc'])
        self.ancient = AncientClassifier()

        faspell_conf = config['faspell_config']
        self.com_spellchecker = SpellChecker(
            faspell_conf["literal"]['model'],
            faspell_conf["literal"]['max_seq_length'])
        self.bio_spellchecker = SpellChecker(
            faspell_conf["biology"]['model'],
            faspell_conf["biology"]['max_seq_length'])
        self.science_spellchecker = SpellChecker(
            faspell_conf["science"]['model'],
            faspell_conf["science"]['max_seq_length'])

        log_server.logging(self.com_spellchecker)
        log_server.logging('Zh_checker Initialization over ! >>>')

        log_server.logging('>>> 2.En_checker Initializing !')
        self.en_selection_mode = [
            'check symbol and full-width', 'check spell', 'check match',
            'check grammar', 'all functions'
        ]
        # Index 4 selects 'all functions'.
        self.en_checker = EnCheck(self.en_selection_mode[4])
        log_server.logging('En_checker Initialization over ! >>>')

        log_server.logging('>>> 3.Fm_checker Initializing !')
        self.fm_mode = [
            'symbol_repeat', 'illegal_symbol', 'func_name', 'brackets',
            'latex2mathml', 'texcheck', 'textidote', 'LaTeXEqChecker',
            'mathJax'
        ]
        self.fm_select_mode = [
            'func_name', 'symbol_repeat', 'latex2mathml', 'textidote',
            'illegal_symbol'
        ]
        # The formula checker itself is not instantiated here; only the
        # mode lists and the clean-up pattern are kept.
        self.re_ = re.compile(r'(\\rm)|</?[^<>]+>')
        log_server.logging('Fm_checker Initialization over ! >>>')

        log_server.logging('=============Init Checker Over!=============')
def text_quality_check():
    """
    Main interface route: run the full quality check on the posted JSON.

    Steps: decode and parse the request body, validate it with
    ``check_param``, then run ``controller.parse`` while holding the usage
    counter's lock. Inside the same lock the counter is incremented and a
    usage summary is written to a per-start-date file under 'access_date'.

    :return: ``jsonify(result)``; ``result`` carries 'code' and 'message'
        and, on success, the check output under 'data'
    """
    log_server.logging('============Enter Checker System !===========')
    result = {
        'code': config['RETURN_CODE']['OK'],
        'message': config['RETURN_MSG']['OK'],
        'en_data': dict()
    }

    # Step 1: decode the body and parse it as JSON.
    try:
        raw_body = request.data
        if isinstance(raw_body, bytes):
            raw_body = raw_body.decode()
        query_json = json.loads(raw_body)
    except Exception as err:
        log_server.logging('Please check input en_data: {}'.format(err))
        result['code'] = config['RETURN_CODE']['DATA_FORMAT_ERROR']
        result['message'] = config['RETURN_MSG']['DATA_FORMAT_ERROR']
        return jsonify(result)

    # Step 2: validate the payload structure.
    if not check_param(query_json):
        log_server.logging('Data Format Error: {}'.format(query_json))
        result['code'] = config['RETURN_CODE']['PARAMETERS_ERROR']
        result['message'] = config['RETURN_MSG']['PARAMETERS_ERROR']
        return jsonify(result)

    # Step 3: run the checkers and update the usage statistics.
    began = time.time()
    try:
        with counter.get_lock():
            check_result = controller.parse(query_json)
            counter.value += 1
            start_day = start_time[0:10]
            stats_path = os.path.join(os.getcwd() + '/access_date',
                                      ''.join(start_day.split('-')) + '.txt')
            with open(stats_path, 'w', encoding='utf-8') as stats_file:
                localtime = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.localtime())
                st = start_day.split('-')
                end = localtime[0:10].split('-')
                today = datetime.datetime(int(end[0]), int(end[1]),
                                          int(end[2]))
                first_day = datetime.datetime(int(st[0]), int(st[1]),
                                              int(st[2]))
                spend_time = (today - first_day).days
                stats_file.write(
                    "开始时间: {}, 当前时间: {}, 共 {} 天, 使用量 {} 次".format(
                        start_time, localtime, spend_time,
                        str(counter.value)))
    except Exception as err:
        log_server.logging('Predict Error: {}'.format(err))
        result['code'] = config['RETURN_CODE']['HANDLE_ERROR']
        result['message'] = config['RETURN_MSG']['HANDLE_ERROR']
        return jsonify(result)

    finished = time.time()
    result['data'] = check_result
    log_server.logging(
        '>>>>>>>> Time of the whole process: {:.6f}'.format(finished - began))
    log_server.logging('============Exit Checker System !===========')
    return jsonify(result)
def tqc_spell_check():
    """
    Interface route for the stand-alone spell check.

    Steps: decode and parse the request body, validate it with
    ``check_param_1``, then run ``controller.parse_spellDetect``.

    :return: ``jsonify(result)``; ``result`` carries 'code' and 'message'
        and, on success, the detection output under 'data'
    """
    log_server.logging('============Enter Spell Checker System !===========')
    result = {
        'code': config['RETURN_CODE']['OK'],
        'message': config['RETURN_MSG']['OK'],
        'en_data': dict()
    }

    # Step 1: decode the body and parse it as JSON.
    try:
        raw_body = request.data
        if isinstance(raw_body, bytes):
            raw_body = raw_body.decode()
        query_json = json.loads(raw_body)
    except Exception as err:
        log_server.logging('Please check input en_data: {}'.format(err))
        result['code'] = config['RETURN_CODE']['DATA_FORMAT_ERROR']
        result['message'] = config['RETURN_MSG']['DATA_FORMAT_ERROR']
        return jsonify(result)

    # Step 2: validate the payload structure.
    if not check_param_1(query_json):
        log_server.logging('Data Format Error: {}'.format(query_json))
        result['code'] = config['RETURN_CODE']['PARAMETERS_ERROR']
        result['message'] = config['RETURN_MSG']['PARAMETERS_ERROR']
        return jsonify(result)

    # Step 3: run the spell detection.
    began = time.time()
    try:
        check_result = controller.parse_spellDetect(query_json)
    except Exception as err:
        log_server.logging('Predict Error: {}'.format(err))
        result['code'] = config['RETURN_CODE']['HANDLE_ERROR']
        result['message'] = config['RETURN_MSG']['HANDLE_ERROR']
        return jsonify(result)

    finished = time.time()
    result['data'] = check_result
    log_server.logging(
        '>>>>>>>> Time of the whole process: {:.6f}'.format(finished - began))
    log_server.logging('============Exit Checker System !===========')
    return jsonify(result)
Beispiel #21
0
    def contentMatch(self, keywordMatch, contentMatch):
        """
        Cross-check stem, answers, options and solutions by question type.

        Choice questions ('选择题', '多选题', '单选题'): with ``keywordMatch``
            the answer-letter check ``multiChoice`` runs first, and
            ``ansContMatch`` follows only if it returned None and
            ``contentMatch`` is set; with ``contentMatch`` alone only
            ``ansContMatch`` runs.
        Free-text types (material analysis, short answer, experiment,
            essay, choice-fill, problem solving, fill-in): ``shortAns`` runs
            when ``contentMatch`` is set, except for subject '数学'.
        True/false ('判断题'): keyword check via ``qa_matching``, skipped for
            subject '地理'.
        Discrimination ('辨析题'): keyword check via ``qa_matching`` when
            ``checkSymbol(self.answer)`` holds, otherwise an answer/question
            check via ``self.aqc.ansMatch``.

        :param keywordMatch: switch for the keyword-based checks
        :param contentMatch: switch for the content-based checks
        :return: None; findings are reported through ``self.adderrorList``
        """
        log_server.logging(">>> Checking zh keywords match error !")
        began = time.time()

        def report_keyword_mismatch():
            # Compare answer and solution keywords. A result whose
            # description says no keyword was found is not reported.
            result = qa_matching(self.answer,
                                 self.solution,
                                 subject=self.subject)
            if (result is not None and len(result) > 0
                    and result['description'] != 'solution匹配不到关键字'):
                self.adderrorList(result['errorType'],
                                  sent='answers-solutions',
                                  replace=0,
                                  description=result['description'])

        if self.type in ('选择题', '多选题', '单选题'):
            if keywordMatch:
                errorAns = self.multiChoice()
                if (errorAns is None) & contentMatch:
                    self.ansContMatch()
            elif contentMatch:
                self.ansContMatch()

        elif self.type in ('材料分析题', '简答题', '实验探究题', '论述题',
                           '选择填空题', '解答题', '填空题'):
            # Mathematics is skipped for the free-text types.
            if contentMatch and self.subject != '数学':
                self.shortAns()

        elif self.type == '判断题':
            # Geography solutions often lack an explicit right/wrong wording,
            # so that subject is skipped.
            if keywordMatch and self.subject != '地理':
                report_keyword_mismatch()

        elif self.type == '辨析题':
            if checkSymbol(self.answer):
                if keywordMatch:
                    report_keyword_mismatch()
            elif contentMatch:
                errorAns = self.aqc.ansMatch(self.answer, self.question)
                if errorAns is not None:
                    for errA in errorAns:
                        self.adderrorList(errA[1],
                                          sent='answers-solutions',
                                          replace=0,
                                          description=errA[0])

        log_server.logging(
            "Keywords match finish! Total time is {} >>>".format(time.time() -
                                                                 began))
Beispiel #22
0
    def spellDetect(self, tradWord, wrongWord):
        """
        Wrong-word detection: traditional-character and misspelling checks.

        Walks over the description, the stems and (for non-science subjects)
        the solutions. Findings are recorded through ``self.adderrorList`` as
        'zh_spell_error' and/or 'zh_font_error'. Exceptions raised by the
        underlying processors are logged and swallowed so one failing text
        does not abort the remaining ones.

        :param tradWord: truthy to report traditional characters.
        :param wrongWord: truthy to run the spell checker (which also
            receives ``tradWord``).
        :return: None
        """
        log_server.logging(">>> Detecting wrong words! <<<")
        start = time.time()

        # Science solutions are mostly formulas, so they are excluded from
        # wrong-word detection.
        with_solutions = self.subject not in ['物理', '化学', '数学']
        fields = [(self.description, 'description'), (self.stems, 'stems')]
        if with_solutions:
            fields.append((self.solution, 'solutions'))

        spell_failure = '>>>Checking spell function exception: {}'
        trad_failure = '>>>Checking trad function exception: {}'

        if tradWord | wrongWord:
            for content, field in fields:
                source, target, position, trad_list = [], [], [], []

                if isinstance(content, str) and all(content):
                    # The whole field is a single string: one call, index -1.
                    if wrongWord:
                        try:
                            (trad_list, source, target,
                             position) = self.process.spell_process(
                                 content, -1, self.ancient, self.spellchecker,
                                 self.description, tradWord)
                        except Exception as err:
                            log_server.logging(spell_failure.format(err))
                    elif tradWord:
                        try:
                            trad_list += self.process.tradProcess(content, -1)
                        except Exception as err:
                            log_server.logging(trad_failure.format(err))
                elif all(content):
                    # A list of items; each item is either a dict carrying a
                    # 'stem' entry or a plain text.
                    for idx, item in enumerate(content):
                        is_stem = isinstance(item, dict)
                        if wrongWord:
                            try:
                                # NOTE(review): the reference text is taken
                                # from self.stems / self.solution regardless
                                # of which field is being walked - confirm.
                                if is_stem:
                                    outcome = self.process.spell_process(
                                        item['stem'], idx, self.ancient,
                                        self.spellchecker,
                                        self.stems[idx]['stem'], tradWord)
                                else:
                                    outcome = self.process.spell_process(
                                        item, idx, self.ancient,
                                        self.spellchecker,
                                        self.solution[idx], tradWord)
                                found_trad, srcs, tgts, spans = outcome
                                source += srcs
                                target += tgts
                                position += spans
                                trad_list += found_trad
                            except Exception as err:
                                log_server.logging(spell_failure.format(err))
                        elif tradWord or not is_stem:
                            try:
                                piece = item['stem'] if is_stem else item
                                trad_list += self.process.tradProcess(
                                    piece, idx)
                            except Exception as err:
                                log_server.logging(trad_failure.format(err))

                if source and target and position:
                    # NOTE(review): the template is formatted with the field
                    # content, not its label - verify against the config.
                    self.adderrorList(
                        'zh_spell_error',
                        sent=field,
                        replace=1,
                        description=zh_check_config['description']
                        ['zh_spell_error'].format(content),
                        source=source,
                        target=target,
                        pos=position)

                if (tradWord and trad_list is not None
                        and len(trad_list) > 0):
                    # Each entry is indexed as [0]=source, [1]=target,
                    # [2]=offset, [3]=first element of the position pair.
                    self.adderrorList(
                        'zh_font_error',
                        sent=field,
                        replace=1,
                        description=zh_check_config['description']
                        ['zh_font_error'].format(content),
                        source=[trad[0] for trad in trad_list],
                        target=[trad[1] for trad in trad_list],
                        pos=[[trad[3], (trad[2], trad[2] + 1)]
                             for trad in trad_list])

        log_server.logging(
            ">>>  Wrong words detection finish! Total time is {} <<<".format(
                time.time() - start))
Beispiel #23
0
    def contTypeCheck(self, contTypeMatch):
        """
        Validate that the answer format fits the question type.

        For true/false questions ('判断题') the answer must pass
        ``checkSymbol(..., special=True)`` or the per-subject keyword check.
        For choice questions the answer must contain at least one ASCII
        letter. Nothing is checked when ``simple_serial_matching`` accepts
        the answer.

        :param contTypeMatch: when truthy, mismatches are recorded through
            ``self.adderrorList``; when falsy they only yield ``False``.
        :return: True when the format is accepted, False when it is rejected
            and ``contTypeMatch`` is falsy, None when an error was recorded.
        """
        def style_texts():
            # Looked up lazily so the config is only touched when reporting.
            return zh_check_config['description'][
                'zh_type_style_unmatch_error']

        def report(description, **extra):
            self.adderrorList('zh_type_style_unmatch_error',
                              sent='type',
                              replace=0,
                              description=description,
                              **extra)

        def judge_by_subject(raw_answer, record_error):
            # Politics is never checked; geography and biology each accept a
            # fixed keyword pair; every other subject is a mismatch.
            if self.subject == '政治':
                return True
            if self.subject == '地理':
                accepted = ['对', '错']
            elif self.subject == '生物':
                accepted = ['T', 'F']
            else:
                accepted = None
            if accepted is not None and raw_answer.strip(
                    PUNCTUATION) in accepted:
                return True
            if not record_error:
                return False
            report(style_texts()['判断题'].format(self.type))
            # Implicit None after recording the error.

        log_server.logging(">>> Checking the format of the question! <<<")
        start = time.time()
        letters = re.compile(r'[A-Za-z]')
        answer = pump_list_to_str(self.answer)

        if simple_serial_matching(answer):
            return True

        if not all(self.solution):
            if not contTypeMatch:
                return False
            # Recording this error does not stop the type-specific checks.
            report(style_texts()['选择题']['解答'],
                   source='解答:{}'.format("text"))

        if self.type == '判断题':
            if len(answer) > 1:
                # NOTE(review): both outcomes return, so only the first
                # element of `answer` is ever inspected - confirm intent.
                for ans in answer:
                    if isinstance(ans, list):
                        ans = ''.join(ans)
                    ans = latex_process(ans)
                    if checkSymbol(ans.strip(PUNCTUATION), special=True):
                        return True
                    return judge_by_subject(answer, contTypeMatch)
            else:
                answer = latex_process(answer)
                if checkSymbol(answer.strip(PUNCTUATION), special=True):
                    return True
                return judge_by_subject(answer, contTypeMatch)
        elif self.type in ('选择题', '多选题', '单选题'):
            if letters.findall(answer):
                return True
            if not contTypeMatch:
                return False
            report(style_texts()['选择题']['答案'].format(self.type))
        else:
            return True

        # Only reached after an error has been recorded above.
        log_server.logging(
            ">>> Format checking finished! Total time is {} <<<".format(
                time.time() - start))
Beispiel #24
0
    def typeCheck(self, typeMatch):
        """
        Check that the declared question type matches the content.

        * Choice questions need at least one stem with non-empty options.
        * True/false questions need every answer to match a judgement
          keyword.
        * Any other type (except '辨析题') is a mismatch when a stem carries
          options or when all answers look like judgement keywords; Chinese
          ('语文') is exempt from both.

        :param typeMatch: errors are recorded (and shown) only when this is
            truthy; otherwise a mismatch yields ``False``.
        :return: True on a match, False on a mismatch with ``typeMatch``
            falsy, None when an error was recorded.
        """
        log_server.logging(">>> Checking the type match !!!<<<")
        start = time.time()

        def report(kind, *fmt_args):
            self.adderrorList(
                'zh_type_unmatch_error',
                sent='type',
                replace=0,
                description=zh_check_config['description']
                ['zh_type_unmatch_error'][kind].format(*fmt_args))

        def is_judgement(text):
            # Sub-question numbers are stripped when the WHOLE answer matches
            # the serial pattern; `value_list` is read at call time.
            if simple_serial_matching(answer):
                text = numbering.sub('', text)
            text = latex_process(text)
            return checkSymbol(text.strip(PUNCTUATION),
                               value_list=value_list)

        numbering = re.compile(r'[(\(]\d+[)\)]')
        answer = pump_list_to_str(self.answer)
        value_list = ['对', '错'] if self.subject == '地理' else None

        if self.type in ('选择题', '多选题', '单选题'):
            # Every stem is inspected (no short-circuit), as before.
            filled = [
                all(stem['options']) for stem in self.stems
                if "options" in stem.keys()
            ]
            if any(filled):
                return True
            if not typeMatch:
                return False
            report('选择题', self.type)
        elif self.type == '判断题':
            # Biology is special: judgement answers may use T/F or A/B.
            # With several sub-questions every one must carry a judgement
            # keyword, otherwise a mismatch is reported.
            if self.subject == '政治':
                return True
            value_list = ([] if value_list is None else value_list) + [
                'T', 'F', 'A', 'B', '对', '错'
            ]
            if len(self.answer) > 1:
                verdicts = [
                    is_judgement(pump_list_to_str(ans)) for ans in self.answer
                ]
                record = [
                    pos for pos, ok in enumerate(verdicts, start=1) if not ok
                ]
                if len(verdicts) - len(record) == len(self.answer):
                    return True
                if not typeMatch:
                    return False
                report('判断题', self.type, record)
            else:
                if is_judgement(answer):
                    return True
                if not typeMatch:
                    return False
                report('判断题', self.type, [1])
        else:
            # A/B in a non-judgement answer may simply be choice letters, so
            # only T/F are added as keywords here (and not for biology).
            if self.subject != '生物':
                value_list = ([] if value_list is None else
                              value_list) + ['T', 'F']
            if self.type == '辨析题':
                return True

            has_options = any(
                ["options" in stem.keys() for stem in self.stems])
            if has_options:
                # Chinese reading/writing items mix choice and essay parts
                # and cannot be told apart reliably, so Chinese is skipped.
                if self.subject == '语文':
                    return True
                if not typeMatch:
                    return False
                report('非选择题', self.type)
            else:
                if len(self.answer) > 1:
                    # With sub-questions, the item only counts as a
                    # judgement question when every sub-answer matches.
                    hits = sum(
                        1 for ans in self.answer
                        if is_judgement(''.join(ans) if isinstance(ans, list)
                                        else ans))
                    judgement_like = hits == len(self.answer)
                else:
                    judgement_like = is_judgement(answer)

                if not judgement_like:
                    return True
                if self.subject == '语文':
                    return True
                if not typeMatch:
                    return False
                report('非判断题', self.type)

        # Only reached after an error has been recorded above.
        log_server.logging(
            ">>>  Type Match finish! Total time is {} <<<".format(time.time() -
                                                                  start))
Beispiel #25
0
    def serial_check(self, serialCheck):
        """
        Detect sub-questions that lack a serial number.

        Stems, answers and solutions are inspected in turn. A field is only
        examined when ``simple_serial_matching`` accepts its flattened text
        and it uses none of the alternative numbering styles matched below
        (bracketed or enumerated Chinese numerals, Roman numerals, 【…】
        headings). Within such a field every item must contain a bracketed
        1-2 digit number or start with ``N.``; items of at most 10
        characters containing A-D letters are tolerated as choice answers.

        :param serialCheck: nothing is done when this is falsy.
        :return: None
        """
        if not serialCheck:
            return

        log_server.logging('>>> Checking serial sub question !!! <<<')
        start = time.time()
        targets = [(self.stems, 'stems'), (self.answer, 'answers'),
                   (self.solution, 'solutions')]
        if self.type == '选择题':
            # Choice questions are skipped entirely (no finish log either).
            return

        other_numbering = (
            r'([\(\(][一二三四五六七八九十][\)\)]{1,2})',
            r'([一二三四五六七八九十]{1,2}、)',
            r'([ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]{1,3})',
            r'(【[\u4e00-\u9fff]+】)',
        )

        for content, field in targets:
            text = pump_list_to_str(content)
            if not simple_serial_matching(text):
                continue
            found_other = [re.findall(rx, text) for rx in other_numbering]
            if any(found_other):
                continue
            if len(content) <= 1:
                continue

            for idx, value in enumerate(content):
                if isinstance(value, dict):
                    value = pump_list_to_str(value['stem'])
                else:
                    value = pump_list_to_str(value).strip()

                if re.search(r'^(【拓展】)', value) is not None:
                    continue

                # Drop a leading "解:" marker before looking for the number.
                if re.search(r'^(解[::])', value) is not None:
                    value = re.sub(r'(解[::])', '', value).strip()

                # A number wrapped in ${...}$ has the wrapper characters
                # removed so the plain patterns below can see it.
                wrapped_inner = re.search(r'^([\(\(](\${\d+\}\$)[\)\)])',
                                          value)
                wrapped_outer = re.search(r'^(\$\{[\(\(]\d+[\)\)]\}\$)',
                                          value)
                if wrapped_inner is not None or wrapped_outer is not None:
                    value = re.sub(r'[\$\{\}\$]', '', value).strip()

                leading = re.search(r'^([\(\(](\d){1,2}[\)\)])', value)
                anywhere = re.search(r'([\(\(](\d){1,2}[\)\)])', value)
                dotted = re.search(r'^((\d){1,2}\.)', value)
                if (dotted is not None or leading is not None
                        or anywhere is not None):
                    continue

                # Short items made of option letters are choice answers.
                if len(value) <= 10 and re.findall('[A-D]{1,4}', value):
                    continue

                self.adderrorList(
                    'zh_serial_unmatch_error',
                    field,
                    replace=0,
                    description=zh_check_config['description']
                    ['zh_serial_unmatch_error'].format(
                        field, idx + 1, idx + 1))

        log_server.logging(
            '>>>Finished serial sub question checking!!! Total time is {}'
            .format(time.time() - start))
Beispiel #26
0
    def enSymbolCheck(self):
        """
        Detect mixed full-width / half-width English characters.

        The description and the solutions are passed to
        ``self.process.enSymbolProcess``; when it flags anything, a
        'zh_enSymbol_unmatch_error' carrying the reported sources,
        replacements and positions is recorded for that field.

        :return: True when no field produced an error, False otherwise.
        """
        log_server.logging(">>> Checking english symbol in sentence!!! <<<")
        start = time.time()
        clean = True

        for sentence, field in [(self.description, "description"),
                                (self.solution, "solutions")]:
            if not all(sentence):
                continue

            if isinstance(sentence, list):
                # Dict items contribute their 'stem'; evaluated lazily so
                # items are processed strictly one after another.
                jobs = ((idx, part['stem'] if isinstance(part, dict) else part)
                        for idx, part in enumerate(sentence))
            else:
                # A non-list field is processed as a whole with index -1.
                jobs = [(-1, sentence)]

            record, replaces, positions = [], [], []
            for idx, text in jobs:
                flagged, found, fixes, spans = self.process.enSymbolProcess(
                    idx, text)
                if flagged:
                    record += found
                    replaces += fixes
                    positions += spans

            if record and replaces and positions:
                clean = False
                self.adderrorList(
                    'zh_enSymbol_unmatch_error',
                    sent=field,
                    replace=1,
                    description=zh_check_config['description']
                    ['zh_enSymbol_unmatch_error'].format(field),
                    source=record,
                    target=replaces,
                    pos=positions)

        log_server.logging(
            ">>> Finished English symbol checking!!! Total time is {}<<<".
            format(time.time() - start))
        return clean
Beispiel #27
0
 def checker(self, grade, subject, query):
     """
     Entry point of the English checker.

     In mode 'all functions' the symbol, spell, match and grammar checks
     all run; a failing check is logged and contributes an empty dict, and
     the results are merged with ``merge_dicts``. In a single-check mode
     only that check runs and its exceptions propagate. Any other mode
     yields an empty dict.

     :param grade: joined with ``subject`` as ``grade + '_' + subject`` for
         the match check.
     :param subject: see ``grade``.
     :param query: the item to check, handed to ``parse_data`` and to
         every check.
     :return: the error dict produced by the selected check(s).
     """
     log_server.logging(">>> Enter en_checker !")
     grade_subject = grade + '_' + subject
     # Step 1: parse the data; only the item id is used here.
     tid, _, _, _, _, _ = parse_data(query)

     if self.mode != 'all functions':
         single_checks = (
             ('check symbol and full-width',
              lambda: self.check_symbol_format(query)),
             ('check spell', lambda: self.check_spell(query)),
             ('check match',
              lambda: self.check_match(query, grade_subject)),
             ('check grammar', lambda: self.check_grammar(query)),
         )
         for mode_name, run in single_checks:
             if self.mode == mode_name:
                 return run()
         return dict()

     def best_effort(failure_text, run):
         # A failing step is logged and replaced by an empty result.
         try:
             return run()
         except Exception as e:
             log_server.logging(failure_text.format(e))
             return dict()

     # Step 2: symbol check.
     symbol_error_dict = best_effort(
         'Symbol check exception: {}',
         lambda: self.check_symbol_format(query))
     # Step 3: spell check.
     spell_error_dict = best_effort('Spell check exception: {}',
                                    lambda: self.check_spell(query))
     # Step 4: check that answers and solutions correspond.
     unmatch_error_dict = best_effort(
         'Check match exception: {}',
         lambda: self.check_match(query, grade_subject))
     # Step 5: grammar check.
     grammar_error_dict = best_effort('Grammar check exception: {}',
                                      lambda: self.check_grammar(query))
     # Step 6: merge all error types.
     all_error_dict = merge_dicts(tid, symbol_error_dict, spell_error_dict,
                                  unmatch_error_dict, grammar_error_dict)
     log_server.logging('Finish en checker! All error dict saved.')
     return all_error_dict
Beispiel #28
0
    def __call__(self,
                 tradWordDetect=False,
                 wrongWordDetect=False,
                 keywordMatch=False,
                 contentMatch=False,
                 symbolCheck=False,
                 typeMatch=False,
                 contTypeMatch=False,
                 enSymbolDetect=False,
                 serialCheck=False):
        """
        Run the Chinese checker pipeline.

        Stages run in a fixed order and each one is best-effort: an
        exception is logged and the pipeline moves on. Between the serial
        check and the bracket check, the description and solutions are
        normalised (``delTab`` / ``extractLatex``) without such a guard.

        :param tradWordDetect: passed to ``spellDetect`` as ``tradWord``.
        :param wrongWordDetect: passed to ``spellDetect`` as ``wrongWord``.
        :param keywordMatch: passed to ``contentMatch``.
        :param contentMatch: passed to ``contentMatch``.
        :param symbolCheck: enables the symbol, space and bracket checks.
        :param typeMatch: passed to ``typeCheck``.
        :param contTypeMatch: passed to ``contTypeCheck``.
        :param enSymbolDetect: enables ``enSymbolCheck``.
        :param serialCheck: enables ``serial_check``.
        :return: tuple ``(self.position, self.errorList)``.
        """
        def guarded(failure_text, stage):
            try:
                stage()
            except Exception as err:
                log_server.logging(failure_text.format(err))

        def raw_symbol_stage():
            if symbolCheck:
                self.final_check_confusion_symbol()
                self.final_check_space_in_solutions()

        def serial_stage():
            if serialCheck:
                self.serial_check(serialCheck)

        def bracket_stage():
            if symbolCheck:
                self.matchbracket()

        def en_symbol_stage():
            if enSymbolDetect:
                self.enSymbolCheck()

        def type_stage():
            # Content matching only runs once both type checks return a
            # truthy value.
            if self.typeCheck(typeMatch) and self.contTypeCheck(
                    contTypeMatch):
                self.contentMatch(keywordMatch, contentMatch)

        def spell_stage():
            # Chinese is spell-checked only for three question types;
            # chemistry, maths and geography are never spell-checked.
            if self.subject == '语文':
                wanted = self.type in ['现代文阅读', '写作', '名著阅读']
            else:
                wanted = self.subject not in ['化学', '数学', '地理']
            if wanted:
                self.spellDetect(tradWordDetect, wrongWordDetect)

        log_server.logging(">>> Enter zh_checker ! ")
        guarded('>>>Checking special symbol function exception: {}',
                raw_symbol_stage)
        guarded('>>>Checking serial function exception: {}', serial_stage)

        # Normalise the texts; the stages below work on the processed form.
        self.description = self.extractLatex(self.delTab(self.description))
        self.solution = [self.extractLatex(solu) for solu in self.solution]

        guarded('>>>Checking symbol function exception: {}', bracket_stage)
        guarded('>>>Checking enSymbol function exception: {}',
                en_symbol_stage)
        guarded('>>>Checking type function exception: {}', type_stage)
        guarded('>>>Checking spell and trad function exception: {}',
                spell_stage)
        log_server.logging("Zh_checker finish ! >>>")

        return self.position, self.errorList