def removeMidSpace4Chinese(self, sen):
    """
    Remove whitespace between Chinese characters while keeping a single
    space between adjacent English words.

    Every character for which ``UnicodeConvertor.is_empty`` is true is
    dropped. One ``u' '`` is re-inserted only when a run of such characters
    sits directly between two alphabetic, non-Chinese characters.

    NOTE(review): the original comment also mentioned digits, but the check
    uses ``isalpha()``, so a space next to a digit is always removed —
    confirm this is intended.

    :param sen: text to clean; iterated character by character
    :return: the cleaned text
    """
    res = []
    # True when the last character written to ``res`` is a non-Chinese letter.
    last_kept_is_latin = False
    # True when whitespace has been skipped since the last written character.
    pending_space = False

    for ch in sen:
        if UnicodeConvertor.is_empty(ch):
            # Whitespace is never copied directly; only remember that it
            # occurred so a whole run collapses into at most one space.
            pending_space = True
            continue

        cur_is_latin = ch.isalpha() and not UnicodeConvertor.is_chinese(ch)
        if pending_space and last_kept_is_latin and cur_is_latin:
            res.append(u' ')
        res.append(ch)

        last_kept_is_latin = cur_is_latin
        pending_space = False

    return ''.join(res)
Exemple #2
0
    def postProcessSentence(self, line):
        """
        Clean up a line of text whose prefix has already been removed.

        The line is trimmed, passed through the section-removal helpers of
        this object, stripped of whitespace between Chinese characters and of
        matches of ``self.preprocesser.re_nouse_number``, and finally rejected
        (turned into ``u''``) when it is too long, starts or ends with a
        character ``UnicodeConvertor.is_other`` flags, or is shorter than two
        characters.

        :param line: text to clean
        :return: the cleaned text, or ``u''`` when the line is rejected
        """
        # Punctuation stripped from both ends, before and after processing.
        edge_chars = u'、:.,{}.,:{}'
        line = line.strip().strip(edge_chars).strip()

        # Remove the parts matched by the regular expressions.
        line = self.removeReSection(line)

        # A level prefix may still be present; only the remaining text is kept.
        _, line = self.judgeLevel(line)
        # Remove suffix words, prefix words, then contained words.
        line = self.removeSuffixSection(line)
        line = self.removePrefixSection(line)
        line = self.removeMiddleSection(line)
        # Handle parts containing dashes, colons, equals signs and the like.
        line = self.parseDashSection(line)

        # The steps above may have exposed another level prefix.
        _, line = self.judgeLevel(line)

        # Remove whitespace in the middle of the text.
        line = self.preprocesser.removeMidSpace4Chinese(line)

        # Remove any numbers that are still left.
        line = self.preprocesser.removeByRegexPattern(
            [self.preprocesser.re_nouse_number], line)

        # Reject lines longer than 30 characters, and lines longer than 20
        # characters that contain a comma.
        # NOTE(review): the original tested the same comma literal twice;
        # presumably one of them was meant to be the full-width comma —
        # confirm against the upstream file.
        if len(line) > 20 and (u',' in line or len(line) > 30):
            line = u''

        # Reject lines that start or end with a special symbol, unless the
        # line opens with a book-title mark or an opening quotation mark.
        if line and line[0] not in (u'《', u'“') and (
                UnicodeConvertor.is_other(line[0])
                or UnicodeConvertor.is_other(line[-1])):
            line = u''

        # A single quote has a special meaning in Cypher statements.
        if u"'" in line:
            line = line.replace(u"'", u"’")

        line = line.strip().strip(edge_chars).strip()

        # Anything shorter than two characters is not worth keeping.
        return line if len(line) >= 2 else u''
    def createGoodResult2Excel4PersonCheck(self, course_path_info_list):
        """
        Write the good knowledge-point matches of every exam question to an
        Excel file so that a person can review them.

        The first row comes from
        ``QuestionInformation.getGoodCorrelationData()``. Each following row
        holds one knowledge point: the question number and question text
        (only on the first row of a question), the knowledge-point name, its
        score as a percentage, and three empty cells. Knowledge points given
        as plain text have no score; scored ones below 0.45 are skipped.

        The output path and the course name in the final message are taken
        from the first element of ``course_path_info_list``.

        :param course_path_info_list: course path descriptors to export;
            nothing is written when it is empty
        :return: None
        """
        if not course_path_info_list:
            # No course to export and no first element to take the path from.
            return

        # Covers both ``str`` and ``unicode`` on Python 2.
        text_types = (str, type(u''))

        # Associate the knowledge points with the exam questions.
        column_data_list = [QuestionInformation.getGoodCorrelationData()]
        exam_index = 1
        for course_path_info in course_path_info_list:
            course = course_path_info.course
            if course not in self.exam_info.examquestion_dict:
                continue

            for exam_question in self.exam_info.examquestion_dict.get(course):
                first_row = True
                for k in exam_question.knowledge_list:
                    if isinstance(k, text_types):
                        if len(k.strip()) == 0:
                            continue
                        kname = k
                        # Plain-text knowledge points carry no score. The
                        # original tried to round this empty string and
                        # raised a TypeError.
                        kscore = u''
                    else:
                        if k.score < 0.45:
                            continue
                        kname = k.text
                        kscore = u'{}%'.format(round(k.score * 100, 2))

                    if first_row:
                        # Only the first row of a question shows its number
                        # and text.
                        row = [UnicodeConvertor.numToUnicode(exam_index),
                               exam_question.getOnlyContentAndAnswer()]
                    else:
                        row = [u'', u'']
                    row.extend([kname, kscore, u'', u'', u''])

                    column_data_list.append(row)
                    first_row = False

                # The question number advances even when no row was written.
                exam_index += 1

        sheet_datas = {'sheet1': column_data_list}
        ExcelWriter.writeExcelFile(
            course_path_info_list[0].correlation_good_xls_filepath,
            sheet_datas)
        print('Excel文件:{}已生成'.format(
            course_path_info_list[0].course.coursebase_name))
Exemple #4
0
    def isChineseSign(self, word):
        """
        Tell whether every character of ``word`` passes
        ``UnicodeConvertor.is_chinese``.

        ``None`` and empty input yield True because no character fails the
        check. Checking stops at the first character that fails.

        :param word: text to check, or None
        :return: True when no character fails the check, otherwise False
        """
        if word is None:
            return True
        return all(UnicodeConvertor.is_chinese(ch) for ch in word)
Exemple #5
0
 def isChinese(self, ch):
     """
     Tell whether ``ch`` falls between the two Chinese range bounds.

     The value returned by ``UnicodeConvertor.stringToUnicode`` is compared
     as a string against the literal texts ``\\u4e00`` and ``\\u9fa5``
     (a backslash followed by ``u4e00`` / ``u9fa5``).

     NOTE(review): this only works if ``stringToUnicode`` returns the
     escaped text form of the character; if it returns the character
     itself the bounds should be ``u'\u4e00'`` and ``u'\u9fa5'`` — confirm.

     :param ch: the character to check
     :return: True when the converted value lies within the bounds
     """
     escaped = UnicodeConvertor.stringToUnicode(ch)
     return u'\\u4e00' <= escaped <= u'\\u9fa5'
Exemple #6
0
    def readZhuankeCatalog(self, filepath):
        """
        Read the junior-college (zhuanke) major catalog from an Excel file
        and rebuild ``self.category_list`` from it.

        Each row is read as six cells: first-level name and code,
        second-level name and code, third-level name and code. A row with a
        non-empty first-level (or second-level) name starts a new first-level
        (or second-level) entry. Every row adds a third-level entry. Codes
        are dotted: the second-level part is the second-level code without
        the leading characters covered by the first-level code, and the
        third-level part is the third-level code without the leading
        characters covered by the second-level code.

        :param filepath: path of the Excel file to read
        :return: the rows returned by the Excel reader
        """
        self.category_list = []

        reader = ExcelReader.ExcelReader()
        reader.filepath = filepath
        reader.sheet_scope_indexes = [0]
        # Headers of the columns to read, each initialised to -1
        # (presumably "column index not resolved yet" — verify in ExcelReader).
        reader.column_scope_names = {
            u'专业大类': -1,
            u'专业大类代码': -1,
            u'专业类名称': -1,
            u'专业类代码': -1,
            u'专业名称': -1,
            u'专业代码': -1,
        }

        rows = reader.readFile()

        def register(code, name):
            # Append one catalog entry to the category list.
            item = CatalogItem()
            item.initData(code, name)
            self.category_list.append(item)

        # The most recent first- and second-level codes carry over to the
        # following rows until a row supplies new ones.
        top_code = None
        top_width = 0
        mid_code = None
        mid_width = 0
        for row in rows:
            top_name, top_raw = row[0], row[1]
            mid_name, mid_raw = row[2], row[3]
            leaf_name, leaf_raw = row[4], row[5]

            if len(top_name) > 0:
                top_code = UnicodeConvertor.numToUnicode(top_raw)
                top_width = len(top_code)
                register(top_code, top_name)

            if len(mid_name) > 0:
                mid_code = UnicodeConvertor.numToUnicode(mid_raw)
                mid_width = len(mid_code)
                register(
                    u'{}.{}'.format(top_code, mid_code[top_width:]),
                    mid_name)

            leaf_code = UnicodeConvertor.numToUnicode(leaf_raw)
            register(
                u'{}.{}.{}'.format(top_code, mid_code[top_width:],
                                   leaf_code[mid_width:]),
                leaf_name)

        return rows
Exemple #7
0
                    continue

                res_list.append(word)

            yield res_list

        fin.close()
        # fout = open('./../data/words-jieba-plit.txt', 'w')  # 以写得方式打开文件
        # fout.write('\n'.join(self.jiebasplitor.wordposlist))  # 将分词好的结果写入到输出文件
        # fout.close()
        #
        # fout = open('./../data/words-hanlp-plit.txt', 'w')  # 以写得方式打开文件
        # fout.write('\n'.join(self.hanlpsplitor.wordposlist))  # 将分词好的结果写入到输出文件
        # fout.close()



if __name__ == "__main__":
    # Manual smoke test: split one sample sentence and print, for every
    # character of every resulting word, whether it is Chinese.
    sr = SentenceReader()
    # To process a whole file instead:
    # sr.splitSentence('./../data/financial-course.txt')
    sen = u'个人自学结合在一起的教学组织形式是() 答案: 特朗普制'
    words = sr.splitOneSentence(sen)

    # Holds the value printed after the loop: the last per-character result,
    # or the split result itself when no character was visited.
    last_result = words
    for word in words:
        for ch in word:
            last_result = UnicodeConvertor.is_chinese(ch)
            print(last_result)

    print(last_result)
    sr.splitSentenceCanRepeat(sen)
    print('split over')
    def getCourseName(self, course):
        """
        Extract a normalized course name from a raw course title.

        Titles that do not start with a Chinese character, or that contain
        one of the listed punctuation marks, cannot be handled
        automatically: they are recorded in ``self.coursenameErrdict`` and
        ``''`` is returned. Otherwise only the part after the last dash is
        kept, and characters are collected until the first digit or symbol.
        One trailing volume marker (``上``/``下``/``一``..``十``) is dropped.

        :param course: raw course title
        :return: the normalized course name, or ``''`` when the title is
            empty or needs manual handling
        """
        if not course:
            # Nothing to parse; also avoids an IndexError on course[0].
            return ''

        # Titles not starting with a Chinese character break the naming
        # rule and need manual handling.
        if not UnicodeConvertor.is_chinese(course[0]):
            self.coursenameErrdict.setdefault(course, '')
            return ''

        # Titles containing commas, colons, quotation marks or similar
        # punctuation need manual handling as well.
        manual_markers = (',', ':', '、', '——', '《', '“', '【')
        if any(marker in course for marker in manual_markers):
            self.coursenameErrdict.setdefault(course, '')
            return ''

        # When the title contains a dash, keep only the last segment.
        # NOTE(review): the original split on '-' a second time after '—',
        # which can never match again; presumably one of the two was meant
        # to be the full-width hyphen — confirm against the upstream file.
        for separator in ('-', '—'):
            course = course.split(separator)[-1]

        course_chlist = []
        engword = []
        for ch in course:
            if UnicodeConvertor.is_chinese(ch):
                # A Chinese character ends any pending English word.
                if engword:
                    course_chlist.append(''.join(engword))
                    engword = []
                course_chlist.append(ch)
            elif ch.isalpha():
                engword.append(ch)
            else:
                # A digit or symbol ends the course name.
                break

        # A trailing volume marker (first/second half, one..ten) is dropped.
        volume_markers = (u'上', u'下', u'一', u'二', u'三', u'四',
                          u'五', u'六', u'七', u'八', u'九', u'十')
        if course_chlist and course_chlist[-1] in volume_markers:
            del course_chlist[-1]

        # A trailing English word is kept only when longer than one letter.
        if len(engword) > 1:
            course_chlist.append(''.join(engword))

        return ''.join(course_chlist)
 def initByRows(self, rows):
     """
     Initialise this object from a sequence of rows.

     For each row the cell under the ``课程编号`` column (located through
     ``column_head_dict``) is converted with
     ``UnicodeConvertor.numToUnicode``, the matching course is looked up in
     ``self.course_info``, and both are handed to ``__initByData``.

     :param rows: iterable of rows indexable by column position
     :return: None
     """
     for record in rows:
         raw_code = record[column_head_dict[u'课程编号']]
         code_text = UnicodeConvertor.numToUnicode(raw_code)
         matched_course = self.course_info.getCourseByCourseCode(code_text)
         self.__initByData(matched_course, record)