def removeMidSpace4Chinese(self, sen):
    """
    Remove the blanks inside a sentence, except between two Latin words.

    Blank characters are never copied to the result. One space is put back
    only in front of a non-Chinese letter that directly follows a run of
    blanks whose left neighbour was also a non-Chinese letter. Blanks next
    to Chinese characters, digits or punctuation therefore disappear.

    :param sen: text to clean; it is walked character by character.
    :return: the cleaned text.
    """
    kept = []
    prev_ch = None
    # True when the most recent non-blank character was a non-Chinese letter.
    prev_word_is_latin = False

    for ch in sen:
        if UnicodeConvertor.is_empty(ch):
            # Only the first blank of a run is remembered; the following
            # ones are skipped without touching any state.
            if not UnicodeConvertor.is_empty(prev_ch):
                prev_ch = ch
            continue

        is_cn = UnicodeConvertor.is_chinese(ch)
        # Chinese characters can also count as alphabetic, hence the
        # explicit exclusion.
        is_latin = ch.isalpha() and not is_cn

        # letter + blank(s) + letter: keep exactly one separating space.
        if is_latin and UnicodeConvertor.is_empty(prev_ch) and prev_word_is_latin:
            kept.append(u' ')
        kept.append(ch)

        prev_ch = ch
        prev_word_is_latin = is_latin

    return ''.join(kept)
def postProcessSentence(self, line):
    """
    Clean one candidate line once its prefix has been dealt with.

    The line is trimmed, run through the instance's section-removal
    helpers, stripped of inner blanks and unwanted numbers, and finally
    rejected (replaced by an empty string) when it is too long or when it
    starts or ends with a character that ``UnicodeConvertor.is_other``
    flags.

    :param line: text to clean.
    :return: the cleaned text, or u'' when the line is rejected or ends up
        shorter than two characters.
    """
    # Punctuation that must not remain at either end of the line.
    edge_chars = u'、:.,{}.,:{}'

    # Blanks first, then edge punctuation, then any blanks that uncovers.
    line = line.strip().strip(edge_chars).strip()

    # Remove the parts matched by the configured regular expressions.
    line = self.removeReSection(line)
    # A level prefix may still be present; only the remaining text is kept.
    _, line = self.judgeLevel(line)
    # Remove suffix words, prefix words, then "contains" words.
    line = self.removeSuffixSection(line)
    line = self.removePrefixSection(line)
    line = self.removeMiddleSection(line)
    # Handle parts introduced by a dash, colon, equals sign and the like.
    line = self.parseDashSection(line)
    # The steps above may have exposed another level prefix.
    _, line = self.judgeLevel(line)

    # Remove inner blanks, then any numbers that are still of no use.
    line = self.preprocesser.removeMidSpace4Chinese(line)
    line = self.preprocesser.removeByRegexPattern(
        [self.preprocesser.re_nouse_number], line)

    # Reject lines longer than 30 characters, and lines longer than 20
    # characters that contain a comma.
    # NOTE(review): the condition used to test the same comma literal
    # twice; one of the two may have been meant to be the full-width
    # comma - confirm against the data.
    if len(line) > 20 and (u',' in line or len(line) > 30):
        line = u''

    # Reject lines that start or end with a special symbol, unless the
    # first character is an opening book-title mark or opening quote.
    if (len(line) > 0
            and line[0] not in (u'《', u'“')
            and (UnicodeConvertor.is_other(line[0])
                 or UnicodeConvertor.is_other(line[-1]))):
        line = u''

    # A plain single quote has a special meaning in Cypher statements.
    if u"'" in line:
        line = line.replace(u"'", u"’")

    line = line.strip().strip(edge_chars).strip()
    if len(line) < 2:
        line = u''
    return line
def createGoodResult2Excel4PersonCheck(self, course_path_info_list):
    """
    Export the good question/knowledge matches to Excel for manual review.

    The sheet starts with the row returned by
    ``QuestionInformation.getGoodCorrelationData()``. Every retained
    knowledge entry of every exam question then yields one row; only the
    first row of a question carries the question number and its
    content/answer text. The file path and the course name printed at the
    end are taken from the first element of *course_path_info_list*.

    :param course_path_info_list: course path descriptors to export.
    :return: None. Nothing is written when the list is empty.
    """
    if not course_path_info_list:
        # The output path comes from element 0, so there is nothing to
        # write (indexing would raise IndexError).
        return

    # Plain text entries are knowledge names without a score. On Python 2
    # this also accepts ``unicode`` names, which have no ``score``
    # attribute and would otherwise fail in the scored branch.
    text_types = (str, type(u''))

    rows = [QuestionInformation.getGoodCorrelationData()]
    question_no = 1
    for course_path_info in course_path_info_list:
        course = course_path_info.course
        if course not in self.exam_info.examquestion_dict:
            continue
        for exam_question in self.exam_info.examquestion_dict.get(course):
            is_first_row = True
            for k in exam_question.knowledge_list:
                if isinstance(k, text_types):
                    if len(k.strip()) == 0:
                        continue
                    kname = k
                    kscore = u''
                else:
                    # Matches scoring below 0.45 are left out.
                    if k.score < 0.45:
                        continue
                    kname = k.text
                    kscore = u'{}%'.format(round(k.score * 100, 2))

                if is_first_row:
                    row = [UnicodeConvertor.numToUnicode(question_no),
                           exam_question.getOnlyContentAndAnswer()]
                else:
                    row = [u'', u'']
                # Three trailing empty cells follow the name and score.
                row.extend([kname, kscore, u'', u'', u''])
                rows.append(row)
                is_first_row = False
            # The number advances for every question, including questions
            # that produced no row.
            question_no += 1

    sheet_datas = {'sheet1': rows}
    ExcelWriter.writeExcelFile(
        course_path_info_list[0].correlation_good_xls_filepath, sheet_datas)
    print('Excel文件:{}已生成'.format(
        course_path_info_list[0].course.coursebase_name))
def isChineseSign(self, word):
    """
    Tell whether every character of *word* is a Chinese character.

    :param word: text to inspect; ``None`` is accepted.
    :return: True when *word* is None, empty, or made only of characters
        accepted by ``UnicodeConvertor.is_chinese``; False otherwise.
    """
    if word is None:
        return True
    # all() stops at the first character that fails the check.
    return all(UnicodeConvertor.is_chinese(ch) for ch in word)
def isChinese(self, ch):
    """
    Check whether *ch* falls inside the range bounded by two literals.

    :param ch: value that is first passed through
        ``UnicodeConvertor.stringToUnicode``.
    :return: True when the converted value sorts between the two bounds
        (both inclusive), otherwise False.
    """
    # NOTE(review): both bounds are written with an escaped backslash, so
    # they are six-character escape texts rather than single characters.
    # Presumably stringToUnicode returns the same escaped form - confirm.
    lower_bound = u'\\u4e00'
    upper_bound = u'\\u9fa5'
    converted = UnicodeConvertor.stringToUnicode(ch)
    return converted >= lower_bound and converted <= upper_bound
def readZhuankeCatalog(self, filepath):
    """
    Load the junior-college major catalogue from an Excel workbook.

    Only the first sheet is read. Each row is used as six cells: first-level
    name and code, second-level name and code, third-level name and code.
    ``self.category_list`` is rebuilt with one ``CatalogItem`` per
    first-level entry, per second-level entry and per row (third level).
    Item codes are dotted: the second-level part is the second-level code
    with the first-level code's length cut off its front, and the
    third-level part is cut the same way by the second-level code's length.

    :param filepath: path of the Excel file to read.
    :return: the rows returned by the Excel reader.
    """
    self.category_list = []

    reader = ExcelReader.ExcelReader()
    reader.filepath = filepath
    reader.sheet_scope_indexes = [0]
    # Column headers to read; each one is registered with -1.
    reader.column_scope_names = {}
    for header in (u'专业大类', u'专业大类代码', u'专业类名称',
                   u'专业类代码', u'专业名称', u'专业代码'):
        reader.column_scope_names[header] = -1
    rows = reader.readFile()

    def add_item(code, name):
        # Build one catalogue entry and append it to the result list.
        item = CatalogItem()
        item.initData(code, name)
        self.category_list.append(item)

    # First- and second-level values carry over to the following rows
    # until a row supplies a new name for that level.
    top_code = None
    mid_code = None
    top_code_len = 0
    mid_code_len = 0
    for row in rows:
        top_name = row[0]
        top_raw = row[1]
        mid_name = row[2]
        mid_raw = row[3]
        leaf_name = row[4]
        leaf_raw = row[5]

        if len(top_name) > 0:
            top_code = UnicodeConvertor.numToUnicode(top_raw)
            top_code_len = len(top_code)
            add_item(top_code, top_name)

        if len(mid_name) > 0:
            mid_code = UnicodeConvertor.numToUnicode(mid_raw)
            mid_code_len = len(mid_code)
            add_item(u'{}.{}'.format(top_code, mid_code[top_code_len:]),
                     mid_name)

        # Every row contributes a third-level entry.
        leaf_code = UnicodeConvertor.numToUnicode(leaf_raw)
        add_item(u'{}.{}.{}'.format(top_code,
                                    mid_code[top_code_len:],
                                    leaf_code[mid_code_len:]),
                 leaf_name)

    return rows
continue res_list.append(word) yield res_list fin.close() # fout = open('./../data/words-jieba-plit.txt', 'w') # 以写得方式打开文件 # fout.write('\n'.join(self.jiebasplitor.wordposlist)) # 将分词好的结果写入到输出文件 # fout.close() # # fout = open('./../data/words-hanlp-plit.txt', 'w') # 以写得方式打开文件 # fout.write('\n'.join(self.hanlpsplitor.wordposlist)) # 将分词好的结果写入到输出文件 # fout.close() if __name__ == "__main__": sr = SentenceReader() #sr.splitSentence('./../data/financial-course.txt') sen = u'个人自学结合在一起的教学组织形式是() 答案: 特朗普制' res = sr.splitOneSentence(sen) for word in res: word1 = word.split() for ch in word: res = UnicodeConvertor.is_chinese(ch) print res print res res = sr.splitSentenceCanRepeat(sen) print 'split over'
def getCourseName(self, course):
    """
    Reduce a raw course title to a plain course name.

    Titles that do not start with a Chinese character, or that contain one
    of the listed punctuation marks, cannot be handled automatically: they
    are recorded as keys of ``self.coursenameErrdict`` for manual review and
    '' is returned. Otherwise only the text after the last dash is kept and
    scanned from the left, collecting Chinese characters and runs of
    letters and stopping at the first digit or other symbol. A trailing
    part/ordinal marker is dropped from the result.

    :param course: raw course title.
    :return: the extracted name; '' when the title is empty or was recorded
        for manual review.
    """
    if not course:
        # Nothing to parse and nothing worth reviewing; indexing course[0]
        # below would raise IndexError.
        return ''

    # Punctuation that makes a title too irregular to parse automatically.
    review_marks = (',', ':', '、', '——', '《', '“', '【')
    if (not UnicodeConvertor.is_chinese(course[0])
            or any(mark in course for mark in review_marks)):
        if course not in self.coursenameErrdict:
            self.coursenameErrdict[course] = ''
        return ''

    # Keep only the text after the last dash of each kind.
    # NOTE(review): the third separator is identical to the first as
    # written, so it can never match again; it may have been meant to be a
    # full-width hyphen - confirm.
    for separator in ('-', '—', '-'):
        course = course.split(separator)[-1]

    name_parts = []
    latin_run = []  # letters of the Latin word currently being read
    for ch in course:
        if UnicodeConvertor.is_chinese(ch):
            # A Chinese character ends any pending Latin word.
            if latin_run:
                name_parts.append(''.join(latin_run))
                latin_run = []
            name_parts.append(ch)
        elif ch.isalpha():
            latin_run.append(ch)
        else:
            # A digit or any other symbol ends the name.
            break

    # Drop a trailing part/ordinal marker (upper, lower, one ... ten).
    trailing_markers = (u'上', u'下', u'一', u'二', u'三', u'四',
                        u'五', u'六', u'七', u'八', u'九', u'十')
    if name_parts and name_parts[-1] in trailing_markers:
        del name_parts[-1]

    # A single trailing letter is discarded; longer trailing words are kept.
    if len(latin_run) > 1:
        name_parts.append(''.join(latin_run))

    return ''.join(name_parts)
def initByRows(self, rows):
    """
    Initialise this object from a sequence of data rows.

    For each row the course-code cell is looked up through
    ``column_head_dict``, converted with ``UnicodeConvertor.numToUnicode``
    and resolved to a course via ``self.course_info``; the course and the
    row are then handed to ``__initByData``.

    :param rows: iterable of rows, each indexable by column position.
    :return: None.
    """
    for record in rows:
        raw_code = record[column_head_dict[u'课程编号']]
        code_text = UnicodeConvertor.numToUnicode(raw_code)
        matched_course = self.course_info.getCourseByCourseCode(code_text)
        self.__initByData(matched_course, record)