def generateCourseBaseCode(self, course):
    """Assign a base-course code and index to ``course``.

    If a base course with the same ``coursebase_name`` is already cached in
    ``self.course_base_dict``, its code and index are copied onto ``course``
    and it becomes ``self.current_base_course``.  Otherwise
    ``self.max_index`` is advanced, a new ``'open.bc.<index>'`` code is
    issued, and a new ``CourseBase`` record is cached under that name.

    :param course: object exposing ``coursebase_name``; its
        ``coursebase_code`` and ``coursebase_index`` are overwritten.
    :return: None
    """
    base_name = course.coursebase_name

    # Known base course: reuse the cached code and index.
    if base_name in self.course_base_dict:
        cached = self.course_base_dict.get(base_name)
        self.current_base_course = cached
        course.coursebase_code = cached.coursebase_code
        course.coursebase_index = cached.coursebase_index
        return

    # Candidate index: 1 when no base course has been seen yet, otherwise
    # the index of the most recently seen base course.
    previous = self.current_base_course
    candidate = 1 if previous is None else previous.coursebase_index

    # Never move backwards: bump the running maximum when the candidate
    # does not exceed it.
    if self.max_index >= candidate:
        self.max_index += 1
    else:
        self.max_index = candidate

    course.coursebase_code = 'open.bc.' + str(self.max_index)
    course.coursebase_index = self.max_index

    # Record the new base course and make it the current one.
    record = CourseInfomation.CourseBase()
    record.coursebase_code = course.coursebase_code
    record.coursebase_name = course.coursebase_name
    record.coursebase_index = course.coursebase_index
    self.course_base_dict[record.coursebase_name] = record
    self.current_base_course = record
def createCypherFile(self, course):
    """Generate the Cypher statements for ``course`` and save them to a file.

    The statements come from ``self.generateCypher`` and are written one per
    line to the ``cypher_txt_filepath`` of a ``CourseFilepath`` initialised
    from ``self.rootpath`` and ``course``.

    :param course: key into ``self.exam_info.examquestion_dict``; also
        provides ``NewCourseName`` for the progress message.
    :return: the list of Cypher statements, or None when the course has no
        entry in ``self.exam_info.examquestion_dict``.
    """
    if course not in self.exam_info.examquestion_dict:
        return
    exam_question_list = self.exam_info.examquestion_dict.get(course)

    # Link the knowledge points with the exam questions.
    cypherlist = self.generateCypher(course, exam_question_list)

    # Work out where the statements are saved.
    course_path_info = CourseInfomation.CourseFilepath()
    course_path_info.courseware_source_directory = self.rootpath
    course_path_info.initByCourse(course)
    file_name = course_path_info.cypher_txt_filepath

    # ``with`` closes the handle even when a write raises; the original
    # explicit close() was skipped on error.
    with open(file_name, 'w') as fout:
        for item in cypherlist:
            fout.write(item)
            fout.write('\n')

    print('cypher文件:{}已生成'.format(course.NewCourseName))
    return cypherlist
def associateFlow(self, course_list):
    """Run the association pipeline step by step for a batch of courses.

    Builds one ``CourseFilepath`` per entry and stores the list on
    ``self.treefactory.course_filepath_list``, then extracts the knowledge
    tree, processes the exam-question source files, and finally associates
    questions with knowledge points.

    :param course_list: iterable of indexable entries where item ``0`` is
        the course and item ``1`` is the courseware docx file name.
    :return: None
    """
    path_infos = []
    self.treefactory.course_filepath_list = path_infos
    for entry in course_list:
        path_info = CourseInfomation.CourseFilepath()
        # Point the path object at the courseware source files.
        path_info.courseware_source_directory = self.srcrootpath
        path_info.courseware_source_docx_filepath = (
            self.c_rootpath + '/' + entry[1])
        path_info.initByCourse(entry[0])
        path_infos.append(path_info)

    print('正在转换docx到txt,并抽取知识树...')
    self.treefactory.extractKnowledge()
    print('知识树抽取完成。')

    # Step 3: process the exam-question source files.
    print('开始获取试题数据...')
    self.questionSourceFileProcess(self.treefactory.course_filepath_list)
    print('试题数据获取完成。')

    # Step 4: associate knowledge points with exam questions.
    print('开始关联试题与知识点...')
    self.questionAndKnowledge(self.treefactory.course_filepath_list)
    print('完成试题与知识点的关联。')
def createCorpusFile(self, course_path_info):
    """Write exam-question corpus text file(s).

    If ``course_path_info.course`` has questions in
    ``self.exam_info.examquestion_dict``, only that course's corpus is
    written, to ``course_path_info.examquestion_source_txt_filepath``.
    Otherwise a corpus file is written for every course in the dictionary,
    each at the path derived from ``self.rootpath`` and the course.

    :param course_path_info: object exposing ``course`` and
        ``examquestion_source_txt_filepath``.
    :return: None
    """
    question_dict = self.exam_info.examquestion_dict

    def write_corpus(file_name, one_course):
        # One "content and answer" line per question.  ``with`` closes the
        # handle even if a write fails (the original leaked it on error).
        row_list = question_dict.get(one_course)
        with open(file_name, 'w') as fout:
            for item in row_list:
                fout.write(item.getContentAndAnswer())
                fout.write('\n')
        print('语料文件:{}已生成'.format(one_course.NewCourseName))

    # Single-course output, keyed by the requested course.
    requested = course_path_info.course
    if requested in question_dict:
        write_corpus(course_path_info.examquestion_source_txt_filepath,
                     requested)
        return

    # Fallback: generate the corpus for the full set of courses.
    for one_course in list(question_dict.keys()):
        path_info = CourseInfomation.CourseFilepath()
        path_info.courseware_source_directory = self.rootpath
        path_info.initByCourse(one_course)
        write_corpus(path_info.examquestion_source_txt_filepath, one_course)
def __init__(self, course_source_filename):
    """Create the course dictionary and the Excel reader.

    :param course_source_filename: forwarded unchanged to
        ``CourseInfomation.CourseDictionary``.
    """
    # Course lookup object built from the given source file.
    dictionary = CourseInfomation.CourseDictionary(course_source_filename)
    self.course_info = dictionary

    reader = ExcelReader.ExcelReader()
    self.excel_reader = reader
def statistics(self):
    """Aggregate the results in ``self.course_score_list`` and print them.

    Two things are reported:

    * the summed score-band counts over all courses, with each band's rate
      as computed by ``CourseScore.compute()``;
    * how many courses have more than half of their questions scoring 50 or
      above, and how many do not.

    :return: None
    """
    # Course-level distribution counters.
    n_coure_count_more50 = 0
    n_coure_score_less50 = 0

    # Running totals of the score bands over all courses.
    n_coure_score = CourseInfomation.CourseScore()
    for course_score in self.course_score_list:
        n_coure_score.score_scope_more60_count += course_score.score_scope_more60_count
        n_coure_score.score_scope_between5060_count += course_score.score_scope_between5060_count
        n_coure_score.score_scope_between4050_count += course_score.score_scope_between4050_count
        n_coure_score.score_scope_less40_count += course_score.score_scope_less40_count

        # A course counts as "more50" when over 50% of ITS OWN questions
        # scored 50 or above.  The original tested the truthiness of the
        # running aggregate counts, which classified nearly every course as
        # "more50".  Counts are used so no precomputed rate is needed.
        good = (course_score.score_scope_between5060_count +
                course_score.score_scope_more60_count)
        total = (good + course_score.score_scope_between4050_count +
                 course_score.score_scope_less40_count)
        if 2 * good > total:
            n_coure_count_more50 += 1
        else:
            n_coure_score_less50 += 1

    n_coure_score.compute()
    print('试题总数:{}'.format(n_coure_score.score_scope_total))
    band_rows = (
        ('比较靠谱数(60分以上):{} ,比较靠谱占比:{}%',
         n_coure_score.score_scope_more60_count,
         n_coure_score.score_scope_more60_rate),
        ('基本靠谱数(50-60分):{} ,基本靠谱占比:{}%',
         n_coure_score.score_scope_between5060_count,
         n_coure_score.score_scope_between5060_rate),
        ('不太靠谱数(40-50分):{} ,不太靠谱占比:{}%',
         n_coure_score.score_scope_between4050_count,
         n_coure_score.score_scope_between4050_rate),
        ('不靠谱数(40分以下):{} ,不靠谱占比:{}%',
         n_coure_score.score_scope_less40_count,
         n_coure_score.score_scope_less40_rate),
    )
    for template, band_count, band_rate in band_rows:
        print(template.format(band_count, round(band_rate * 100, 2)))

    # Distribution of courses across the two groups.  Skipped when there
    # are no courses, which previously raised ZeroDivisionError.
    n_coure_count_total = n_coure_score_less50 + n_coure_count_more50
    if n_coure_count_total > 0:
        n_coure_score_less_rate = float(
            n_coure_score_less50) / n_coure_count_total
        n_coure_count_more_rate = float(
            n_coure_count_more50) / n_coure_count_total
        print('50%以上的试题得分大于50分的课程数量:{} 占比:{}'.format(
            n_coure_count_more50, n_coure_count_more_rate))
        print('50%以上的试题得分小于50分的课程数量:{} 占比:{}'.format(
            n_coure_score_less50, n_coure_score_less_rate))
def loadBaseCourse(self, base_course_file):
    """Load base-course records from a text file, one record per line.

    Each line, with newline characters stripped from both ends, is parsed by
    ``CourseBase.initByString`` and appended to ``self.course_base_list``.
    The last record read becomes ``self.current_base_course``.  Nothing
    happens when the file does not exist.

    :param base_course_file: path of the base-course file.
    :return: None
    """
    if not FilePath.fileExist(base_course_file):
        return

    # ``with`` closes the file; the original never closed it.
    with open(base_course_file, 'r') as f_input:
        for line in f_input:
            cb = CourseInfomation.CourseBase()
            cb.initByString(line.strip('\n'))
            self.course_base_list.append(cb)
            self.current_base_course = cb
def __init__(self): """ initialize local variables. """ self.courseinfo = CourseInfomation.CourseDictionary() #self.courseinfo.initDictionary(u'./../data/dictionary/course.txt') self.excelreader = ExcelReader.ExcelReader() self.re_num_0 = ur'(第[一二三四五六七八九零十百千万亿0123456789]+[章节讲])' self.pattern = re.compile(self.re_num_0) self.outputfile = u'./../data/course-knowledge-tgt/'
def __init__(self, course_source_filename):
    """Create the course dictionary, Excel reader and base-course state.

    :param course_source_filename: forwarded unchanged to
        ``CourseInfomation.CourseDictionary``.
    """
    # Course lookup object built from the given source file.
    dictionary = CourseInfomation.CourseDictionary(course_source_filename)
    self.course_info = dictionary
    self.excel_reader = ExcelReader.ExcelReader()

    # Base-course bookkeeping starts out empty.
    self.course_base_dict = dict()
    self.course_base_list = list()
    self.current_base_course = None
    self.max_index = 0
def loadProcessedCourse(self, rootpath):
    """Load already-processed course scores from ``statistics-mid.txt``.

    Every line of ``<rootpath>/statistics-mid.txt`` is parsed by
    ``CourseScore.initByString``.  The result is stored in
    ``self.course_processed_dict`` under ``'<school_code>-<course_code>'``
    and appended to ``self.course_score_list``.  Nothing happens when the
    file does not exist.

    :param rootpath: directory that holds ``statistics-mid.txt``.
    :return: None
    """
    output_mid_filepath = '{}/statistics-mid.txt'.format(rootpath)
    if not FilePath.fileExist(output_mid_filepath):
        return

    # Read everything up front so the handle is closed even if parsing a
    # line raises; the original close() was skipped in that case.
    with open(output_mid_filepath, 'r') as fin:
        lines = fin.readlines()

    for one_course_str in lines:
        course_score = CourseInfomation.CourseScore()
        course_score.initByString(one_course_str)
        key = '{}-{}'.format(course_score.school_code,
                             course_score.course_code)
        self.course_processed_dict[key] = course_score
        self.course_score_list.append(course_score)
def __init__(self):
    """Reset all working state and initialise the column layout."""
    # Result and Cypher containers.
    self.result = list()
    self.cypher = None
    self.cypherlist = list()

    # Knowledge and question lookups.
    self.knowledge = dict()
    self.question = dict()

    self.course = CourseInfomation.CourseDictionary()

    # Counts rows that have no knowledge point; used when printing.
    self.no_kwg_row_count = 0

    # Initialise the columns.
    self.__initColumn()
def createExcelFile(self):
    """Write one Excel workbook per course.

    For every course in ``self.exam_info.examquestion_dict`` a sheet named
    ``'sheet1'`` is built from ``QuestionInformation.column_head_list``
    followed by one row per exam question.  Questions whose content starts
    with ``<img`` are skipped.

    :return: None
    """
    question_dict = self.exam_info.examquestion_dict
    for course in question_dict.keys():
        path_info = CourseInfomation.CourseFilepath()
        path_info.courseware_source_directory = self.rootpath
        path_info.initByCourse(course)
        xlsx_path = path_info.examquestion_source_xlsx_filepath

        # Header row first, then every question that is not image-only.
        rows = [QuestionInformation.column_head_list]
        rows.extend(
            exam_question.toList()
            for exam_question in question_dict.get(course)
            if not exam_question.content.startswith(u'<img'))

        ExcelWriter.writeExcelFile(xlsx_path, {'sheet1': rows})
        print('Excel文件:{}已生成'.format(course.NewCourseName))
def statistics(self, statistics_filepath):
    """Summarise ``self.course_score_list`` into a statistics report file.

    The report contains, in order: each course's description, the summed
    totals over all courses, the descriptions of the "bad" courses (those
    where the 50-60 and 60+ rates together do not exceed 0.5), the
    unrecognised courses and the out-of-scope courses.  The course-level
    distribution is printed to stdout when there is at least one course.

    :param statistics_filepath: path of the report file; it is overwritten.
    :return: None
    """
    # Courses that fail the "more than 50% scored 50+" test.
    bad_course_list = []
    # Course-level distribution counters.
    n_coure_count_more50 = 0
    n_coure_score_less50 = 0

    # ``with`` closes the report even when a description or write call
    # raises; the original only closed it on the success path.
    with open(statistics_filepath, 'w') as f_stat:
        # Running totals of the score bands over all courses.
        n_coure_score = CourseInfomation.CourseScore()
        for course_score in self.course_score_list:
            n_coure_score.score_scope_more60_count += course_score.score_scope_more60_count
            n_coure_score.score_scope_between5060_count += course_score.score_scope_between5060_count
            n_coure_score.score_scope_between4050_count += course_score.score_scope_between4050_count
            n_coure_score.score_scope_less40_count += course_score.score_scope_less40_count
            course_descrip = course_score.getDescription()
            f_stat.write('\n'.join(course_descrip))
            f_stat.write('\n\n')

            # More than 50% of the questions scored 50 or above -> more50.
            if (course_score.score_scope_between5060_rate +
                    course_score.score_scope_more60_rate) > 0.5:
                n_coure_count_more50 += 1
            else:
                n_coure_score_less50 += 1
                bad_course_list.append(course_score)

        f_stat.write('所有课程的汇总统计:')
        course_descrip = n_coure_score.getDescription()
        f_stat.write('\n'.join(course_descrip))
        f_stat.write('\n\n')

        # Distribution of courses across the two groups.
        n_coure_count_total = n_coure_score_less50 + n_coure_count_more50
        if n_coure_count_total > 0:
            n_coure_score_less_rate = float(
                n_coure_score_less50) / n_coure_count_total
            n_coure_count_more_rate = float(
                n_coure_count_more50) / n_coure_count_total
            print('50%以上的试题得分大于50分的课程数量:{} 占比:{}'.format(
                n_coure_count_more50, n_coure_count_more_rate))
            print('50%以上的试题得分小于50分的课程数量:{} 占比:{}'.format(
                n_coure_score_less50, n_coure_score_less_rate))

        # Save the bad-course information.
        print('bad course information.')
        for bad_course in bad_course_list:
            course_descrip = bad_course.getDescription()
            f_stat.write('\n'.join(course_descrip))
            f_stat.write('\n')

        # Save the unrecognised and the out-of-scope courses.
        f_stat.write('\n\n')
        f_stat.write('未识别的课程:')
        f_stat.write('\n'.join(self.course_unrecongnized))
        f_stat.write('\n\n')
        f_stat.write('超出范围的课程:')
        f_stat.write('\n'.join(self.course_over_scope))
answer_content = answer_content + ', ' + str(question_row[4]) if str(question_answer).__contains__('D'): answer_content = answer_content + ', ' + str(question_row[5]) if str(question_answer).__contains__('E'): answer_content = answer_content + ', ' + str(question_row[6]) answer_content = str(answer_content) if len(answer_content): answer_content = answer_content[1:] knowledge = '' if len(question_row) > 12: knowledge = str(question_row[12]) content = '{}:: {} 答案:{}'.format(knowledge, question_content, answer_content) return content if __name__ == '__main__': #read_excel() er = ExamQuestionProcessor() er.isTest = True course_path_info = CourseInfomation.CourseFilepath() course_path_info.courseware_source_directory = er.rootpath course_path_info.examquestion_source_xlsx_filepath = u'{}/q-xlsx/20181122-200plus.xlsx'.format( er.rootpath) course_info = CourseInfomation.CourseDictionary() er.setCourseInfo(course_info) er.courseExamQuestionGenerator(course_path_info) print ''
def batchProcessAssociate1(self, dirname):
    """Batch-run the automatic association for every courseware file.

    Courseware files are listed from
    ``./../../data/course-knowledge-machine/<dirname>/c-docx``.  A file is
    skipped when its school+course name is not in
    ``self.school_course_scope_dict``, when the course name contains
    ``英语``, when the matched course has an empty ``SchoolName``, or when
    the course is already in ``self.course_processed_dict``.  Afterwards the
    statistics report and the bad-question workbook are written under the
    source root.

    :param dirname: sub-directory name under the machine-knowledge data root.
    :return: None
    """
    # Root directory of this batch and the folder holding the courseware.
    srcrootpath = './../../data/course-knowledge-machine/' + dirname
    c_rootpath = srcrootpath + '/c-docx'
    self.loadProcessedCourse(srcrootpath)

    count = 0
    for f in self.get_filename_from_dir(c_rootpath):
        count += 1
        self.coursename = os.path.splitext(f)[0]

        # Skip files whose school+course is outside the configured scope.
        current_coursename = self.getCourseNameFromFileName(self.coursename)
        current_schoolname = self.getSchoolNameFromFileName(self.coursename)
        scope_key = current_schoolname + current_coursename
        if scope_key not in self.school_course_scope_dict:
            self.course_over_scope.append(f)
            continue

        # Courses whose name contains "英语" are not processed.
        if u'英语' in current_coursename:
            continue

        print('开始处理文件:{}'.format(f))
        course = self.school_course_scope_dict.get(scope_key)
        if len(course.SchoolName) == 0:
            self.course_unrecongnized.append(f)
            continue

        # Skip courses that were already processed.
        course_key = '{}-{}'.format(course.SchoolCode, course.CourseCode)
        if course_key in self.course_processed_dict:
            print('第{0}篇 课程:{1} 已处理过;'.format(count, f))
            continue

        # Point the path object at the courseware source files.
        path_info = CourseInfomation.CourseFilepath()
        path_info.courseware_source_directory = srcrootpath
        path_info.courseware_source_docx_filepath = c_rootpath + '/' + f
        path_info.initByCourse(course)

        self.associateFlow(path_info)
        print('第{0}篇 课程:{1} 处理完成;'.format(count, f))

        # In test mode stop once more than four files have been seen.
        if self.isTest and count > 4:
            break

    print('所有课程处理完毕,共处理:{0}篇'.format(count))

    # Write the statistics report.
    print('开始统计结果。')
    statistics_filepath = '{}/statistics.txt'.format(srcrootpath)
    self.statistics(statistics_filepath)
    print('统计结果结束。')

    # Save the poorly associated questions.
    combine_bad_filepath = '{}/combine_bad.xls'.format(srcrootpath)
    self.combineBadExamquestion(combine_bad_filepath)
continue if word.startswith(u'概'): continue result_list.append(word) return result_list def outfile(self, filepath): fout = open(filepath, 'w') fout.writelines(self.snd_level_catalog) fout.close() if __name__ == "__main__": course_path_info = CourseInfomation.CourseFilepath() course_path_info.courseware_source_txt_filepath = u'./../../data/course-base/本科专业目录-catalog.xlsx.txt' course_path_info.vector_corpus_txt_filepath = u'./../../data/course-base/本科专业目录-catalog.corpus.txt' course_path_info.vector_model_bin_filepath = u'./../../data/course-base/本科专业目录-catalog.model.bin' course_path_info.correlation_txt_filepath = u'./../../data/course-base/本科专业目录-course-catalog.txt' sr = TextVector(course_path_info) #sr.course_path_info = course_path_info sr.readCourseNameList() sr.train() sr.predication() sr.output_dict() filepath = u'./../../data/course-base/本科专业目录-course-catalog-tag.txt' sr.outfile(filepath)
def predication(self):
    """Score every exam question against every knowledge point.

    For each course in ``self.course_path_info_list`` and each of its exam
    questions, the question words are compared with each knowledge entry
    via ``self.doc_vec.pred_similarity``.  The ranked results go to
    ``self.computeScore``; questions flagged by
    ``self.badExamquestionStatistics`` are stored with their top three
    results in ``self.bad_examquestion_list``.  Formatted output is appended
    to ``self.outputcontentlist``, and the final score summary is both
    appended there and printed.

    Returns early, without the summary, when ``self.examquestion_info`` is
    None or when a course has no entry in its ``examquestion_dict``.

    :return: None
    """
    self.bad_examquestion_list = []
    self.course_score = CourseInfomation.CourseScore()
    self.course_score.initCourse(self.course_path_info_list[0].course)

    if self.examquestion_info is None:
        return

    emit = self.outputcontentlist.append
    qindex = 0
    question_knowledge_map = {}
    for course_path_info in self.course_path_info_list:
        current_course = course_path_info.course
        if current_course not in self.examquestion_info.examquestion_dict:
            return
        questions = self.examquestion_info.examquestion_dict.get(
            current_course)

        for exam_question in questions:
            k = exam_question.knowledge_list
            q = exam_question.getContentAndAnswer()
            question_knowledge_map[q] = k
            qindex += 1

            # Split the question into words, then amplify the key words.
            q_words = self.sentence.splitSentenceCanRepeat(q)
            q_words = self.preprocessor.enlargeVipWords(q_words, q)
            if len(q_words) == 0:
                continue

            # Compare against every knowledge entry that has words.
            res_list = []
            rank = 0
            for k_key, k_tup in self.knowledge.items():
                k_words = k_tup[0]
                if len(k_words) == 0:
                    continue
                score = self.doc_vec.pred_similarity(q_words, k_words)
                res_list.append(
                    ResultInfo.ResultInfo(rank, score, k_tup[2], k_key))
                rank += 1

            # Highest score first.
            res_list.sort(key=lambda x: x.score, reverse=True)

            # Accumulate the score statistics.
            self.computeScore(res_list)

            # Keep flagged questions together with their top three results.
            if self.badExamquestionStatistics(res_list) == True:
                self.bad_examquestion_list.append(
                    (exam_question, res_list[0:3]))

            # Formatted output for this question.
            reslist, wordlist = self.formatOutput(res_list, k)
            if len(reslist) > 0:
                emit('问题{0}:'.format(qindex) + q + '\n')
                emit('电脑标识知识点:' + ';'.join(wordlist) + '\n')
                emit('知识点评估指标:' + ';'.join(reslist) + '\n')
                emit('老师标识知识点:' + '\n')
                emit('\n')

    # Final score summary.
    self.course_score.compute()
    summary = self.course_score
    ns = '试题总数:{}'.format(summary.score_scope_total)
    emit(ns + '\n')
    print(ns)

    band_rows = (
        ('比较靠谱数(60分以上):{} ,比较靠谱占比:{}%',
         'score_scope_more60_count', 'score_scope_more60_rate'),
        ('基本靠谱数(50-60分):{} ,基本靠谱占比:{}%',
         'score_scope_between5060_count', 'score_scope_between5060_rate'),
        ('不太靠谱数(40-50分):{} ,不太靠谱占比:{}%',
         'score_scope_between4050_count', 'score_scope_between4050_rate'),
        ('不靠谱数(40分以下):{} ,不靠谱占比:{}%',
         'score_scope_less40_count', 'score_scope_less40_rate'),
    )
    for template, count_attr, rate_attr in band_rows:
        ns = template.format(
            getattr(summary, count_attr),
            round(getattr(summary, rate_attr) * 100, 2))
        emit(ns + '\n')
        print(ns)
def predication(self):
    """Score every question in the subject file against the knowledge points.

    Each line of ``self.input_subject_file`` has the form
    ``<knowledge>:<question>``.  ``self.doc_vec.train_input_subject_file``
    is used as the file when ``self.input_subject_file`` is None.  The
    question words are compared with each knowledge entry via
    ``self.doc_vec.pred_similarity``, and the ranked results go to
    ``self.computeScore``.  Formatted output is appended to
    ``self.outputcontentlist``, and the final score summary is both appended
    there and printed.

    :return: None
    :raises ValueError: if a line contains no ``':'`` separator.
    """
    if self.input_subject_file is None:
        self.input_subject_file = self.doc_vec.train_input_subject_file

    # ``with`` closes the file; the original opened it and never closed it.
    with open(self.input_subject_file, 'r') as question:
        ids_lines = question.readlines()

    qindex = 0
    self.course_score = CourseInfomation.CourseScore()
    for line in ids_lines:
        line = line.strip('\n')
        # Text before the first ':' is the teacher-labelled knowledge
        # point; the rest is the question.
        sep = line.index(':')
        k = line[0:sep]
        q = line[sep + 1:]
        qindex = qindex + 1

        # Split the question into words, then amplify the key words.
        q_words = self.sentence.splitSentenceCanRepeat(q)
        q_words = self.preprocessor.enlargeVipWords(q_words, q)
        if len(q_words) == 0:
            continue

        # Compare against every knowledge entry that has words.
        index = 0
        res_list = []
        for k_key in self.knowledge.keys():
            k_tup = self.knowledge.get(k_key)
            k_words = k_tup[0]
            if len(k_words) == 0:
                continue
            score = self.doc_vec.pred_similarity(q_words, k_words)
            res = ResultInfo.ResultInfo(index, score,
                                        k_tup[2] + ' ' + k_key)
            res_list.append(res)
            index += 1

        # Highest score first (``cmp=None`` was the default and is dropped).
        res_list.sort(key=lambda x: x.score, reverse=True)

        # Accumulate the score statistics.
        self.computeScore(res_list)

        # Formatted output for this question.
        reslist, wordlist = self.formatOutput(res_list)
        if len(reslist) > 0:
            ns = '问题{0}:'.format(qindex) + q
            self.outputcontentlist.append(ns + '\n')
            ns = '电脑标识知识点:' + ';'.join(wordlist)
            self.outputcontentlist.append(ns + '\n')
            ns = '知识点评估指标:' + ';'.join(reslist)
            self.outputcontentlist.append(ns + '\n')
            ns = '老师标识知识点:' + k
            self.outputcontentlist.append(ns + '\n')
            self.outputcontentlist.append('\n')

    # Final score summary.
    self.course_score.compute()
    ns = '试题总数:{}'.format(self.course_score.score_scope_total)
    self.outputcontentlist.append(ns + '\n')
    print(ns)
    ns = '比较靠谱数(60分以上):{} ,比较靠谱占比:{}%'.format(
        self.course_score.score_scope_more60_count,
        round(self.course_score.score_scope_more60_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print(ns)
    ns = '基本靠谱数(50-60分):{} ,基本靠谱占比:{}%'.format(
        self.course_score.score_scope_between5060_count,
        round(self.course_score.score_scope_between5060_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print(ns)
    ns = '不太靠谱数(40-50分):{} ,不太靠谱占比:{}%'.format(
        self.course_score.score_scope_between4050_count,
        round(self.course_score.score_scope_between4050_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print(ns)
    ns = '不靠谱数(40分以下):{} ,不靠谱占比:{}%'.format(
        self.course_score.score_scope_less40_count,
        round(self.course_score.score_scope_less40_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print(ns)
k_list = [] f = open(self.ngram.outputfile) for k in f: k_list.append(k) return k_list if __name__ == "__main__": pusher = TreeFactory() sen = u'“诗史”' sen = pusher.postProcessSentence(sen) sen = u'协议出让33.4%表 2.招标出让22%图3.折扣(discount为' pattern = re.compile(pusher.re_nouse_tag_percent) res = pattern.findall(sen) pusher.outputfile = u'./../data/course-knowledge-tgt/抽取模板.txt' pusher.inputfile = u'D:/奥鹏/学生服务中心标注/文科课程电子辅导资料-docx/抽取模板.docx' course_filepath = CourseInfomation.CourseFilepath() course_filepath.sourse_filetype = course_filepath.type_text course_filepath.courseware_source_txt_filepath = u'./../../data/other/中级财务会计.txt' course_filepath.courseware_knowledge_txt_filepath = u'./../../data/other/中级财务会计-kwg.txt' course = CourseInfomation.Course() course.CourseCode = '1000' course_filepath.course = course pusher.course_filepath = course_filepath pusher.extractKnowledge() print 'over.'