def readRegularKnowledgeList(self):
    # Load the regular knowledge-point list: one "code name" pair per line,
    # with the first line treated as a header.
    if not FilePath.fileExist(
            self.course_path_info_list[0].courseware_knowledge_txt_filepath):
        return
    f = open(
        self.course_path_info_list[0].courseware_knowledge_txt_filepath, 'r')
    ids_lines = f.readlines()
    f.close()
    index = 0
    for line in ids_lines:
        index += 1
        if index == 1:
            continue  # skip the header line
        line = line.strip('\n')
        line_k = line.split(' ')
        if len(line_k) < 2:
            continue
        line_k_code = line_k[0]
        line_k_word = line_k[1]
        line_k_word_key = line_k_word.replace(u"'", ".")
        # the confidence column is no longer read; default every entry to 100
        line_k_confidence = 100
        if line_k_word_key in self.knowledge:
            continue
        words = self.sentence.splitSentenceCanRepeat(line_k_word)
        words = self.preprocessor.enlargeVipWords(words, line_k_word)
        self.knowledge[line_k_word_key] = (words, line_k_confidence,
                                           line_k_code)
        self.knowledgeByCode[line_k_code] = (line_k_word_key,
                                             line_k_confidence, line_k_code)
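# A minimal sketch (not part of the original module) of the two lookup
# tables readRegularKnowledgeList fills; the code '1.2' and the name are
# placeholders.
def _demo_knowledge_tables():
    words = [u'金本位制']  # token list from splitSentenceCanRepeat/enlargeVipWords
    knowledge = {u'金本位制': (words, 100, '1.2')}        # keyed by name
    knowledgeByCode = {'1.2': (u'金本位制', 100, '1.2')}  # keyed by code
    return knowledge, knowledgeByCode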
def train(self):
    # If a trained model already exists, load it instead of retraining.
    if FilePath.fileExist(self.course_path_info.vector_model_bin_filepath):
        self.model_loaded = KeyedVectors.load_word2vec_format(
            self.course_path_info.vector_model_bin_filepath, binary=True)
        # Still regenerate the corpus so downstream steps can read it.
        self.generate_train_file()
        return
    # Generate the corpus.
    self.generate_train_file()
    # Load the corpus: one sentence per line, tokens separated by spaces.
    sentences = LineSentence(
        self.course_path_info.vector_corpus_txt_filepath)
    # Train a skip-gram model (default window=5). min_count=1 keeps every
    # word; raising it would drop words occurring fewer times. size is the
    # dimensionality of the word vectors (gensim default is 100).
    print 'Training model...'
    model = Word2Vec(sentences, size=500, min_count=1, iter=5000)
    model.wv.save_word2vec_format(
        self.course_path_info.vector_model_bin_filepath, binary=True)
    self.model_loaded = model
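# A minimal usage sketch (not part of the original class): after train(),
# self.model_loaded behaves like a gensim KeyedVectors object, so standard
# similarity queries apply. The path and query words are placeholders.
def _demo_query_model():
    from gensim.models import KeyedVectors
    wv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
    print wv.similarity(u'金币', u'本位')          # cosine similarity of two tokens
    print wv.most_similar(positive=[u'金本位制'])  # top-10 nearest neighbours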
def loadProcessedFile(self, filepath):
    if not FilePath.fileExist(filepath):
        return
    f_input = open(filepath, 'r')
    for fname in f_input:
        fname = fname.strip('\n')
        if len(fname) == 0:
            continue
        self.processed_file.append(fname)
def readFile(self, filepath=None):
    """
    Read the excel data.
    Reading and writing excel in Python is done with the xlrd and xlwt
    libraries (xlrd reads, xlwt writes); both are available from
    https://pypi.python.org/pypi.
    :param filepath: the full path of the excel file
    :return: the list of parsed rows (empty if the file cannot be read)
    """
    result_list = []
    # If no filepath is given, fall back to self.filepath.
    if filepath is None and self.filepath is None:
        print 'Please set the name of the file to read.'
        return result_list
    if filepath is None:
        filepath = self.filepath
    # Check that the file exists.
    if not FilePath.fileExist(filepath):
        return result_list
    # Open the workbook.
    workbook = xlrd.open_workbook(filepath)
    # Work out which sheets are in scope; workbook.sheet_names() returns
    # e.g. [u'sheet1', u'sheet2'].
    local_sheet_scope_indexes = self.getSheetScope(workbook)
    totalcount = 0
    for index in local_sheet_scope_indexes:
        sheet = workbook.sheet_by_index(index)
        rowindex = self.start_row_index
        local_sheet_columns_indexes = self.getSheetColumnScope(sheet)
        # Skip the sheet if its column range does not match expectations.
        if self.column_scope_names is not None and \
                len(local_sheet_columns_indexes) != len(self.column_scope_names):
            print 'This sheet has no data we need.'
            continue
        while rowindex < sheet.nrows:
            row = sheet.row_values(rowindex)
            rowindex = rowindex + 1
            try:
                one_row = self.addOneRow(row, local_sheet_columns_indexes)
                result_list.append(one_row)
                totalcount = totalcount + 1
                if totalcount % 100 == 0:
                    print 'Rows read: {0}'.format(totalcount)
            except Exception as e:
                print 'Bad data at row: ' + str(rowindex)
                print 'Read error: ' + str(e)
    print 'Total rows read: {0}'.format(totalcount)
    return result_list
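# A standalone sketch (not part of the original class) of the xlrd calls
# readFile relies on; 'demo.xls' is a placeholder path.
def _demo_read_excel():
    import xlrd
    workbook = xlrd.open_workbook('demo.xls')
    print workbook.sheet_names()           # e.g. [u'sheet1', u'sheet2']
    sheet = workbook.sheet_by_index(0)     # first sheet
    for rowindex in range(sheet.nrows):    # walk every row
        print sheet.row_values(rowindex)   # list of cell values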
def loadProcessedCourse(self, rootpath):
    output_mid_filepath = '{}/statistics-mid.txt'.format(rootpath)
    if not FilePath.fileExist(output_mid_filepath):
        return
    f_in = open(output_mid_filepath, 'r')
    lines = f_in.readlines()
    for course_base_code in lines:
        course_base_code = course_base_code.strip('\n')
        self.course_processed_dict[course_base_code] = course_base_code
    f_in.close()
def loadBaseCourse(self, base_course_file):
    if not FilePath.fileExist(base_course_file):
        return
    f_input = open(base_course_file, 'r')
    for line in f_input:
        line = line.strip('\n')
        cb = CourseInfomation.CourseBase()
        cb.initByString(line)
        self.course_base_list.append(cb)
        self.current_base_course = cb
def readText(self):
    self.content_rows = []
    if self.filepath is None:
        return self.content_rows
    if not FilePath.fileExist(self.filepath):
        return self.content_rows
    f_input = open(self.filepath)
    for row in f_input:
        self.content_rows.append(row)
    return self.content_rows
def loadProcessedCourse(self, rootpath):
    output_mid_filepath = '{}/statistics-mid.txt'.format(rootpath)
    if not FilePath.fileExist(output_mid_filepath):
        return
    f_in = open(output_mid_filepath, 'r')
    lines = f_in.readlines()
    for one_course_str in lines:
        course_score = CourseInfomation.CourseScore()
        course_score.initByString(one_course_str)
        key = '{}-{}'.format(course_score.school_code,
                             course_score.course_code)
        self.course_processed_dict[key] = course_score
        self.course_score_list.append(course_score)
    f_in.close()
def generate_train_file(self):
    # Build the training corpus from two sources: courseware text and exam
    # questions. (The check for an already-generated corpus file is
    # intentionally disabled so the corpus is rebuilt on every run.)
    f_out = open(self.course_path_info_list[0].vector_corpus_txt_filepath,
                 'w')
    for course_path_info in self.course_path_info_list:
        # Step 1: load the courseware.
        if course_path_info.courseware_source_txt_filepath:
            for c_line in self.sentence_reader.splitSentence(
                    course_path_info.courseware_source_txt_filepath):
                f_out.write(' '.join(c_line))
                f_out.write('\n')
        # Step 2: load the exam questions.
        if course_path_info.examquestion_source_txt_filepath and FilePath.fileExist(
                course_path_info.examquestion_source_txt_filepath):
            question = open(
                course_path_info.examquestion_source_txt_filepath, 'r')
            ids_lines = question.readlines()
            question.close()
            for line in ids_lines:
                # Each line is "knowledge::question", e.g.
                # line = "物权的分类:从设立的角度对他物权再做分类,可把其分为()。,用益物权和担保物权"
                line = line.strip('\n')
                index = line.find('::')
                if index < 0:
                    continue
                k = line[0:index]
                q = line[index + 2:]
                q_words = self.sentence_reader.splitOneSentence(q)
                q_words = self.sentence_processor.enlargeVipWords(
                    q_words, q)
                f_out.write(' '.join(q_words))
                f_out.write('\n')
    # Step 3: the extracted knowledge points are training samples too.
    if self.knowledge:
        for k_key in self.knowledge:
            k_tup = self.knowledge[k_key]
            f_out.write(' '.join(k_tup[0]))
            f_out.write('\n')
    f_out.close()
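# A minimal round-trip sketch (not part of the original code) of the corpus
# format generate_train_file produces: one sentence per line, tokens joined
# by single spaces, which is exactly what gensim's LineSentence expects.
# 'corpus.txt' and the tokens are placeholders.
def _demo_corpus_roundtrip():
    from gensim.models.word2vec import LineSentence
    f_out = open('corpus.txt', 'w')
    f_out.write(u'金币 本位'.encode('utf-8') + '\n')  # one tokenized sentence
    f_out.close()
    for tokens in LineSentence('corpus.txt'):
        print tokens  # a list of unicode tokens per line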
def subjectSimilarity(self):
    """
    Group near-duplicate exam questions by short-sentence similarity.
    :return:
    """
    if not FilePath.fileExist(
            self.course_path_info.examquestion_source_txt_filepath):
        return
    question = open(self.course_path_info.examquestion_source_txt_filepath,
                    'r')
    ids_lines = question.readlines()
    question.close()
    qindex = 0
    question_map = {}
    for line in ids_lines:
        line = line.strip('\n')
        index = line.find('::')
        if index < 0:
            continue
        k = line[0:index]
        q = line[index + 2:]
        qindex = qindex + 1
        q_words = self.sentence.splitOneSentence(q)
        if len(q_words) == 0:
            continue
        # Compare against the questions already seen; join the first
        # cluster whose representative is similar enough.
        find_same_flag = False
        for old_q in question_map.keys():
            old_q_list = question_map.get(old_q)
            old_q_words = self.sentence.splitOneSentence(old_q)
            score = self.doc_vec.pred_similarity(q_words, old_q_words)
            if score > 0.95:
                old_q_list.append(q)
                find_same_flag = True
                break
        if not find_same_flag:
            question_map[q] = []
    # Keep only the clusters that actually collected duplicates.
    self.result_map = {}
    for q_key in question_map.keys():
        q_list = question_map.get(q_key)
        if len(q_list) > 0:
            self.result_map[q_key] = q_list
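# A standalone sketch (not part of the original class) of the greedy
# clustering rule subjectSimilarity applies: a question joins the first
# existing cluster whose representative scores above the threshold,
# otherwise it starts a new cluster. pred_similarity is passed in here
# instead of coming from self.doc_vec.
def _demo_greedy_cluster(questions, pred_similarity, threshold=0.95):
    clusters = {}  # representative question -> list of near-duplicates
    for q in questions:
        for rep in clusters.keys():
            if pred_similarity(q, rep) > threshold:
                clusters[rep].append(q)
                break
        else:
            clusters[q] = []
    return clusters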
def generateKnowledgeCypher(course_path_info):
    """
    Generate the cypher statements for the knowledge graph.
    :return:
    """
    cypherlist = []
    # Check that the knowledge file exists.
    if not FilePath.fileExist(
            course_path_info.courseware_knowledge_txt_filepath):
        return cypherlist
    cypherlist.append(
        "CREATE CONSTRAINT ON (c:Knowledge) ASSERT c.code IS UNIQUE;")
    cypherlist.append(
        "CREATE CONSTRAINT ON (c:Question) ASSERT c.code IS UNIQUE;")
    cypherlist.append("create index on:Question(databaseid);")
    # Read the knowledge-point file into a dict; for now only the direct
    # parent-child relations between knowledge points are created.
    k_dict = {}
    f_k = open(course_path_info.courseware_knowledge_txt_filepath, 'r')
    for k in f_k:
        k = k.strip('\n')
        k_arr = k.split(' ')
        if len(k_arr) < 2:
            continue
        k_dict[k_arr[0]] = k_arr[1]
    f_k.close()
    # Build the parent-child relations.
    for k_code, k_name in k_dict.items():
        # Derive the parent code by dropping the last section of k_code.
        k_code_arr = k_code.split('.')
        k_code_parent = '.'.join(k_code_arr[:-1])
        # No parent node: no relation to create.
        if k_code_parent not in k_dict:
            continue
        # Parent exists: merge both nodes and the CHILD relation.
        k_name_parent = k_dict.get(k_code_parent)
        k_ns_child = ("MERGE (k_child:Knowledge {{code:'{0}'}}) "
                      "on create set k_child.name='{1}'").format(
                          k_code, k_name)
        k_ns_parent = ("MERGE (k_parent:Knowledge {{code:'{0}'}}) "
                       "on create set k_parent.name='{1}'").format(
                           k_code_parent, k_name_parent)
        k_ns_parent_child = "MERGE (k_parent)-[:CHILD]->(k_child);"
        com = k_ns_child + ' ' + k_ns_parent + ' ' + k_ns_parent_child
        cypherlist.append(com)
    return cypherlist
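# A hedged sketch (not part of the original code) of one way to execute the
# generated statements: the py2neo driver is an assumption (the original
# code does not show how cypherlist is run), and the URI and credentials
# are placeholders.
def _demo_run_cypher(cypherlist):
    from py2neo import Graph
    graph = Graph('bolt://localhost:7687', auth=('neo4j', 'password'))
    for statement in cypherlist:
        graph.run(statement)  # execute each CONSTRAINT/MERGE statement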
def generate_train_file(self):
    self.sentence_words_dict = {}
    # Build the training corpus from the undergraduate major catalog.
    # (The check for an already-generated corpus file is intentionally
    # disabled so the corpus is rebuilt on every run.)
    catalog_corpus_file = u'D:/pythonproject/open-neo4j-service/data/course-base/本科专业目录-catalog.corpus.txt'
    catalog_xls_file = u'D:/pythonproject/open-neo4j-service/data/course-base/本科专业目录-catalog.xlsx.txt'
    # Open the output file.
    f_out = open(catalog_corpus_file, 'w')
    # Step 1: load the catalog. Level-2 entries are the pivot: every entry
    # below level 2 is merged into its level-2 heading, so each level-2
    # catalog entry yields a single corpus line.
    if FilePath.fileExist(catalog_xls_file):
        fin = open(catalog_xls_file, 'r')
        level_snd_list = []
        write_line = ''
        first_code = ''
        first_name = ''
        index = 0
        for line in fin:
            arr = line.split()
            code_line = arr[0]
            name_line = arr[1]
            code_section_list = code_line.split('.')
            if index == 0:
                first_code = code_line
                first_name = name_line
            if len(code_section_list) == 1:
                # Level-1 entry: written out on its own line.
                name_line1 = self.preprocessSent(name_line)
                c_line_words = self.sentence_reader.splitOneSentence(
                    name_line1)
                c_line_words = self.postWordList(c_line_words)
                f_out.write(' '.join(c_line_words))
                f_out.write('\n')
                self.catalog_code_dict[name_line] = (code_line, name_line,
                                                     c_line_words)
            elif len(code_section_list) == 2:
                # Level-2 entry: flush the previous group, start a new one.
                if len(level_snd_list) > 0:
                    write_line = ' '.join(level_snd_list)
                    write_line1 = self.preprocessSent(write_line)
                    c_line_words = self.sentence_reader.splitOneSentence(
                        write_line1)
                    c_line_words = self.postWordList(c_line_words)
                    section_name = ' '.join(c_line_words)
                    f_out.write(section_name)
                    f_out.write('\n')
                    self.catalog_code_dict[first_name] = (
                        first_code, first_name, c_line_words)
                    # Record the level-2 entry.
                    self.snd_level_catalog.append(
                        (first_code, first_name, section_name,
                         c_line_words))
                # Reset the group for the new level-2 heading.
                level_snd_list = []
                level_snd_list.append(name_line)
                first_code = code_line
                first_name = name_line
            else:
                # Deeper entry: fold into the current level-2 group.
                level_snd_list.append(name_line)
            index += 1
        fin.close()
        # Flush the final group.
        write_line = ' '.join(level_snd_list)
        write_line1 = self.preprocessSent(write_line)
        c_line_words = self.sentence_reader.splitOneSentence(write_line1)
        c_line_words = self.postWordList(c_line_words)
        section_name = ' '.join(c_line_words)
        f_out.write(section_name)
        f_out.write('\n')
        self.catalog_code_dict[first_name] = (first_code, first_name,
                                              c_line_words)
        self.snd_level_catalog.append(
            (first_code, first_name, section_name, c_line_words))
    f_out.close()
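# A simplified sketch (not part of the original code) of the folding rule in
# generate_train_file above: a two-section code ("02.01") opens a new group
# and deeper codes ("02.01.03", ...) fold into it; level-1 lines, which the
# original writes out directly, are ignored here.
def _demo_fold_catalog(entries):
    # entries: list of (code, name) pairs in file order
    folded = {}
    current = None
    for code, name in entries:
        depth = len(code.split('.'))
        if depth == 2:
            current = code
            folded[current] = [name]
        elif depth > 2 and current is not None:
            folded[current].append(name)
    return folded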
def predication1(self):
    self.course_score = CourseInfomation.CourseScore()
    self.course_score.initCourse(self.course_path_info.course)
    # e.g. match(n)-[:NEXT]-(m) where n.name in ['典型','金本位制','指','金币','本位'] return n,m
    if not FilePath.fileExist(
            self.course_path_info.examquestion_source_txt_filepath):
        return
    question = open(self.course_path_info.examquestion_source_txt_filepath,
                    'r')
    ids_lines = question.readlines()
    question.close()
    qindex = 0
    question_knowledge_map = {}
    for line in ids_lines:
        # Each line is "knowledge::question", e.g.
        # line = "物权的分类:从设立的角度对他物权再做分类,可把其分为()。,用益物权和担保物权"
        line = line.strip('\n')
        index = line.find('::')
        if index < 0:
            continue
        k = line[0:index]
        q = line[index + 2:]
        question_knowledge_map[q] = k
        qindex = qindex + 1
        q_words = self.sentence.splitSentenceCanRepeat(q)
        # Find the key words in q and enlarge their weight.
        q_words = self.preprocessor.enlargeVipWords(q_words, q)
        if len(q_words) == 0:
            continue
        # Score the question against every knowledge point.
        index = 0
        res_list = []
        for k_key in self.knowledge.keys():
            k_tup = self.knowledge.get(k_key)
            k_words = k_tup[0]
            if len(k_words) == 0:
                continue
            score = self.doc_vec.pred_similarity(q_words, k_words)
            res = ResultInfo.ResultInfo(index, score,
                                        k_tup[2] + ' ' + k_key)
            res_list.append(res)
            index += 1
        # Sort by score, descending, keep only the top candidates, and
        # update the score statistics.
        res_list.sort(key=lambda x: x.score, reverse=True)
        self.computeScore(res_list)
        # Format the output (the parent-knowledge lookup is disabled).
        #res_list = self.getParentKnowledge(res_list)
        reslist, wordlist = self.formatOutput(res_list, k)
        if len(reslist) > 0:
            ns = 'Question {0}: '.format(qindex) + q
            self.outputcontentlist.append(ns + '\n')
            ns = 'Machine-labelled knowledge points: ' + ';'.join(wordlist)
            self.outputcontentlist.append(ns + '\n')
            ns = 'Knowledge-point scores: ' + ';'.join(reslist)
            self.outputcontentlist.append(ns + '\n')
            #print 'Teacher-labelled knowledge point: ' + k
            ns = 'Teacher-labelled knowledge point: '
            self.outputcontentlist.append(ns + '\n')
            self.outputcontentlist.append('\n')
    # Compute the accuracy over all questions.
    self.course_score.compute()
    ns = 'Total questions: {}'.format(self.course_score.score_scope_total)
    self.outputcontentlist.append(ns + '\n')
    print ns
    ns = 'Fairly reliable (60+): {}, share: {}%'.format(
        self.course_score.score_scope_more60_count,
        round(self.course_score.score_scope_more60_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print ns
    ns = 'Borderline reliable (50-60): {}, share: {}%'.format(
        self.course_score.score_scope_between5060_count,
        round(self.course_score.score_scope_between5060_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print ns
    ns = 'Not very reliable (40-50): {}, share: {}%'.format(
        self.course_score.score_scope_between4050_count,
        round(self.course_score.score_scope_between4050_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print ns
    ns = 'Unreliable (below 40): {}, share: {}%'.format(
        self.course_score.score_scope_less40_count,
        round(self.course_score.score_scope_less40_rate * 100, 2))
    self.outputcontentlist.append(ns + '\n')
    print ns