def getSchoolWord(code, k):
    # schoolWord {school_id1:{word1: count of word1, word2: count of word2, ...}, school_id2:{...}, ...}
    sql = "SELECT name FROM `discipline_new` where code=%s"
    # teacher -> school {teacher_id1: school1, teacher_id2: school2}
    teacherSchool = pickle.load(open(root + '/teacherSchool', 'rb'))
    result = dbs.getDics(sql, (code,))
    # name: the discipline name for this code
    name = result[0]['name']
    path = root + '/' + name + '-' + code
    # file: the tf-idf output file, one document per line
    file = open(path + "/" + code + "_fenci_tdidf.txt", 'r', encoding="utf8")
    # paper: the paper -> teacher mapping
    paper = pickle.load(open(root + '/paperTeacher', 'rb'))
    lines = file.readlines()
    schoolWord = {}
    for line in lines:
        # temp dict {id: paper_id, fenci: keywords of the paper}
        temp = eval(line)
        paper_id = temp['id']
        if paper_id in paper:
            teacher_id = paper[paper_id]["author_id"]
            school_id = teacherSchool[teacher_id]
            if school_id not in schoolWord:
                schoolWord[school_id] = {}
            words = temp['fenci'].split(' ')
            for w in words:
                if w in schoolWord[school_id]:
                    schoolWord[school_id][w] += 1
                else:
                    schoolWord[school_id][w] = 1
    pickle.dump(schoolWord, open(root + '/' + code + '/k' + str(k) + '/schoolWord', 'wb'))
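# Usage sketch: a debug helper for inspecting the schoolWord pickle that getSchoolWord
# writes. Assumption: _peekSchoolWord is not part of the original pipeline, and top_n
# is a hypothetical parameter; it only reads back the dump path used above.
def _peekSchoolWord(code, k, top_n=10):
    schoolWord = pickle.load(open(root + '/' + code + '/k' + str(k) + '/schoolWord', 'rb'))
    for school_id, counts in schoolWord.items():
        # sort this school's words by frequency and keep the top_n
        top = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
        print(school_id, top)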
def getPaperAndTecher():
    '''
    paperTeacher {paper_id:{"author_id":r["author_id"],"name":r["name"]}}
    paperTeacher file: dict keyed by paper id, value is the teacher info
    teacherPaper {author_id:[paper_id, ...]}
    teacherPaper file: dict keyed by teacher id, value is the list of that teacher's paper ids
    :return:
    '''
    # fetch paper_id, teacher_id and teacher name; a is the paper-teacher table, b is the teacher table
    print('paper-teacher and teacher-paper..')
    sql = "SELECT a.paper_id,a.teacher_id,b.`name` FROM `teacher_paper` a ,es_teacher b where a.teacher_id=b.ID"
    paper = {}
    teacher = {}
    result = dbs.getDics(sql)
    for r in result:
        paper[r["paper_id"]] = {
            "author_id": r["teacher_id"],
            "name": r["name"]
        }
        # key: teacher id, value: list of paper ids
        if r["teacher_id"] in teacher:
            teacher[r["teacher_id"]].append(r["paper_id"])
        else:
            teacher[r["teacher_id"]] = [r["paper_id"]]
    pickle.dump(paper, open(root + '/paperTeacher', 'wb'))
    pickle.dump(teacher, open(root + '/teacherPaper', 'wb'))
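# Usage sketch: look up both directions of the mapping getPaperAndTecher writes.
# Assumption: _showPaperTeacher is a debug helper added here for illustration only;
# it just reads back the paperTeacher and teacherPaper pickles.
def _showPaperTeacher(paper_id):
    paper = pickle.load(open(root + '/paperTeacher', 'rb'))
    teacher = pickle.load(open(root + '/teacherPaper', 'rb'))
    info = paper.get(paper_id)  # {'author_id': ..., 'name': ...} or None if unmatched
    if info is not None:
        print(paper_id, '->', info['name'], 'with', len(teacher[info['author_id']]), 'papers')
    else:
        print(paper_id, 'has no matched author')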
def getInstitutionName():
    '''
    Dump institution info: {institution_id: {'ID':, 'SCHOOL_ID':, 'SCHOOL_NAME':, 'NAME':, ...}, ...}
    :return:
    '''
    print('getInstitutionName..')
    # join each institution record with its rank score (total)
    sql = "SELECT a.*,b.total from es_institution a join institution_rank b on a.ID=b.institution_id"
    result = dbs.getDics(sql)
    dic = {}
    for r in result:
        dic[r['ID']] = r
    pickle.dump(dic, open(root + '/InstitutionName', 'wb'))
def getWordPaper(code, k):
    '''
    Read the topic2word file for one discipline under a given topic number, and write out
    both wordToTopic and topicToWord.
    wordToTopic {word1:{topic1:p1,topic2:p2}, word2:{...}}
        e.g. {'催化剂': {0: 0.104, 1: 0.0, 2: 0.0, 3: 0.0, ...}}
    topicToWord {topic1:{word1:p1,word2:p2}, topic2:{word1:p1,word2:p2}}
    :param code: discipline code
    :param k: number of topics
    :return:
    '''
    print('getWordPaper..')
    sql = "SELECT name FROM `discipline_new` where code=%s"
    result = dbs.getDics(sql, (code, ))
    name = result[0]['name']
    # p = data/<discipline name>-<code>/k<k>
    p = root + '/' + name + '-' + code + '/k' + str(k)
    # file contents look like:
    #   0: {'无人机': 0.046, '直升机': 0.027, '纤维素': 0.024, ...}
    #   1: {'电机': 0.061, '控制器': 0.031, '控制策略': ...}
    file = open(p + '/' + code + "_topic.txt", 'r', encoding="utf8")
    # read each topic and its keywords
    lines = file.readlines()
    # word -> topic: for every word, record its probability under each topic it appears in
    # e.g. {'催化剂': {0: 0.104, 1: 0.0, 2: 0.0, 3: 0.0, ...}}
    wordToTopic = {}
    # same structure as the input file
    topicToWord = {}
    for topic_id, line in enumerate(lines):
        # the ':' separates the topic id from the keyword dict
        index = line.find(":")
        # parse the keyword dict, e.g. {'催化剂': 0.104, '神经网络': 0.063, ...}
        words = eval(line[index + 1:])
        topicToWord[topic_id] = words
        # words is a dict, so iterating over it yields its keys
        for w in words:
            if w in wordToTopic:
                wordToTopic[w][topic_id] = words[w]
            else:
                wordToTopic[w] = {}
                wordToTopic[w][topic_id] = words[w]
    path = root + '/' + code + '/k' + str(k)
    if not os.path.exists(path):
        os.makedirs(path)
    pickle.dump(topicToWord, open(path + '/topicToWord', 'wb'))
    pickle.dump(wordToTopic, open(path + '/wordToTopic', 'wb'))
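# Minimal sketch of the inversion getWordPaper performs, run on in-memory data instead of
# the _topic.txt file. Assumption: the helper name and the sample probabilities are
# illustrative only; the keys mirror the examples in the docstring above.
def _invertTopicToWordExample():
    topicToWord = {0: {'催化剂': 0.104, '神经网络': 0.063},
                   1: {'神经网络': 0.031}}
    wordToTopic = {}
    for topic_id, words in topicToWord.items():
        for w, p in words.items():
            # record this word's probability under the current topic
            wordToTopic.setdefault(w, {})[topic_id] = p
    # wordToTopic == {'催化剂': {0: 0.104}, '神经网络': {0: 0.063, 1: 0.031}}
    return wordToTopic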
def getTeacherAndSchool():
    '''
    1. school-teacher {school_id1:[teacher_id1,teacher_id2,...], school_id2:[teacher_id3,...]}
    2. teacher-school {teacher_id1:school1, teacher_id2:school2}
    :return:
    '''
    print('getTeacherAndSchool')
    sql = "SELECT a.ID,a.SCHOOL_ID from es_teacher a"
    result = dbs.getDics(sql)
    teacher = {}
    school = {}
    for r in result:
        if r["SCHOOL_ID"] in school:
            school[r["SCHOOL_ID"]].append(r["ID"])
        else:
            school[r["SCHOOL_ID"]] = [r["ID"]]
        teacher[r["ID"]] = r["SCHOOL_ID"]
    pickle.dump(teacher, open(root + '/teacherSchool', 'wb'))
    pickle.dump(school, open(root + '/schoolTeacher', 'wb'))
def getPaperTopic(code, k):
    '''
    Attach paper ids to the lines of teacher_topic.txt.
    Format: {paper_id1:{topic1:p1,topic2:p2}, paper_id2:{...}, ...}
    e.g. {243084: {13: 0.92521083}, 242970: {25: 0.92443347, 15: 0.04058274}}
    :param code: discipline code
    :param k: number of topics
    :return:
    '''
    print('getPaperTopic')
    sql = "SELECT name FROM `discipline_new` where code=%s"
    result = dbs.getDics(sql, (code, ))
    name = result[0]['name']
    # p looks like G:\w_project\data/农业工程-0828/k36
    p = root + '/' + name + '-' + code + '/k' + str(k)
    # file: the per-paper topic distributions
    file = open(p + '/' + code + "_teacher_topic.txt", 'r', encoding="utf8")
    # paperId: the tf-idf output file, one document per line
    paperId = open(root + '/' + name + '-' + code + "/" + code + "_fenci_tdidf.txt", 'r', encoding="utf8")
    # ids holds the document ids, in file order
    ids = []
    for line in paperId.readlines():
        item = eval(line)
        ids.append(item['id'])
    # topic distribution per paper, keyed by paper id
    # e.g. {243084: {13: 0.92521083}, 242970: {25: 0.92443347, 15: 0.04058274}}
    paperToic = {}
    for paper_index, line in enumerate(file.readlines()):
        # line: [(5, 0.75776845), (7, 0.17293692), (11, 0.04333982)]
        temp = eval(line)
        # ids[paper_index] is the paper id matching this line
        paperToic[ids[paper_index]] = {}
        for t in temp:
            # t[0] is the topic id, t[1] is its probability
            paperToic[ids[paper_index]][t[0]] = t[1]
    path = root + '/' + code + '/k' + str(k)
    pickle.dump(paperToic, open(path + '/paperToic', 'wb'))
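# Usage sketch: read back the paperToic pickle and keep only each paper's dominant topic.
# Assumption: _dominantTopic is a debug helper for illustration, not part of the pipeline;
# it relies only on the {paper_id: {topic_id: probability}} structure documented above.
def _dominantTopic(code, k):
    paperToic = pickle.load(open(root + '/' + code + '/k' + str(k) + '/paperToic', 'rb'))
    # argmax over the topic->probability dict of every paper that has at least one topic
    return {paper_id: max(topics, key=topics.get)
            for paper_id, topics in paperToic.items() if topics}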
def getTeacherName():
    '''
    Dump teacher info plus rank weight into one file, keyed by the id from the teacher table:
    {id1: {teacher fields + weight}, id2: {...}, ...}
    e.g. {149104: {'id': 149104, 'name': '潘正华', 'position': None, 'title': None,
                   'school': '江南大学', 'institution': '理学院', 'theme': None, 'eduexp': None,
                   'email': None, 'pic': None,
                   'homepage': 'http://cksp.eol.cn/tutor_detail.php?id=11396',
                   'school_id': 17397, 'age': 0, 'field_id': None, 'total': 0.75}}
    :return:
    '''
    # total is the rank score column
    print('getTeacherName..')
    sql = "SELECT a.*,b.total from es_teacher a join teacher_rank b on a.ID=b.teacher_id"
    result = dbs.getDics(sql)
    dic = {}
    for r in result:
        # r is one teacher record joined with its rank score (see the example in the docstring)
        dic[r['ID']] = r
    pickle.dump(dic, open(root + '/teacherName', 'wb'))
def getTeacherWord(code, k):
    '''
    teacherWord {teacher_id1:{word1:num1,word2:num2}, teacher_id2:{word1:num1,...}, ...}
    :param code: discipline code
    :param k: number of topics
    :return:
    '''
    sql = "SELECT name FROM `discipline_new` where code=%s"
    result = dbs.getDics(sql, (code,))
    # result: [{'name': '农业工程'}]
    # name is the discipline name for this code
    name = result[0]['name']
    path = root + '/' + name + '-' + code
    # file: the tf-idf output file, one document per line
    file = open(path + "/" + code + "_fenci_tdidf.txt", 'r', encoding="utf8")
    # paper: the paper -> teacher mapping
    paper = pickle.load(open(root + '/paperTeacher', 'rb'))
    lines = file.readlines()
    teacherWord = {}
    for line in lines:
        # temp dict {id: paper_id, fenci: keywords of the paper}
        temp = eval(line)
        paper_id = temp['id']
        # Only about 70,000 of the ~300,000 papers can be matched to an author, so many papers
        # have no entry in paperTeacher. The earlier version worked on a single table where every
        # paper matched, so here we must first check whether this paper has a known author.
        if paper_id in paper:
            teacher_id = paper[paper_id]["author_id"]
            if teacher_id not in teacherWord:
                teacherWord[teacher_id] = {}
            words = temp['fenci'].split(' ')
            for w in words:
                if w in teacherWord[teacher_id]:
                    teacherWord[teacher_id][w] += 1
                else:
                    teacherWord[teacher_id][w] = 1
    pickle.dump(teacherWord, open(root + '/' + code + '/k' + str(k) + '/teacherWord', 'wb'))
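# Pipeline sketch: one plausible run order for the functions in this module. Assumption:
# _runPipeline and the sample code='0828', k=36 values are illustrative; the only ordering
# constraints taken from the code are that the global mappings (paperTeacher, teacherSchool,
# etc.) must exist before the per-discipline steps, and that getWordPaper runs first among
# them because it creates root/<code>/k<k>, the directory the later dumps write into.
def _runPipeline(code='0828', k=36):
    getPaperAndTecher()
    getTeacherAndSchool()
    getTeacherName()
    getInstitutionName()
    getWordPaper(code, k)    # also creates root/<code>/k<k>
    getPaperTopic(code, k)
    getTeacherWord(code, k)
    getSchoolWord(code, k)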