def keywords_save():
    """Dump every non-empty ``keyword`` from paper_clean1 into keywords.txt.

    Pages through the table in 1,000,000-row batches (up to 90M rows),
    writing one keyword per line.  Timestamps are printed before and after
    each batch so long-running exports can be monitored.
    """
    # Context manager guarantees the file is closed even if a DB call
    # raises mid-export (the original leaked the handle on any exception
    # before the final close()).
    with open('keywords.txt', encoding='utf-8', mode='w') as keywords:
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        for offset in range(0, 90000000, 1000000):
            print(offset)
            sql = ('select keyword from paper_clean1 limit '
                   + str(offset) + ',1000000')
            paper_list = dbs.getDics(sql)
            if not paper_list:  # ran out of rows — stop paging
                break
            print(time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time())))
            for paper in paper_list:
                # Skip NULL / empty keyword columns.
                if paper['keyword']:
                    keywords.write(paper['keyword'] + '\n')
# Load the stop-word list: one word per line, surrounding whitespace stripped.
# NOTE(review): the file handle returned by open() is never closed here.
stopwords = [
    line.strip()
    for line in open('stopwords.txt', encoding='utf-8').readlines()
]
# POS tags to keep — jieba/pseg noun-like tags (verbal noun, common noun,
# person/place/org/other proper nouns, noun morphemes).
fill = [
    'vn', 'n', 'nr', 'nr1', 'nr2', 'nrj', 'nrf', 'ns', 'nsf', 'nt', 'nz',
    'nl', 'ng'
]
print('词典更新')
# Extend jieba's dictionary with domain terms before segmenting.
jieba.load_userdict('userdict.txt')
# Output file for the segmented documents.
# NOTE(review): f is neither written nor closed within this chunk — the
# script presumably continues past the visible portion; confirm.
f = open('data/paperfenci.txt', 'w', encoding='utf8')
# Page through paper_clean1 in 500,000-row batches (up to 10M rows).
for i in range(0, 10000000, 500000):
    sql = 'select id,name,abstract,keyword from paper_clean1 limit ' + str(
        i) + ',500000'
    paper_list = dbs.getDics(sql)
    if len(paper_list) == 0:
        break
    print('分词:' + str(i))
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    DocWord = []  # one word-list per paper in this batch
    for paper in paper_list:
        # Concatenate title + abstract + keywords into a single text to cut.
        line = paper['name'].strip('\n').strip(
            '\t') + ' ' + paper['abstract'].strip('\n').strip(
                '\t') + ' ' + paper["keyword"].strip('\n').strip('\t')
        seg_list = pseg.cut(line)
        words = []
        # Keep only noun-like, non-stop-word tokens.
        for word, flag in seg_list:
            if flag in fill and word not in stopwords:
                words.append(word)
        DocWord.append(words)
from algorithm.base import dbs

# Map each journal `_id` to its top-level discipline (the text before the
# first '-' in discipline_subject).
sql = "SELECT _id,discipline_subject FROM `journal`"
temp = dbs.getDics(sql)
dic = {}
for t in temp:
    sub = t['discipline_subject']
    list = sub.split('-')  # NOTE(review): shadows the builtin `list`
    # NOTE(review): this tests whether the *subject string* is a key of dic,
    # but dic is keyed by `_id` — looks like it should be
    # `if t["_id"] not in dic:`; confirm intended behavior before changing.
    if list[0] not in dic:
        dic[t["_id"]] = list[0]

# For every paper with a known journal (org_id != -1): insert an LdaPrefix
# row carrying the journal's discipline prefix, and count papers per
# (author, journal) pair.
sql = "SELECT id,org_id,author_id FROM paper_clean1"
paper = dbs.getDics(sql)
teacher = {}  # author_id -> {org_id: paper count}
i = 0  # NOTE(review): never used below
for p in paper:
    if p['org_id'] != -1:
        item = {}
        item["father_id"] = p["id"]
        item["type"] = "1"
        # NOTE(review): raises KeyError if this org_id was skipped by the
        # suspect guard above — TODO confirm every org_id lands in dic.
        item["prefix"] = dic[p['org_id']]
        records = {"table": "LdaPrefix", "params": item}
        dbs.insertItem(records)
        # Increment the per-author, per-journal paper counter.
        if p["author_id"] in teacher:
            if p['org_id'] in teacher[p["author_id"]]:
                teacher[p["author_id"]][p['org_id']] += 1
            else:
                teacher[p["author_id"]][p['org_id']] = 1
        else:
            teacher[p["author_id"]] = {p['org_id']: 1}
    # NOTE(review): this chunk begins mid-function — the `]` closes a list
    # literal (presumably `field = [...]`) declared before the visible text,
    # and `return list` below belongs to that unseen `def`.
    ]
    # Papers that already have `fields` pass through unchanged; the rest get
    # a random sample of 1–5 entries drawn from `field`.
    for t in result:
        if t['fields'] is not None:
            list.append(t)  # NOTE(review): `list` shadows the builtin
            continue
        f = []
        for j in range(random.randint(0, 4) + 1):
            # NOTE(review): randint(0, len(field)) is inclusive of len(field),
            # so after the -1 the index ranges over -1..len-1; index -1
            # aliases the last element, which is therefore picked twice as
            # often — confirm whether randrange(len(field)) was intended.
            f.append(field[random.randint(0, len(field)) - 1])
        t['fields'] = f
        list.append(t)
    return list

# Export joined paper + teacher + radar rows for downstream processing.
print('导出数据')
sql = 'SELECT paper_clean1.id,paper_clean1.author_id,paper_clean1.`name` as title,paper_clean1.abstract,t.name,t.school,t.institution,t.citation,t.paper_num,t.h_index,t.fields from paper_clean1 JOIN (select radar.author_id,radar.citation,radar.paper_num,radar.h_index,teacher.name,teacher.school,teacher.institution,teacher.fields from teacher LEFT JOIN radar on teacher.id =radar.author_id ) as t on paper_clean1.author_id=t.author_id'
list = dbs.getDics(sql)  # NOTE(review): shadows the builtin `list`
print(len(list))
sql = "SELECT * FROM school_info"
schools = dbs.getDics(sql)


def getScool(name):
    """Return every school_info row whose name contains *name* as a substring."""
    r = []
    for s in schools:
        if s['name'].find(name) >= 0:
            r.append(s)
    return r


# NOTE(review): loop body continues beyond this chunk.
for l in list:
# Fallback mapping: discipline name -> degree-category / first-level code,
# used when a row's `code` column is NULL.
xueDic = {
    "中国史": "06",
    "农业资源与环境": "09",
    "图书情报与档案管理": "12",
    "城乡规划学": "12",
    "安全科学与工程": "0819",
    "戏剧与影视学": "05",
    "考古学": '06',
    "艺术学理论": '05',
    "草学": '09',
    "设计学": '05',
    "软件工程": "0812",
    "音乐与舞蹈学": '05',
    "风景园林学": '05',
}
# NOTE(review): `sql` is defined before this chunk — its query is not
# visible here.
xue = dbs.getDics(sql)
# Normalize every code: NULL -> table lookup; science (07) / engineering
# (08) keep the 4-digit first-level code; everything else keeps the
# 2-digit category.
for x in xue:
    if x["code"] is None:
        # NOTE(review): raises KeyError for subjects missing from xueDic.
        x["code"] = xueDic[x["xueke2"]]
    elif x["code"][0:2] == "07" or x["code"][0:2] == "08":
        x["code"] = x["code"][0:4]
    else:
        x["code"] = x["code"][0:2]
sql = 'select * from teacher_dis_code'
list = dbs.getDics(sql)  # NOTE(review): shadows the builtin `list`
sql = "SELECT * FROM school_info where name like %s"
# NOTE(review): loop body continues beyond this chunk; `params` is
# presumably bound into the LIKE query below the visible text.
for l in list:
    params = (str(l["school"]), )
def get_words():
    """Fetch all distinct ``topic_value`` rows whose LDA topic mentions 计算机.

    Returns whatever ``dbs.getDics`` produces for the query (a list of
    row dicts).
    """
    query = ("SELECT DISTINCT topic_value FROM `lda` "
             "where topic like '%计算机%'")
    return dbs.getDics(query)
import json

from algorithm.base import dbs

# Build an ego network: for each paper, match its co-author JSON against the
# teacher table and record which papers connect author_id to each teacher.
print('导出数据')
sql = 'SELECT paper.id,paper.author_id,paper.author,teacher.name,teacher.institution from paper JOIN teacher on paper.author_id=teacher.id'
list = dbs.getDics(sql)  # NOTE(review): shadows the builtin `list`
sql = 'SELECT id,name,school,institution from teacher'
teacher = dbs.getDics(sql)
print('开始关联')
# Strip spaces from teacher names so comparisons ignore spacing.
for t in teacher:
    t['name'] = t['name'].replace(' ', '')
# egoNet key: "<author_id>_<teacher_id>_<coauthor name>" -> list of paper ids.
egoNet = {}
for l in list:
    id = str(l['author_id'])  # NOTE(review): shadows the builtin `id`
    try:
        # `author` column holds a JSON array of {'name': ..., 'org': ...}.
        author = json.loads(l['author'])
    # NOTE(review): bare except — on a parse failure `author` silently keeps
    # the previous iteration's value (or is undefined on the very first row),
    # so the bad row is processed with stale data; confirm and fix upstream.
    except:
        print(l)
    name = l['name'].replace(' ', '')
    for a in author:
        # Only co-authors other than the paper's own author.
        if len(a) >= 1 and name != a['name']:
            find = False
            for t in teacher:
                # Match by exact (space-stripped) name, and by org when the
                # co-author record carries one.
                if a['name'] == t['name'] and (len(
                        a['org']) == 0 or a['org'].find(t['school']) >= 0):
                    find = True
                    key = id + '_' + str(t['id']) + '_' + a['name']
                    if key in egoNet.keys():
                        if l['id'] not in egoNet[key]:
                            egoNet[key].append(l['id'])
                    # NOTE(review): the else-branch continues beyond this chunk.
                    else: