def prepareNULL():  #
    """Bucket the papers listed in nullDOI_pickle by (year, title hash).

    For every paper id with a null DOI, fetch its time/title, group it into
    a 60 x 27 grid indexed by (year slot, assignNum(title)), then serialize
    the year list and each non-empty bucket to its own pickle file.
    """
    nullDOI = readSeriz(nullDOI_pickle)
    # buckets[year-slot][title-slot] -> list of (id, time, title)
    buckets = [[[] for _ in range(27)] for _ in range(60)]
    years = []  # year value -> slot index is its position in this list
    count = 0
    for nid in nullDOI:
        #print(str(nid))
        row = getResult('select time,title from paper where id=' + str(nid), cur)[0]
        # skip rows with an empty title or non-positive year
        if not (len(row['title']) > 0 and row['time'] > 0):
            continue
        if row['time'] not in years:
            years.append(row['time'])
        yidx = years.index(row['time'])
        nidx = assignNum(row['title'])
        buckets[yidx][nidx].append((nid, row['time'], row['title']))
        print('now is ' + str(count) + ' id is ' + str(nid))
        count += 1
    print('begin saving')
    constructSeriz(yearList_pickle_null, years)
    # only non-empty buckets get a pickle file (unlike analysisNull, which
    # writes empty ones too)
    for i in range(len(years)):
        for j in range(27):
            path = idyeartitle_path_null + str(i) + '_' + str(j) + '.pickle'
            if len(buckets[i][j]) > 0:
                constructSeriz(path, buckets[i][j])
def buildCoauthorByYear(year):  #coauthornet xid,yid,score
    """Compute a coauthorship score for every expert pair, insert it into
    the `coauthornet` DB table and record it in a matrix pickle.

    BUG FIX: the original tested `if len(yearList)==0` but the only
    assignment to `yearList` (a year-filtered DB query) was commented out,
    so the loop raised NameError on the first non-trivial pair.  The check
    clearly guards against scoring an empty coauthor list, so it now tests
    `coauthorList` directly.

    TODO(review): the `year` parameter is currently unused — the commented
    query suggests coauthorList was meant to be filtered to entries after
    `year`; confirm intent before relying on the scores.
    """
    paperNetYear = readSeriz(paperNetYear_pickle)
    idList = readSeriz(idList_pickle)
    coauthorNet = readSeriz(expertNet_pickle)  # serves as an initialised template
    for i in range(len(idList)):
        xid = str(idList[i])
        for j in range((i + 1), len(idList)):
            yid = str(idList[j])
            coauthorList = paperNetYear[i][j]
            # skip pairs that never co-authored (was: undefined `yearList`)
            if len(coauthorList) == 0:
                continue
            score = calculateCoauthor(coauthorList)
            # xid/yid are already strings
            insertSQL = 'insert into coauthornet (xid,yid,score) values(' + xid + ',' + yid + ',' + str(score) + ')'
            cur.execute(insertSQL)
            conn.commit()
            coauthorNet[i][j] = score  # also keep an in-memory record
    coauthorNet_pickle = ''  # TODO(review): path must be rewritten to include `year`
    constructSeriz(coauthorNet_pickle, coauthorNet)
def prepareTopic():
    """Merge the 2012-2015 subject pickles into one deduplicated word list.

    Each subject string is cleaned, split into words, filtered against the
    stop-word list, lemmatised with WordNet where possible, and serialized
    to `word_pickle`.

    BUG FIX: the original ran `newTopic = list(set(newTopic))` after the
    loop, which exploded the *last word string* into characters and threw
    the result away — the intended deduplication (cf. extractSub2012's
    `list(set(...))` pattern) is applied to `topicWord` instead.

    NOTE(review): `stopword` is referenced but its assignment
    (`readStopWord()`) is commented out — it must exist at module level
    for this to run; confirm.
    """
    subL1 = 'E:/Code/experience/pickle/subPickle 2012.pickle'
    subL2 = 'E:/Code/experience/pickle/subPickle 2013.pickle'
    subL3 = 'E:/Code/experience/pickle/subPickle 2014.pickle'
    subL4 = 'E:/Code/experience/pickle/subPickle 2015.pickle'
    #stopword = readStopWord()  # prepare the stop-word list
    sub1 = readSeriz(subL1)
    sub2 = readSeriz(subL2)
    sub3 = readSeriz(subL3)
    sub4 = readSeriz(subL4)
    sub = list(set(sub1 + sub2 + sub3 + sub4))
    topicWord = []
    for s in sub:
        s = s.replace('\ufeff', '').replace('/', ' ')
        for w in s.split():
            if w not in stopword and len(w) > 2:
                nw = wn.morphy(w)  # lemmatise; returns None when no base form
                # keep the lemma when available, otherwise the raw word
                # (same effect as the original try/except around `nw + ''`)
                topicWord.append(nw if nw is not None else w)
    # BUG FIX: deduplicate the collected words, not the scratch variable
    topicWord = list(set(topicWord))
    #print(topicWord)
    constructSeriz(word_pickle, topicWord)  # serialize
def measureCOI():  # build the COI (conflict-of-interest) matrix
    """Compute COI[paper][reviewer] = 1 - min over the paper's authors of
    the author-reviewer shortest-distance score, and serialize the matrix.
    """
    reviewerDict = readSeriz(reviewerDict_pickle)
    personDict = readSeriz(personDict_pickle)
    reviewer_ID = readSeriz(reviewer_ID_pickle)
    totalEID = readSeriz(totalEID_pickle)
    paperDict = readSeriz(paperDict_pickle)
    shortDistanceMetrx = readSeriz(shortDistanceMetrx_pickle)
    paperList = list(paperDict.keys())
    reviewerList = list(reviewerDict.keys())
    COI = [[0 for _ in reviewerList] for _ in paperList]
    # paperList/reviewerList preserve dict order, so enumerate() gives the
    # same (i, j) pairing as the original index() lookups
    for i, title in enumerate(paperList):
        for j, rName in enumerate(reviewerList):
            idy = totalEID.index(reviewer_ID[rName])
            distances = []
            for author in paperDict[title]:
                idx = totalEID.index(personDict[author])
                distances.append(shortDistanceMetrx[idx][idy])
            # the reviewer's COI w.r.t. this paper is driven by the
            # closest (smallest-distance) author
            COI[i][j] = 1 - min(distances)
    constructSeriz(COI_pickle, COI)
def extractPaper2013():  #
    """Parse the 2013 accepted-papers HTML dump into
    {title: [[name, institution], ...]} and serialize it.

    BUG FIX: the input file handle was opened and never closed; it is now
    managed with a `with` block (consistent with the fix in
    extractPaper2014).
    """
    with open(accepted_path, encoding='utf-8') as fh:
        rfile = list(fh)
    paperDict = {}
    i = 0
    title = ''
    for j in range(len(rfile)):
        i += 1
        if '<strong>' in rfile[j] and '<br />' in rfile[j]:  # title line
            start = rfile[j].index('</strong>')
            end = rfile[j].index('<br />')
            title = rfile[j][start + len('</strong>'):end].strip()
            continue
        elif 'Authors: ' in rfile[j]:
            authorInstit = rfile[j].replace('Authors: ', '').replace(
                '<br />', '').replace('</h5>', '')
            authors = []
            aipairs = authorInstit.split(';')
            for k in range(len(aipairs)):
                if len(aipairs[k]) < 5:
                    continue
                if ',' in aipairs[k]:
                    idx = aipairs[k].index(',')
                    name = aipairs[k][:idx].strip()
                    institu = aipairs[k][idx + 1:]
                    if len(institu) > 1:
                        institu = institu.strip()
                    else:
                        institu = 'Null'
                else:
                    # segment without a comma: treat it as the institution
                    # and recover the name from the previous segment.
                    # NOTE(review): slicing aipairs[k] with an index taken
                    # from aipairs[k-1] looks suspicious — probably meant
                    # aipairs[k-1][idx+1:]; confirm against the raw data.
                    try:
                        institu = aipairs[k].strip()
                        idx = aipairs[k - 1].rindex(',')
                        name = aipairs[k][idx + 1:].strip()
                        authors[-1][1] = authors[-1][1][:idx]
                        #print('error: '+authorInstit)
                    except Exception:
                        print(authorInstit)
                        break
                authors.append([name, institu])
            if title in paperDict.keys():
                # duplicate title: disambiguate with the running line count
                title += (' ' + str(i))
                paperDict[title] = authors
            else:
                paperDict[title] = authors
            print('completed: ' + title)
            print(authors)
            print()
        else:
            pass  # neither a title nor an authors line
    constructSeriz(paperDict_pickle, paperDict)
    print('OK')
def flody():  # run Floyd-style iteration to get shortest distances
    """Iteratively relax the distance matrix until it stops changing (or
    MAX_TIME iterations), then serialize the final matrix.

    BUG FIX: the original appended the SAME `D0` and `A0` list objects on
    every iteration, so DSet[1], DSet[2], ... all aliased one matrix: each
    pass read DSet[k-1] while writing the very same object as DSet[k], and
    the convergence test `DSet[k] == DSet[k-1]` was trivially true from
    k=2.  Fresh matrices are now allocated per iteration.
    """
    distanceMetrx = readSeriz(distanceMetrx_pickle)
    lenD = len(distanceMetrx)
    DSet = [distanceMetrx]
    # ASet[k][i][j] records the intermediate node(s) of the best path found
    ASet = [[[[] for _ in range(lenD)] for _ in range(lenD)]]
    for k in range(1, MAX_TIME):  # start from 1; MAX_TIME bounds iterations
        # allocate fresh matrices for this iteration (was: shared D0/A0)
        DSet.append([[0 for _ in range(lenD)] for _ in range(lenD)])
        ASet.append([[[] for _ in range(lenD)] for _ in range(lenD)])
        for i in range(lenD):
            for j in range(lenD):
                if i == j:
                    DSet[k][j][j] = 0
                    ASet[k][j][j] = 0
                    continue
                MIN_old = DSet[k - 1][i][j]
                for x in range(lenD):
                    if (x == i) or (x == j):
                        continue
                    if (DSet[k - 1][i][x] >= MIN_old) or (DSet[k - 1][x][j] >= MIN_old):
                        # no shorter detour via x: keep the previous value
                        DSet[k][i][j] = DSet[k - 1][i][j]
                    else:
                        MIN_new = min(MIN_old, (DSet[k - 1][i][x] + DSet[k - 1][x][j]))
                        DSet[k][i][j] = MIN_new
                        if MIN_new < MIN_old:
                            # remember x as the improving via-node
                            ASet[k][i][j] = [ASet[k - 1][i][x], x, ASet[k - 1][x][j]]
                        else:
                            ASet[k][i][j] = ASet[k - 1][i][j]
                        MIN_old = MIN_new
        print('now is ' + str(k))
        if DSet[k] == DSet[k - 1]:  # converged: no cell changed
            break
    # last computed matrix (also robust if MAX_TIME <= 1 and the loop never ran)
    shortDistanceMetrx = DSet[-1]
    constructSeriz(shortDistanceMetrx_pickle, shortDistanceMetrx)
def extractPaper2014():  #
    """Parse the 2014 accepted-papers HTML dump into
    {title: [[name, institution], ...]} and serialize it.

    BUG FIX: the input file handle was opened and never closed; it is now
    managed with a `with` block (consistent with extractPaper2013).
    """
    with open(accepted_path, encoding='utf-8') as fh:
        rfile = list(fh)
    paperDict = {}
    i = 0
    for j in range(len(rfile)):
        i += 1
        if not '<strong>' in rfile[j]:
            continue
        else:
            parts = rfile[j].split('</strong> <br/>')
            title = parts[0].replace('<strong>', '').replace('<s>', '').strip()
            authorInstit = parts[1].split('<br/>')[0].strip()
            authors = []
            aipairs = authorInstit.split(';')
            for k in range(len(aipairs)):
                if len(aipairs[k]) < 5:
                    continue
                if ',' in aipairs[k]:
                    idx = aipairs[k].index(',')
                    name = aipairs[k][:idx].strip()
                    institu = aipairs[k][idx + 1:].strip()
                else:
                    # comma-less segment: institution only; recover the name
                    # from the previous segment.  NOTE(review): slicing
                    # aipairs[k] with idx from aipairs[k-1] looks like it
                    # should be aipairs[k-1][idx+1:]; confirm on raw data.
                    try:
                        institu = aipairs[k].strip()
                        idx = aipairs[k - 1].rindex(',')
                        name = aipairs[k][idx + 1:].strip()
                        authors[-1][1] = authors[-1][1][:idx]
                        #print('error: '+authorInstit)
                    except Exception:
                        print(authorInstit)
                        break
                authors.append([name, institu])
            if title in paperDict.keys():
                # duplicate title: disambiguate with the running line count
                title += (' ' + str(i))
                paperDict[title] = authors
            else:
                paperDict[title] = authors
            print('completed: ' + title)
            #break
    constructSeriz(paperDict_pickle, paperDict)
    print('OK')
def compareNull():
    """For each null-DOI bucket file, match its entries against the
    corresponding non-null bucket by exact title; serialize the matched
    id pairs (`sameList`) and the unmatched ids (`single`).

    BUG FIXES:
    - when `yearTitle` was empty, the original appended every id to
      `single` in a special-case branch and then fell through to the
      generic loop, which (with an empty inner loop and `flag` False)
      appended them all a second time — the generic loop alone suffices;
    - the bounds check used `>` where `>=` is required
      (`yearListNull[yidx_null]` raises IndexError at equality).
    """
    yearListNull = readSeriz(yearList_pickle_null)
    yearList = readSeriz(yearList_pickle)
    for fp in readFiles(nullDict):
        sameList = []
        single = []
        print('now begin: ' + str(fp))
        nullYearTitle = readSeriz(fp)
        yidx_null, nidx_null = extractYearTitle(fp)
        if yidx_null >= len(yearListNull):  # was `>`: off-by-one
            print('error!!!!!!!!!!????')
            continue
        year = yearListNull[yidx_null]
        if year in yearList:
            yidx = yearList.index(year)
        else:
            print('error!!!!!!!!!!')
            continue
        path = idyeartitle_path + str(yidx) + '_' + str(nidx_null) + '.pickle'
        yearTitle = readSeriz(path)
        # (removed the redundant empty-yearTitle branch that duplicated
        # every id in `single`; this loop already handles that case)
        for i in range(len(nullYearTitle)):
            flag = False
            for j in range(len(yearTitle)):
                if nullYearTitle[i][2] == yearTitle[j][2]:
                    # titles match: pair the null-DOI id with the known id
                    sameList.append([nullYearTitle[i][0], yearTitle[j][0]])
                    flag = True
                    continue  # keep scanning: multiple matches all recorded
            if flag == False:
                single.append(nullYearTitle[i][0])
        sameList_path = sameList_pickle + str(yidx) + '_' + str(
            nidx_null) + '.pickle'
        single_path = single_pickle + str(yidx) + '_' + str(
            nidx_null) + '.pickle'
        constructSeriz(sameList_path, sameList)
        constructSeriz(single_path, single)
def buildFinalGraph():  # sum the two score matrices into one graph
    """Add the coauthor and institution score matrices cell-wise, mirror
    the result symmetrically, and serialize a networkx graph whose edges
    carry the non-zero combined scores.

    BUG FIX: `finalLen = len(coauthorNet)` executed BEFORE `coauthorNet`
    was assigned, raising UnboundLocalError; the length is now taken after
    the matrix is loaded.  The unreachable `if i==j` guard (j starts at
    i+1) was dropped.

    NOTE(review): this function is shadowed by a later duplicate
    definition in the same module — consolidate.
    """
    #coauthorGraph = readSeriz(coauthorGraph_pickle)
    #institutionGraph = readSeriz(institutionGraph_pickle)
    coauthorNet = readSeriz(coauthorNet_pickle)
    institutionNet = readSeriz(institutionNet_pickle)
    finalNet = readSeriz(expertNet_pickle)  # initialised template
    finalLen = len(coauthorNet)  # moved after the load (was before it)
    finalGraph = nx.Graph()
    for i in range(finalLen):
        for j in range((i + 1), finalLen):
            finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]
            finalNet[j][i] = finalNet[i][j]
            if finalNet[i][j] != 0:
                finalGraph.add_edge(*(i, j), score=finalNet[i][j])
    constructSeriz(finalGraph_pickle, finalGraph)
def extractSub2012():
    """Extract topic phrases from the 2012 subject file (lines like
    'Topics: a, b and c'), split them on 'and', deduplicate, and serialize.

    BUG FIX: the file handle was opened without being closed; now managed
    by a `with` block.  The local `file` (which shadowed a builtin name)
    was renamed.

    NOTE(review): splitting on the bare substring 'and' also splits words
    that merely contain it (e.g. 'rand om') — confirm against the data.
    """
    topicSet = []
    with open(file_path) as fh:
        for fl in fh:  # read line by line
            if len(fl) > 3:
                fl = fl.lower()
                text = fl.split(': ')[1]
                text = text.replace(',', ' and')
                text = text.replace('//', ' and ')
                if 'and' in text:
                    for word in text.split('and'):
                        topicSet.append(word.strip())
                else:
                    topicSet.append(text.strip())
    topicSet = list(set(topicSet))
    for tp in topicSet:
        print(tp)
    constructSeriz(subPickle_path, topicSet)
def buildTopic():  # topicnet xid,yid,score
    """Score every expert pair by their shared topics, insert the scores
    into the `topicnet` table, and serialize the score matrix, graph and
    per-pair topic lists.

    BUG FIXES:
    - `xid`/`yid` were used in the INSERT but never defined (NameError);
      they are now derived from idList like buildCoauthorByYear does;
    - `topicGraph(*(i,j),similar=score)` called the Graph object itself
      (TypeError); the intended call is `topicGraph.add_edge`.

    NOTE(review): `idList` is read from an enclosing/module scope here —
    confirm it is loaded before this runs.  This function is duplicated
    later in the module; consolidate.
    """
    topicNet = readSeriz(expertNet_pickle)
    topicNetMore = readSeriz(expertNet_pickle)
    topicGraph = nx.Graph()
    # pre-read: one dict per expert, keyed by topic id ('other'), value 'num'
    totalSet = []
    for i in range(len(idList)):
        # 'other' is effectively the topic id (tid)
        xResult = getResult('select other from topic where eid=' + str(idList[i]), cur)
        xDict = {}
        for x in xResult:
            xDict[x['other']] = x['num']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range((i + 1), len(idList)):
            topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(topicList) > 0:
                score = calculateTopic(topicList)
                xid = str(idList[i])  # was undefined in the original
                yid = str(idList[j])
                insertSQL = 'insert into topicnet (xid,yid,score) values(' + xid + ',' + yid + ',' + str(score) + ')'
                cur.execute(insertSQL)
                conn.commit()
                topicNet[i][j] = score  # also keep an in-memory record
                # was: topicGraph(*(i,j),similar=score) — not callable
                topicGraph.add_edge(*(i, j), similar=score)
                topicNetMore[i][j] = topicList
    constructSeriz(topicNet_pickle, topicNet)
    constructSeriz(topicGraph_pickle, topicGraph)
    constructSeriz(topicNetMore_pickle, topicNetMore)
def buidPaperNet():  # papernet: id xid yid perid year
    """Build the co-authored-paper network: for every expert pair, find the
    papers they share, insert each shared paper into the `papernet` table,
    and serialize the count matrix, the per-pair paper-id lists and a
    networkx graph.

    BUG FIXES:
    - the pre-read query used `str()` with no argument, producing the
      malformed SQL `... where eid=`; it now interpolates idList[i];
    - `insertSQL` was built but never executed; it is now executed, with a
      commit per pair (matching the sibling build* functions).

    NOTE(review): `idList` comes from an enclosing/module scope — confirm
    it is loaded before this runs.
    """
    paperNet = readSeriz(expertNet_pickle)      # initialised template
    paperNetYear = readSeriz(expertNet_pickle)  # initialised template
    paperGraph = nx.Graph()
    # pre-read: one {paper id: year} dict per expert, so the year filter
    # can be applied later without re-querying
    totalSet = []
    for i in range(len(idList)):
        xResult = getResult('select id,time from paper where eid=' + str(idList[i]), cur)
        xDict = {}
        for x in xResult:
            xDict[x['id']] = x['time']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range((i + 1), len(idList)):
            coauthorList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(coauthorList) > 0:
                # record every shared paper in the database
                for cid in coauthorList:
                    year = totalSet[i][0][cid]
                    insertSQL = ('insert into papernet (xid,yid,perid,year) values('
                                 + str(idList[i]) + ',' + str(idList[j]) + ','
                                 + str(cid) + ',' + str(year) + ')')
                    cur.execute(insertSQL)  # was built but never executed
                conn.commit()
                paperNet[i][j] = len(coauthorList)
                paperNetYear[i][j] = coauthorList
                paperGraph.add_edge(*(i, j), coauthoryear=coauthorList)  # add one edge
    constructSeriz(paperNet_pickle, paperNet)
    constructSeriz(paperNetYear_pickle, paperNetYear)
    constructSeriz(paperGraph_pickle, paperGraph)
def extractReviewer():
    """Parse the reviewer list files into {name: institution} and serialize.

    Files in `pathList` use 'Name (Institution)' lines; files in
    `pathListICDM` use space-separated 'Name Institution' lines.

    BUG FIX: both loops opened file handles without ever closing them;
    each file is now read inside a `with` block.
    """
    reviewerDict = {}
    for p in pathList:
        with open(p, encoding='utf-8') as fh:
            rfile = list(fh)
        for j in range(len(rfile)):
            if '(' in rfile[j]:
                parts = rfile[j].split(' (')
                name = parts[0].strip()
                # first occurrence of a name wins
                if name not in reviewerDict.keys():
                    reviewerDict[name] = parts[1].replace(')', '').strip()
    for p in pathListICDM:
        with open(p, encoding='utf-8') as fh:
            rfile = list(fh)
        for j in range(len(rfile)):
            if len(rfile[j]) > 4:
                # NOTE(review): parts[0] is only the first whitespace token,
                # so multi-word names are truncated — confirm input format.
                parts = rfile[j].split(' ')
                name = parts[0].strip()
                if name not in reviewerDict.keys():
                    reviewerDict[name] = parts[1].strip()
    print(reviewerDict)
    constructSeriz(reviewerDict_pickle, reviewerDict)
def cleanDOI():  # id,doi
    """Assign a shared sequential paperid to rows with the same DOI.

    The first occurrence of a DOI gets a fresh paperid; every later row
    with the same DOI reuses it.  Serializes both the DOI index and the
    resulting [id, paperid] pairs.
    """
    next_pid = 0
    rows = getResult('select id,doi from paper', cur)
    print('read completed')
    doi_index = {}   # doi -> (first row id, assigned paperid)
    id_to_pid = []   # [row id, paperid] in input order
    for pos, row in enumerate(rows):
        if pos % 5000 == 0:  # progress heartbeat
            print('now is ' + str(pos))
        doi = row['doi']
        if doi in doi_index:
            id_to_pid.append([row['id'], doi_index[doi][1]])
        else:
            # unseen DOI: allocate the next paperid
            next_pid += 1
            doi_index[doi] = (row['id'], next_pid)
            id_to_pid.append([row['id'], next_pid])
    #print(doi_index)
    #print(id_to_pid)
    constructSeriz(doiDict_pickle, doi_index)
    constructSeriz(idPaperid_pickle, id_to_pid)
def prepareDistanceMetrx():  # pairwise coauthor relatedness, one slice at a time
    """Fill the PART-th tenth of the pairwise coauthor-distance matrix.

    Loads the base matrix, computes measureCoauthor() for every (i, j)
    pair in this slice from shared paper ids in `tmp_paper`, checkpoints
    every 100 columns, and serializes the result to a PART-specific file.

    BUG FIXES:
    - the slice upper bound was checked only AFTER running the per-row DB
      query, wasting one query per run; the check is now first;
    - the final matrix was serialized twice back-to-back; once suffices.
    """
    PART = 5
    totalEID = readSeriz(totalEID_pickle)
    totalEID = list(set(totalEID))
    lenEID = len(totalEID)
    limit = int(lenEID / 10)  # slice width: one tenth of the ids
    print('len is ' + str(lenEID))
    distanceMetrx_pickle = dictPath + 'distanceMetrx0.pickle'
    distanceMetrx = readSeriz(distanceMetrx_pickle)
    distanceMetrx_pickle_new = dictPath + 'distanceMetrx' + str(PART) + '.pickle'
    for i in range((limit * (PART - 1)), lenEID - 1):
        if i > (limit * PART):  # past this PART's slice: stop (moved before the query)
            break
        py1 = getResultList(
            'select paperid from tmp_paper where eid = ' + str(totalEID[i]),
            'paperid', cur)
        set1 = set(py1)
        distanceMetrx[i][i] = 0
        for j in range(i + 1, lenEID):
            py2 = getResultList(
                'select paperid from tmp_paper where eid = ' + str(totalEID[j]),
                'paperid', cur)
            sameset = list(set1 & set(py2))
            value = 0
            if len(sameset) == 0:
                # no shared papers: normalisation to M happens later
                distanceMetrx[i][j] = 0
                distanceMetrx[j][i] = 0
            else:
                value = measureCoauthor(py1, py2, sameset)
                distanceMetrx[i][j] = value
                distanceMetrx[j][i] = value
            if j % 100 == 0:  # periodic checkpoint
                constructSeriz(distanceMetrx_pickle_new, distanceMetrx)
                print('completed: ' + str(i) + ', ' + str(j) + ' value: ' + str(value))
    constructSeriz(distanceMetrx_pickle_new, distanceMetrx)  # final save (was duplicated)
def analysisNull():  #
    '''
    nullDOI = readSeriz(nullDOI_pickle)
    print('num is '+str(len(nullDOI)))
    nullDOI = readSeriz(nullDOI_pickle)
    for nid in nullDOI:
        updateSQL = 'update paper set paperid=null where id='+str(nid)
        cur.execute(updateSQL)
        conn.commit()
        print('completed: '+str(nid))
        #break
    '''
    # Bucket every paper that HAS a paperid by (year slot, title hash) into
    # a 100 x 27 grid, then serialize the year list and every bucket
    # (empty buckets included, unlike prepareNULL).
    buckets = [[[] for _ in range(27)] for _ in range(100)]
    years = []
    rows = getResult(
        'select id,time,title from paper where paperid is not null', cur)
    print('read completed, the total is ' + str(len(rows)))
    count = 0
    for row in rows:
        # skip rows with an empty title or non-positive year
        if not (len(row['title']) > 0 and row['time'] > 0):
            continue
        if row['time'] not in years:
            years.append(row['time'])
        yidx = years.index(row['time'])
        nidx = assignNum(row['title'])
        buckets[yidx][nidx].append((row['id'], row['time'], row['title']))
        print('now is ' + str(count) + ' id is ' + str(row['id']))
        count += 1
    print('begin saving')
    constructSeriz(yearList_pickle, years)
    for i in range(len(years)):
        for j in range(27):
            path = idyeartitle_path + str(i) + '_' + str(j) + '.pickle'
            data = buckets[i][j]
            constructSeriz(path, data if len(data) > 0 else [])
def preparePid():  #
    """Group paper ids by pid: pids owning several ids go to pidList (as
    id lists), pids owning exactly one id go to nidList; serialize both
    plus the raw query result.
    """
    grouped = []    # id-lists for pids with more than one paper id
    singletons = [] # lone ids
    rows = getResult(
        'select tem.pid,group_concat(tem.id) ids from (select pid, id from paper) tem group by tem.pid',
        cur)
    constructSeriz(selectResult_pickle, rows)
    for row in rows:
        ids = row['ids']
        if ',' in ids:
            # more than one id shares this pid
            #pidQueue.put(ids.split(','))
            grouped.append(ids.split(','))
        else:
            #nidQueue.put([ids])
            singletons.append(ids)
    constructSeriz(pidList_pickle, grouped)
    constructSeriz(nidList_pickle, singletons)
    print('completed')
def measure_topicSIM(topicsNum=30):  #
    """Train an LDA model over submission + reviewer bags-of-words and build
    a paper x reviewer topic-similarity matrix.

    FIXME(review): work in progress — the live code below references
    `paperList`, `reviewerList` and `paperDictT`, which are only defined
    inside the commented-out block further down, so this raises NameError
    as written; the computed TOPIC matrix is also discarded (code after
    `return` is unreachable).
    """
    paperDictTopic = readSeriz(paperDictTopic_pickle)
    reviewerDictTopic = readSeriz(reviewerDictTopic_pickle)
    wordOfBagSet = []
    i = 0
    # paperID: bag-of-words index -> (is_submission, title[, reviewer]);
    # lets us tell whether a corpus row is a submission or a reviewer paper
    paperID = {}
    for title in paperDictTopic:
        wordOfBagSet.append(paperDictTopic[title])
        paperID[i] = (True, title)
        i += 1
    for reviewer in reviewerDictTopic:
        for title in reviewerDictTopic[reviewer]:
            wordOfBagSet.append(title)
            paperID[i] = (False, title, reviewer)
            i += 1
    texts = wordOfBagSet[:]
    dictionary = corpora.Dictionary(texts)
    #dictionary.save('F:/newsAnalysis/data/newswordsall.dict')
    corpus = [dictionary.doc2bow(text) for text in texts]
    #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ldaModel = models.LdaModel(corpus_tfidf,
                               id2word=dictionary,
                               num_topics=topicsNum)
    ldaModel.show_topics()
    corpus_lda = ldaModel[corpus]  # topic distribution of each document
    # NOTE(review): this applies the model to its own output
    # (ldaModel[corpus_lda]); it likely should index ldaModel[corpus] —
    # confirm against gensim usage before trusting the similarities.
    index = similarities.MatrixSimilarity(ldaModel[corpus_lda])
    #corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    #corpus_lda = [[(d[0],round(float(d[1]),5)) for d in doc] for doc in corpus_lda0]
    #for doc in corpus_lda:
    #    print(doc)
    '''
    #参考部分
    doc = "Human computer interaction"
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = ldaModel[vec_bow] # convert the query to LSI space
    index = similarities.MatrixSimilarity(ldaModel[corpus_lda])
    sims = index[vec_lsi]
    print(sims)
    '''
    '''
    paperList = list(paperDictT.keys())
    reviewerList = list(reviewerDictT.keys())
    TOPIC = [[0 for j in range(len(reviewerList)) ] for i in range(len(paperList))]
    paperDictT = {}
    #reviewerDictT = {}
    for i in range(len(corpus_lda)):
        if paperID[i][0] == True: #投稿文章 {title:[1,2,3,4,5],}
            #paperDictT[paperID[i][1]] = corpus_lda[i]
            paperDictT[paperID[i][1]] =
        else:
            #pass
            if paperID[i][2] in reviewerDictT.keys():
                reviewerDictT[paperID[i][2]].append(corpus_lda[i])
            else:
                reviewerDictT[paperID[i][2]] = []
                reviewerDictT[paperID[i][2]].append(corpus_lda[i])
    '''
    corpusLen = 0
    # FIXME(review): reviewerList / paperList / paperDictT below are not
    # defined anywhere live in this function (only in the commented block
    # above) — NameError at runtime.
    TOPIC = [[0 for j in range(len(reviewerList))]
             for i in range(len(paperList))]
    #metrx = [[0 for i in range(corpusLen)] for ]
    for i in range(len(paperList)):
        paperList[i]  # NOTE(review): no-op expression; intent unclear
        corpus_lda_vector = paperDictT[paperList[i]]
        sims = index[corpus_lda_vector]
        for j in range(len(reviewerList)):
            TOPIC[i][j] = sims[j]
    return
    # FIXME(review): everything below is unreachable (after `return`),
    # so none of these results are ever serialized.
    #print(paperDictT_pickle)
    constructSeriz(paperDictT_pickle, paperDictT)
    constructSeriz(reviewerDictT_pickle, reviewerDictT)
    constructSeriz(corpus_lda_pickle, corpus_lda)
def basePrepare():  #expertNet xid,yid,status,attr
    """Initialise the scholar-id network (expertNet) in the database.

    FIXME(review): unfinished — the inner loop `for j in range()` has no
    arguments (and no colon), so this function does not even parse;
    `yid` and `expertNet` are also undefined (their sources are commented
    out).  Must be completed before `mainFunction` can run.
    """
    #selectSQL = 'select id from dlurl1'
    #idList = getResultList(selectSQL,'id',cur)
    #constructSeriz(idList_pickle,idList)
    #print('read complete!')
    # build the scholar id network
    idList = readSeriz(idList_pickle)
    idLength = len(idList)
    #expertNet = [[0 for j in range(idLength)] for i in range(idLength)] # used for the final summation
    for i in range(idLength):
        # FIXME(review): incomplete loop header — syntax error as written
        for j in range()
            # FIXME(review): `yid` undefined; statement is built but never executed
            insertSQL = 'insert into expertNet (xid,yid) values ('+str(idList[i])+','+str(yid)+')'
    # serialize
    # FIXME(review): `expertNet` undefined (its initialisation is commented out above)
    constructSeriz(expertNet_pickle,expertNet)
    del expertNet
    gc.collect()
    print('all complete')

def buildInstitution():  # colleaguenet xid,yid,score
    # NOTE(review): entire body is a disabled draft kept as a string below;
    # calling this function is currently a no-op.
    '''
    institutionNet = readSeriz(expertNet_pickle)
    institutionGraph = nx.Graph()
    #预读
    totalSet = []
    for i in range(len(idList)):
        #iid还没有设置,应该是status之类的项
        xResult = getResult('select iid from experience1 where eid='+str(idList[i]),cur)
        #需要有个pid list 建立一个字典 key是id,time是内容
        xDict = {}
        for x in xResult:
            xDict[x['id']] = x['time']
        xID = xDict.keys()
        totalSet.append((xDict,xID))
    #这里与paper/coauthor不一样,不用进行一次控制,可以直接放入,类似coauthornet那种
    for i in range(len(idList)):
        for j in range((i+1),len(idList)):
            institutionList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(institutionList)>0:
                #全部插入数据库中
                score = calculateColleague(institutionList)
                #xid yid 已经转成str
                insertSQL = 'insert into colleaguenet (xid,yid,score) values('+xid+','+yid+','+str(score)+')'
                cur.execute(insertSQL)
                conn.commit()
                institutionNet[i][j] = score #这里也做一个记录
                institutionGraph(*(i,j),coinstitution=institutionList, weight=score)
    constructSeriz(institutionNet_pickle,institutionNet)
    constructSeriz(institutionGraph_pickle,institutionGraph)
    '''

def calculateColleague(institutionList):  # the concrete scoring method
    # TODO(review): not implemented; buildInstitution's draft depends on it
    pass

def buildFinalGraph():  # sum the two graphs
    """Combine coauthor and institution score matrices into finalGraph.

    FIXME(review): `finalLen = len(coauthorNet)` runs BEFORE `coauthorNet`
    is assigned -> UnboundLocalError.  Also duplicates an earlier
    buildFinalGraph definition in this module (this later one wins).
    """
    #coauthorGraph = readSeriz(coauthorGraph_pickle)
    #institutionGraph = readSeriz(institutionGraph_pickle)
    finalLen = len(coauthorNet)
    coauthorNet = readSeriz(coauthorNet_pickle)
    institutionNet = readSeriz(institutionNet_pickle)
    finalNet = readSeriz(expertNet_pickle)  # initialise a template
    finalGraph = nx.Graph()
    for i in range(finalLen):
        for j in range((i+1),finalLen):
            if i==j:  # NOTE(review): unreachable — j starts at i+1
                continue
            finalNet[i][j] = coauthorNet[i][j] + institutionNet[i][j]
            finalNet[j][i] = finalNet[i][j]
            if finalNet[i][j] != 0:
                finalGraph.add_edge(*(i,j),score=finalNet[i][j])
    constructSeriz(finalGraph_pickle,finalGraph)

def mainFunction():  #
    # pipeline driver; blocked on the FIXMEs in basePrepare
    basePrepare()
    buidPaperNet()
    buildCoauthorByYear(2012)  # threshold year: pairs after this count
    buildInstitution()

def analysisGraph(finalGraph):
    # report whether the final graph is connected
    if nx.is_connected(finalGraph):
        print('Yes')
    else:
        # FIXME(review): `number_connected_components` is unqualified
        # (missing the `nx.` prefix) -> NameError; both results are also
        # discarded.  `connected_component_subgraphs` was removed in
        # networkx 2.4 — confirm the installed version.
        number_connected_components(finalGraph)
        nx.connected_component_subgraphs(finalGraph)  # what type does this return?

def buildTopic():  # topicnet xid,yid,score
    """Score expert pairs by shared topics.

    FIXME(review): duplicate of the earlier buildTopic definition (this
    later one wins at import time); `xid`/`yid` are undefined in the
    INSERT and `topicGraph(...)` calls the Graph object instead of
    `add_edge` — both raise at runtime.
    """
    topicNet = readSeriz(expertNet_pickle)
    topicNetMore = readSeriz(expertNet_pickle)
    topicGraph = nx.Graph()
    # pre-read
    totalSet = []
    for i in range(len(idList)):
        # 'other' is not set up yet; it acts as the topic id (tid)
        xResult = getResult('select other from topic where eid='+str(idList[i]),cur)
        # build a dict keyed by 'other' with 'num' as the value
        xDict = {}
        for x in xResult:
            xDict[x['other']] = x['num']
        xID = xDict.keys()
        totalSet.append((xDict,xID))
    for i in range(len(idList)):
        for j in range((i+1),len(idList)):
            topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(topicList)>0:
                # insert every shared topic pair into the database
                score = calculateTopic(topicList)
                # xid/yid assumed already converted to str
                # FIXME(review): xid/yid are never defined here -> NameError
                insertSQL = 'insert into topicnet (xid,yid,score) values('+xid+','+yid+','+str(score)+')'
                cur.execute(insertSQL)
                conn.commit()
                topicNet[i][j] = score  # also keep an in-memory record
                # FIXME(review): Graph object is not callable; add_edge intended
                topicGraph(*(i,j),similar=score)
                topicNetMore[i][j] = topicList
    constructSeriz(topicNet_pickle,topicNet)
    constructSeriz(topicGraph_pickle,topicGraph)
    constructSeriz(topicNetMore_pickle,topicNetMore)

def calculateTopic():
    # TODO(review): not implemented; note the other buildTopic copy calls
    # it with one argument — signatures disagree
    pass

if __name__ == '__main__':
    basePrepare()
def selectEID():  #authors X reviewers: expand totalEID with coauthors found via eid
    """One-hop expansion of the known-expert id set.

    For every submission's author list, query `tmp_paper` for everyone who
    shares a paper with those authors, and merge the newly found eids into
    `totalEID` (serialized after every title as a checkpoint).

    NOTE(review): `DictID[na]` raises KeyError if an author name was never
    resolved in personDict — confirm indexEID always runs first.  The
    first large triple-quoted block is an earlier reviewer-based variant
    kept for reference.
    """
    totalEID = readSeriz(totalEID_new_pickle)
    '''
    Dict = readSeriz(reviewer_ID_pickle)
    #Dict = readSeriz(paperAuthorDict_pickle) #这里包有[]
    totalEID = list(set(totalEID))
    print('begin totleEID len is '+str(len(totalEID)))
    for name in Dict.keys():
        eid = Dict[name]
        if eid ==-1:
            continue
        selectEIDList1 = []
        selectEIDList2 = [eid] #下一次要查询的列表
        selectEIDCompleted = [] #已经完成查询的列表
        for i in range(1):
            if len(selectEIDList2)==0:
                break
            selectEIDList1 = selectEIDList2[:]
            #print(selectEIDList1)
            selectEIDList2 = []
            for j in selectEIDList1:
                print('--query: '+str(j))
                selectSQL = 'select distinct eid from tmp_paper where paperid in (select distinct paperid from tmp_paper where eid='+str(j)+' ) limit 100'
                newEidList = getResultList(selectSQL,'eid',cur)
                newList = 0
                for kid in newEidList:
                    if (not kid in selectEIDCompleted) and (not kid in totalEID) and (not kid in selectEIDList2):
                        selectEIDList2.append(kid)
                        newList += 1
                        if newList>300:
                            break
                selectEIDCompleted.append(j)
                print('++add: '+str(newList))
            totalEID += selectEIDList2
            totalEID = list(set(totalEID))
        totalEID = list(set(totalEID))
        #print('selectEIDList2 len is '+str(len(selectEIDList2)))
        #print('now len is '+str(len(selectEIDCompleted)))
        print('===========completed: '+name)
        print('now totleEID len is '+str(len(totalEID)))
        totalEID = list(set(totalEID))
        constructSeriz(totalEID_new_pickle,totalEID)
    print('total totleEID len is '+str(len(totalEID)))
    totalEID = list(set(totalEID))
    constructSeriz(totalEID_new_pickle,totalEID)
    '''
    DictAuthor = readSeriz(paperAuthorDict_pickle)  # values are wrapped in []
    DictID = readSeriz(personDict_pickle)  # values are wrapped in []
    totalEID = list(set(totalEID))
    print('begin totleEID len is ' + str(len(totalEID)))
    for title in DictAuthor.keys():
        names = DictAuthor[title]
        #print(eid)
        if len(names) == 0:
            continue
        selectEIDList1 = []
        selectEIDList2 = [DictID[na] for na in names]  # frontier: ids to query next round
        selectEIDCompleted = []  # ids already queried
        # NOTE(review): range(1) caps the expansion at ONE hop; raise it to
        # expand further
        for i in range(1):
            if len(selectEIDList2) == 0:
                break
            selectEIDList1 = selectEIDList2[:]
            #print(selectEIDList1)
            selectEIDList2 = []
            for j in selectEIDList1:
                print('--query: ' + str(j))
                selectSQL = 'select distinct eid from tmp_paper where paperid in (select distinct paperid from tmp_paper where eid=' + str(
                    j) + ' ) limit 100'
                newEidList = getResultList(selectSQL, 'eid', cur)
                newList = 0
                for kid in newEidList:
                    if (not kid in selectEIDCompleted) and (
                            not kid in totalEID) and (not kid in selectEIDList2):
                        selectEIDList2.append(kid)
                        newList += 1
                        # cap the additions per queried id at ~300
                        if newList > 300:
                            break
                selectEIDCompleted.append(j)
                print('++add: ' + str(newList))
            totalEID += selectEIDList2
            totalEID = list(set(totalEID))
        totalEID = list(set(totalEID))
        #print('selectEIDList2 len is '+str(len(selectEIDList2)))
        #print('now len is '+str(len(selectEIDCompleted)))
        print('===========completed: ' + title)
        print('now totleEID len is ' + str(len(totalEID)))
        totalEID = list(set(totalEID))
        # checkpoint after every title so progress survives interruption
        constructSeriz(totalEID_new_pickle, totalEID)
        #break
    print('total totleEID len is ' + str(len(totalEID)))
    totalEID = list(set(totalEID))
    constructSeriz(totalEID_new_pickle, totalEID)
def indexEID():  # map each author's institute to the matching id in dlurl1
    """Resolve paper authors and reviewers to expert ids (eids).

    Tries name+institution against experience1/dlurl1 first, then falls
    back to name alone; unresolved reviewers get the sentinel -1.
    Serializes totalEID, personDict, paperAuthorDict and reviewer_ID.

    NOTE(review): SQL is built by string concatenation from names and
    institutions — injection-prone and breaks on quotes in names; should
    move to parameterized queries (flagged, not changed here).
    """
    paperDict = readSeriz(paperDict_pickle)
    reviewerDict = readSeriz(reviewerDict_pickle)
    reviewer_ID = {}
    totalEID = []
    personDict = {}  # resolved people: name -> eid
    paperAuthorDict = {}
    # ---- paper part ------------
    #if False:
    for title in paperDict.keys():
        #title = ''
        authorsInstitu = paperDict[title]
        #print(authorsInstitu)
        paperAuthorDict[title] = []
        for ai in authorsInstitu:
            name = ai[0]
            # NOTE(review): an already-resolved author is skipped entirely,
            # so they are never appended to paperAuthorDict[title] for the
            # CURRENT paper — confirm this is intended.
            if name in personDict.keys():
                continue
            Institu = ai[1]
            # first attempt: match on name AND institution
            eidList = getResultList(
                'select distinct eid from experience1 where eid in (select id from dlurl1 where name like "'
                + cleanName(name) + '") and institution like "' + Institu +
                '" limit 1', 'eid', cur)
            if len(eidList) > 0:
                personDict[name] = eidList[0]
                totalEID += eidList
                paperAuthorDict[title].append(name)
                print('comppleted: ' + name)
            else:
                # fallback: match on name alone
                eidList = getResultList(
                    'select id from dlurl1 where name like "' + cleanName(name)
                    + '" limit 1', 'id', cur)
                if len(eidList) > 0:
                    personDict[name] = eidList[0]
                    totalEID += eidList
                    print('comppleted: ' + name)
                    paperAuthorDict[title].append(name)
                else:
                    print('error!!!!!!1 ' + name)
        print('-----------completed:' + title)
        #break
    totalEID = list(set(totalEID))
    constructSeriz(totalEID_pickle, totalEID)
    constructSeriz(personDict_pickle, personDict)
    constructSeriz(paperAuthorDict_pickle, paperAuthorDict)
    print('completed Paper')
    # ---- reviewer part ----
    for name in reviewerDict.keys():
        #print(reviewerDict[name])
        # NOTE(review): extractReviewer stores plain strings, so [0] here
        # yields only the FIRST CHARACTER of the institution — confirm
        # whether values are actually lists at this point.
        Institu = reviewerDict[name][0]
        eidList = getResultList(
            'select distinct eid from experience1 where eid in (select id from dlurl1 where name like "'
            + cleanName(name) + '") and institution like "' + Institu +
            '" limit 1', 'eid', cur)
        if len(eidList) > 0:
            reviewer_ID[name] = eidList[0]
            totalEID += eidList
            print('comppleted: ' + name)
        else:
            # fallback: match on name alone
            eidList = getResultList(
                'select id from dlurl1 where name like "' + cleanName(name) +
                '" limit 1', 'id', cur)
            if len(eidList) > 0:
                reviewer_ID[name] = eidList[0]
                totalEID += eidList
                print('comppleted: ' + name)
            else:
                print('error!!!!!!1 ' + name)
                reviewer_ID[name] = -1  # sentinel: unresolved reviewer
        #break
    constructSeriz(totalEID_pickle, totalEID)
    constructSeriz(reviewer_ID_pickle, reviewer_ID)