def compareCoauthor(xID, yID):
    """Coauthor-overlap score between two expert ids.

    Returns -1 when either side has no papers; otherwise the fraction
    (rounded to 4 places) of the shorter coauthor list that also appears in
    the longer one, skipping xID/yID themselves on both sides.
    """
    x_rows = getResult(
        'select eid from paper where paperid in (select paperid from paper where eid='
        + str(xID) + ')', cur)
    y_rows = getResult(
        'select eid from paper where paperid in (select paperid from paper where eid='
        + str(yID) + ')', cur)
    if len(x_rows) * len(y_rows) == 0:
        return -1
    short, long_ = compareLen(x_rows, y_rows)
    self_ids = (xID, yID)
    hits = 0
    for row in short:
        if row['eid'] in self_ids:  # skip the compared authors themselves
            continue
        if any(other['eid'] == row['eid']
               for other in long_ if other['eid'] not in self_ids):
            hits += 1
    # Denominator is the full shorter list, self entries included,
    # matching the original behaviour.
    return round(hits / len(short), 4)
def compareSame(xID, yID):
    """Score the similarity of two author records whose names may match.

    Looks up both names; when they are essentially different
    (nameSameOrNot returns 0) an empty dict is returned.  Otherwise a dict
    with keys nameRate/paper/coauthor/institu/topic is produced, one score
    per comparison dimension (the DB table stores these as 0/1 columns).
    """
    xName = getResult('select name from dlurl1 where id=' + str(xID), cur)[0]['name']
    yName = getResult('select name from dlurl1 where id=' + str(yID), cur)[0]['name']
    rate = nameSameOrNot(xName, yName)
    if rate == 0:
        return {}  # names too different — nothing worth comparing
    return {
        'nameRate': rate,
        'paper': comparePaper(xID, yID),
        'coauthor': compareCoauthor(xID, yID),
        'institu': compareInstit(xID, yID),
        'topic': compareTopic(xID, yID),
    }
def countInstitu2():
    """For each pair in namecompare, score institution overlap for every
    combination of their name2name ids and insert into name2compare."""
    pairs = getResult('select id,xid,yid from namecompare', cur)
    print('read completed')
    for pair in pairs:
        rate = str(pair['nameRate'])
        tag = str(int(float(rate) * 100))  # nameRate as an integer percent
        xs = getResult('select id,yid from name2name where gid=' + str(pair['xid']), cur)
        ys = getResult('select id,yid from name2name where gid=' + str(pair['yid']), cur)
        for xr in xs:
            for yr in ys:
                score = compareInstit(xr['yid'], yr['yid'])
                cur.execute(
                    'insert into name2compare (xid,yid,nameRate,tag,institu) values ('
                    + str(xr['yid']) + ',' + str(yr['yid']) + ',' + rate + ','
                    + tag + ',' + str(score) + ')')
                conn.commit()
                print('completed: ' + str(xr['yid']) + ' ' + str(yr['yid']))
        print('completedID: ' + str(pair['id']))
def comparePaper2(xID, yID):
    """Compare two authors' papers by DOI or (year, title).

    Returns -1 when either author has no papers; otherwise the fraction of
    the shorter paper list that matches some entry of the longer one.
    """
    xResult = getResult('select title,doi,time from paper where eid=' + str(xID), cur)
    yResult = getResult('select title,doi,time from paper where eid=' + str(yID), cur)
    if len(xResult) * len(yResult) == 0:
        return -1
    sResult, lResult = compareLen(xResult, yResult)
    single = 0
    for sR in sResult:
        for lR in lResult:
            # Same DOI, or same year plus same title, counts as the same paper.
            # BUG FIX: the original tested sR['title'] == ['title'] (a literal
            # one-element list) instead of lR['title'], so the year/title
            # branch could never match.
            if sR['doi'].strip() == lR['doi'].strip():
                same = True
            elif int(sR['time']) == int(lR['time']) and sR['title'] == lR['title']:
                same = True
            else:
                same = False
            if same:
                single += 1
                break  # matched — no need to scan the rest of the longer list
    return single / len(sResult)
def cleanPaperInstituNull():
    """Delete experts that have no paper rows or no experience rows,
    purging their rows from all related tables as well."""
    selectResult = getResult('select distinct id from dlurl1', cur)
    total = len(selectResult)
    for i, sr in enumerate(selectResult, 1):
        eid = str(sr['id'])
        progress = str(round(i / total, 3))
        if not getResult('select id from paper where eid=' + eid, cur):
            # No papers at all: remove the expert everywhere.
            for sql in ('delete from dlurl1 where id=' + eid,
                        'delete from experience1 where eid=' + eid,
                        'delete from topic where eid=' + eid):
                cur.execute(sql)
                conn.commit()
            print('deleted: ' + progress + ' || id:' + eid)
            continue
        if not getResult('select id from experience1 where eid=' + eid, cur):
            # No experience rows: remove the expert everywhere.
            for sql in ('delete from dlurl1 where id=' + eid,
                        'delete from paper where eid=' + eid,
                        'delete from topic where eid=' + eid):
                cur.execute(sql)
                conn.commit()
            print('deleted: ' + progress + ' || id:' + eid)
            continue
        print('completed check: ' + progress + ' || id:' + eid)
def cleanPaper():
    """Assign a shared paperid to duplicate paper rows of each expert.

    For every expert id, rows with an identical DOI, or identical
    (time, title), are considered the same paper and receive the first
    matching row's index as their paperid.

    Fixes over the original:
      * ``pidList`` is now created (and reset) per expert — it was never
        initialized, which raised NameError on first use;
      * both inner SELECTs were missing the table name (``from where id=``);
      * ``pidList[..][1] == True`` comparisons that were meant to be
        assignments now actually set the already-matched flag;
      * the row for index i is fetched once, not once per inner iteration;
      * ``cur.close`` is now called instead of merely referenced.
    """
    conn, cur = getCursor()
    eidList = getResultList('select id from dlurl1', 'id', cur)
    for eid in eidList:
        idPidList = getResult('select id from paper where eid=' + str(eid), cur)
        # Each entry: [paper row id, already-matched flag].
        pidList = [[ip['id'], False] for ip in idPidList]
        print('start ' + str(eid))
        for i in range(len(pidList)):
            # Give row i its own index as the (tentative) paperid.
            cur.execute('update paper set paperid=' + str(i) +
                        ' where id=' + str(pidList[i][0]))
            conn.commit()
            if pidList[i][1]:
                continue  # already merged into an earlier paper
            iR = getResult('select doi,time,title from paper where id=' +
                           str(pidList[i][0]), cur)
            for j in range(i + 1, len(pidList)):
                if pidList[j][1]:
                    continue
                jR = getResult('select doi,time,title from paper where id=' +
                               str(pidList[j][0]), cur)
                same_doi = iR[0]['doi'] == jR[0]['doi']
                same_year_title = (iR[0]['time'] == jR[0]['time'] and
                                   iR[0]['title'] == jR[0]['title'])
                if same_doi or same_year_title:
                    pidList[j][1] = True
                    cur.execute('update paper set paperid=' + str(i) +
                                ' where id=' + str(pidList[j][0]))
                    conn.commit()
            pidList[i][1] = True
    cur.close()
    conn.close()
def buidPaperNet():
    """Build the coauthor (paper) network among the experts in ``idList``.

    Writes papernet rows (xid, yid, perid, year), fills the adjacency
    matrices, and serializes the matrices plus a networkx graph.

    Fixes over the original:
      * the per-author SELECT used ``str()`` (empty string) instead of the
        expert id, so every author queried ``eid=`` with no value;
      * the INSERT statement was built but never executed.
    """
    paperNet = readSeriz(expertNet_pickle)      # template adjacency matrix
    paperNetYear = readSeriz(expertNet_pickle)  # template adjacency matrix
    paperGraph = nx.Graph()
    # Pre-read each author's {paper row id: year} so the time condition can
    # be applied later without re-querying.
    totalSet = []
    for i in range(len(idList)):
        xResult = getResult('select id,time from paper where eid=' + str(idList[i]), cur)
        xDict = {}
        for x in xResult:
            xDict[x['id']] = x['time']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            coauthorList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(coauthorList) > 0:
                # Persist every shared paper for this pair.
                for cid in coauthorList:
                    year = totalSet[i][0][cid]
                    insertSQL = ('insert into papernet (xid,yid,perid,year) values('
                                 + str(idList[i]) + ',' + str(idList[j]) + ','
                                 + str(cid) + ',' + str(year) + ')')
                    cur.execute(insertSQL)
                    conn.commit()
                paperNet[i][j] = len(coauthorList)
                paperNetYear[i][j] = coauthorList
                paperGraph.add_edge(i, j, coauthoryear=coauthorList)
    constructSeriz(paperNet_pickle, paperNet)
    constructSeriz(paperNetYear_pickle, paperNetYear)
    constructSeriz(paperGraph_pickle, paperGraph)
def cleanRedunOthers():
    """Re-check institution similarity for name-duplicate candidate pairs
    (tag=60, id>74331, batched) and store the score per pair id."""
    pairSQL = 'select id,xid,yid from new_table where tag=60 and id>74331 limit 30000'
    valueSet = {}
    for xy in getResult(pairSQL, cur):
        print('now compare id: ' + str(xy['id']))
        # Institution comparison; pairs scoring 0 can be re-checked later by
        # name containment.
        valueSet['institu'] = str(compareInstit(xy['xid'], xy['yid']))
        valueSet['id'] = str(xy['id'])
        insertMySQLInstitu(valueSet)
        print(str(valueSet['institu']))
def cleanTopic():
    """Lemmatize every topic string with WordNet morphy and write it back.

    ``wn.morphy`` returns None for words it cannot reduce; the original
    relied on ``None + ' '`` raising TypeError inside a broad
    ``except Exception`` to fall back to the raw token.  An explicit None
    check keeps the same behaviour without swallowing unrelated errors.
    """
    selectResult = getResult('select id,topic from topic', cur)
    print('read completed')
    total = len(selectResult)
    i = 0
    for sr in selectResult:
        i += 1
        newTopic = ''
        for t in sr['topic'].split():
            nt = wn.morphy(t)
            # Keep the original word when morphy has no lemma for it.
            newTopic += (nt if nt is not None else t) + ' '
        updateSQL = 'update topic set topic="' + newTopic.strip(
        ) + '" where id=' + str(sr['id'])
        print(updateSQL)
        cur.execute(updateSQL)
        conn.commit()
        print('completed: ' + str(round(i / total, 3)) + ' || id:' + str(sr['id']))
def mainFunction():
    """Seed the UA/IP queues, launch five page workers, then enqueue every
    dl row for processing.

    NOTE(review): another ``mainFunction`` is defined later in this file;
    in a single module the later definition shadows this one.
    """
    http, uag = getHttpUa()
    for ua in uag:
        uaQueue.put(ua)
    for ip in http:
        ipQueue.put(ip)
    for _ in range(5):
        worker = pageWorker(uaQueue, ipQueue, dlQueue)
        worker.daemon = True
        worker.start()
    conn, cur = getCursor()
    for dl in getResult(selectSQL, cur):
        dlQueue.put(dl)
def findPage():
    """Fetch every stored colleague-list page and persist the name links
    extracted from it."""
    http, ua = getHttpUa()
    conn, cur = getCursor()
    for dl in getResult(sltCollNotNull, cur):
        # Occasionally rotate proxy and user-agent, then wait a random
        # interval to pace the crawl.
        if ChangeOrNot():
            editeProxies(http)
            editeHeader(ua)
        time.sleep(random.randint(1, 12))
        html = str(getPage(dl['colleage']))  # fetch the stored URL
        if html != ' ':  # getPage signals failure with a single space
            for nl in analysisPage(html):
                addInfo(conn, cur, nl)
        print('Now is ' + str(dl['id']))
    cur.close()
    conn.close()
def prepareNULL():
    """Bucket DOI-less paper rows by (year, title class) and pickle each
    non-empty bucket for later duplicate matching."""
    nullDOI = readSeriz(nullDOI_pickle)
    # 60 year slots x 27 title classes (assignNum buckets).
    yearTitle = [[[] for _ in range(27)] for _ in range(60)]
    yearList = []
    num = 0
    for nid in nullDOI:
        sr = getResult('select time,title from paper where id=' + str(nid), cur)[0]
        # Rows without both a title and a positive year are unusable.
        if not (len(sr['title']) > 0 and sr['time'] > 0):
            continue
        if sr['time'] not in yearList:
            yearList.append(sr['time'])
        yidx = yearList.index(sr['time'])
        nidx = assignNum(sr['title'])
        yearTitle[yidx][nidx].append((nid, sr['time'], sr['title']))
        print('now is ' + str(num) + ' id is ' + str(nid))
        num += 1
    print('begin saving')
    constructSeriz(yearList_pickle_null, yearList)
    for i in range(len(yearList)):
        for j in range(27):
            if len(yearTitle[i][j]) > 0:
                path = idyeartitle_path_null + str(i) + '_' + str(j) + '.pickle'
                constructSeriz(path, yearTitle[i][j])
def mainFunction():
    """Start analysis, page and mysql workers, and enqueue every pending
    download URL.

    NOTE(review): this redefines the earlier ``mainFunction`` in this file;
    worker-constructor argument order also differs between the two —
    confirm which is current.
    """
    http, uag = getHttpUa()
    for ip in http:
        ipQueue.put(ip)
    for ua in uag:
        uaQueue.put(ua)
    for _ in range(1):
        worker = analysisWorker(htmlQueue, infoQueue)
        worker.daemon = True
        worker.start()
    print('ok1')
    for _ in range(4):
        worker = pageWorker(ipQueue, uaQueue, dlQueue, htmlQueue)
        worker.daemon = True
        worker.start()
    print('ok2')
    conn, cur = getCursor()
    for dl in getResult(sltDLNotCom, cur):  # pending URL rows
        dlQueue.put(dl)
    cur.close()
    conn.close()
    print('ok3')
    for _ in range(1):
        worker = mysqlWorker(infoQueue)
        worker.daemon = True
        worker.start()
def prepareInstitution():
    """Build the institution lookup table and link experience rows to it.

    Inserts each distinct institution name with an explicit id (the target
    table must NOT auto-increment), then stores that id in experience1.tem.

    Improvement: the original performed a linear ``in`` test plus
    ``list.index`` for every experience row (O(n^2) overall); a
    name-to-index dict makes the linking pass O(n).
    """
    conn, cur = getCursor()
    institResult = getResult('select institution,id from experience1', cur)
    print('get ready')
    institList = list({tr['institution'] for tr in institResult})
    print('begin insert')
    for i in range(len(institList)):
        # NOTE(review): column name 'instutition' (sic) matches the existing
        # DB schema — do not "fix" the spelling without altering the table.
        insertSQL = 'insert into institution (id, instutition) values (' + str(
            i) + ',"' + institList[i] + '")'
        cur.execute(insertSQL)
        conn.commit()
    print('complete1')
    position = {name: i for i, name in enumerate(institList)}
    for ir in institResult:
        idx = position.get(ir['institution'])
        if idx is not None:
            cur.execute('update experience1 set tem=' + str(idx) +
                        ' where id=' + str(ir['id']))
            conn.commit()
    print('complete2')
    cur.close()
    conn.close()
def buildTopic():
    """Build the topic-similarity network (topicnet: xid, yid, score).

    Fixes over the original:
      * the INSERT referenced undefined names ``xid``/``yid`` — the actual
        expert ids are now interpolated;
      * ``topicGraph(*(i,j), similar=score)`` called the Graph object
        itself — now ``add_edge`` is used;
      * the SELECT fetched only ``other`` but the loop read ``x['num']`` —
        the ``num`` column is now selected too.
    """
    topicNet = readSeriz(expertNet_pickle)
    topicNetMore = readSeriz(expertNet_pickle)
    topicGraph = nx.Graph()
    # Pre-read each expert's {topic id ('other'): weight ('num')} mapping.
    totalSet = []
    for i in range(len(idList)):
        xResult = getResult('select other,num from topic where eid=' + str(idList[i]), cur)
        xDict = {}
        for x in xResult:
            xDict[x['other']] = x['num']
        totalSet.append((xDict, xDict.keys()))
    for i in range(len(idList)):
        for j in range(i + 1, len(idList)):
            topicList = list(set(totalSet[i][1]).intersection(totalSet[j][1]))
            if len(topicList) > 0:
                score = calculateTopic(topicList)
                insertSQL = ('insert into topicnet (xid,yid,score) values('
                             + str(idList[i]) + ',' + str(idList[j]) + ','
                             + str(score) + ')')
                cur.execute(insertSQL)
                conn.commit()
                topicNet[i][j] = score  # keep a matrix copy as well
                topicGraph.add_edge(i, j, similar=score)
                topicNetMore[i][j] = topicList
    constructSeriz(topicNet_pickle, topicNet)
    constructSeriz(topicGraph_pickle, topicGraph)
    constructSeriz(topicNetMore_pickle, topicNetMore)
def linkP2D():
    """Set tem=3 on every dlurl1 row selected by the module-level
    ``selectSQL``.

    NOTE(review): another ``linkP2D`` is defined later in this file; the
    later definition shadows this one.
    """
    conn, cur = getCursor()
    for re in getResult(selectSQL, cur):
        cur.execute('update dlurl1 set tem=3 where id=' + str(re['eid']))
        conn.commit()
        print('complete: ' + str(re['eid']))
def cleanUrlMySQL():
    """Re-parse each stored URL record and flag successfully processed rows
    with tem=1."""
    conn, cur = getCursor()  # DB connection and cursor
    for dl in getResult(sltCollNotNull, cur):
        url, userid = analysisRecord(dl['url'])
        if addInfo(url, userid, dl['id'], cur, conn) == 1:
            # Mark the row as handled.
            cur.execute('update dlurl1 set tem=1 where id=' + str(dl['id']))
            conn.commit()
        print('Now is ' + str(dl['id']))
def comparePaper(xID, yID):
    """Fraction (4 decimals) of the shorter paperid list shared with the
    longer one; -1 when either author has no papers."""
    x_rows = getResult('select paperid from paper where eid=' + str(xID), cur)
    y_rows = getResult('select paperid from paper where eid=' + str(yID), cur)
    if len(x_rows) * len(y_rows) == 0:
        return -1
    short, long_ = compareLen(x_rows, y_rows)
    hits = 0
    for row in short:
        # A shared paperid means the same (deduplicated) paper.
        if any(row['paperid'] == other['paperid'] for other in long_):
            hits += 1
    return round(hits / len(short), 4)
def compareInstit2(xID, yID):
    """Fraction of the shorter institution-id (tem) list that also occurs
    in the longer one; -1 distinguishes the either-side-empty case."""
    rows1 = getResult('select tem from experience1 where eid=' + str(xID), cur)
    rows2 = getResult('select tem from experience1 where eid=' + str(yID), cur)
    if len(rows1) * len(rows2) == 0:
        return -1
    short, long_ = compareLen(rows1, rows2)
    hits = 0
    for row in short:
        # Each row holds only 'tem', so whole-row equality compares ids.
        if any(row == other for other in long_):
            hits += 1
    return hits / len(short)
def compareInstit(xID, yID):
    """Compare two experts' institution name lists by substring containment.

    Returns -1 when either expert has no experience rows; otherwise counts
    how many entries of the shorter list are contained in (or contain) an
    entry of the longer list.

    NOTE(review): the final ratio divides by len(lResult) (the longer
    list), while the sibling compare* functions divide by len(sResult).
    Since the count can never exceed len(sResult), this understates the
    ratio for unequal list lengths — confirm whether that is intentional.
    """
    xCheckSQL = 'select institution from experience1 where eid=' + str(xID)
    yCheckSQL = 'select institution from experience1 where eid=' + str(yID)
    xResult = getResult(xCheckSQL, cur)
    yResult = getResult(yCheckSQL, cur)
    if len(xResult) * len(yResult) == 0:
        return -1
    sResult, lResult = compareLen(xResult, yResult)
    single = 0
    for sR in sResult:
        for lR in lResult:
            flag = False
            # compareLen orders the two strings shortest-first.
            shortI, longI = compareLen(sR['institution'], lR['institution'])
            if shortI in longI:  # substring containment counts as a match
                flag = True
            if flag:
                single += 1
                break  # matched — stop scanning the longer list
    return single / len(lResult)
def compareTopic(xID, yID):
    """Fraction of the shorter topic list matching the longer one, where
    substring containment counts as a match; -1 when either side is empty."""
    x_rows = getResult('select topic from topic where eid=' + str(xID), cur)
    y_rows = getResult('select topic from topic where eid=' + str(yID), cur)
    if len(x_rows) * len(y_rows) == 0:
        return -1
    short, long_ = compareLen(x_rows, y_rows)
    hits = 0
    for row in short:
        for other in long_:
            s_topic, l_topic = compareLen(row['topic'], other['topic'])
            if s_topic in l_topic:  # containment counts as a match
                hits += 1
                break  # stop at the first match
    return hits / len(short)
def countInstituXXXX():
    """Score institution overlap for every name2name pair (skipping
    self-pairs) and insert the result into name2compare with fixed
    nameRate=1.0 / tag=100."""
    rows = getResult('select id,gid,yid from name2name', cur)
    print('read completed')
    for row in rows:
        if row['gid'] == row['yid']:
            continue  # same record on both sides — nothing to compare
        score = compareInstit(row['gid'], row['yid'])
        cur.execute(
            'insert into name2compare (xid,yid,nameRate,tag,institu) values ('
            + str(row['gid']) + ',' + str(row['yid']) + ',1.0,100,'
            + str(score) + ')')
        conn.commit()
        print('completed: ' + str(row['id']))
def addInstitution(instInfo, cur, conn):
    """Attach every institution in *instInfo* to the most recently inserted
    expert (highest eid)."""
    insertSQL = ''
    latest = getResult('select * from expert order by eid desc limit 1', cur)
    eid = latest[0]['eid']
    for inst in instInfo:
        try:
            insertSQL = 'insert into experience (eid,institution) values(' + str(
                eid) + ', "' + inst + '")'
            cur.execute(insertSQL)
            conn.commit()
        except Exception:
            # Best-effort insert: report the failing statement and continue.
            print('error:' + insertSQL)
def linkP2D():
    """Set status=2 on every dlurl1 row selected by the module-level
    ``selectSQL``; failures are logged and skipped."""
    conn, cur = getCursor()
    for re in getResult(selectSQL, cur):
        eid = str(re['eid'])
        try:
            cur.execute('update dlurl1 set status=2 where id=' + eid)
            conn.commit()
            print('complete: ' + eid)
        except Exception:
            # Best-effort update: report the failing id and keep going.
            print('error: ' + eid)
    cur.close()
    conn.close()
def preparePublication():
    """Write each publication's ordinal position into paper.paperid for all
    paper rows sharing that pid."""
    conn, cur = getCursor()
    paperResult = getResult('select pid from publication', cur)
    print('read completed')
    for i, row in enumerate(paperResult):
        print(str(row['pid']))
        cur.execute('update paper set paperid=' + str(i) +
                    ' where pid="' + row['pid'] + '"')
        conn.commit()
        print('completed: ' + str(i))
    cur.close()
    conn.close()
def combine():
    """Merge expert pairs that pass the currently active similarity filter
    (earlier filter variants were tried and abandoned)."""
    selectSQL = 'select id,xid,yid from name2compare where institu<0.1 and institu>-1 and paper>=0 and coauthor>0'
    selectResult = getResult(selectSQL, cur)
    total = len(selectResult)
    for i, sr in enumerate(selectResult, 1):
        progress = str(round(i / total, 3))
        if updateCombine(sr['xid'], sr['yid']):
            print('completed: ' + progress + ' || id:' + str(sr['id']))
        else:
            print('somewhere error: ' + progress + ' || id:' + str(sr['id']))
def getID(html):
    """Resolve the profile URL embedded in *html* to a single dlurl1 id.

    Returns -1 when the CDATA marker is absent or the URL does not map to
    exactly one unfinished (status<>2) row.

    NOTE(review): another ``getID`` is defined later in this file; the
    later definition shadows this one.
    """
    eid = -1
    marker = '<![CDATA['
    start = html.find(marker)
    end = html.find(']]></fullpath>')
    if start > 0:
        subjectURL = html[(start + len(marker)):end]
        url, userid = extractUserID(subjectURL)  # split URL and user id apart
        result = getResult(
            'select t.id from (select id,url from dlurl1 where status<>2) t where t.url="'
            + url + '"', cur)
        if len(result) == 1:
            eid = int(result[0]['id'])
        else:
            print('exist')  # zero or multiple matches
    return eid
def getID(html):
    """Resolve the CDATA fullpath in *html* to the list of matching
    unfinished (status<>2) dlurl1 ids.

    NOTE(review): ``eid`` is only bound inside the ``start > 0`` branch —
    as in the original, a page without the marker raises NameError here.
    """
    marker = '<![CDATA['
    start = html.find(marker)
    end = html.find(']]></fullpath>')
    if start > 0:
        subjectURL = html[(start + len(marker)):end]
        result = getResult(
            'select tem.id from (select id,subject from dlurl1 where status<>2) tem where tem.subject="'
            + subjectURL + '"', cur)
        eid = [r['id'] for r in result]
    print('len len is ' + str(len(eid)))
    return eid
def analysisNull():
    """Bucket papers that already carry a paperid by (year, title class)
    and pickle every bucket — empty ones included — for later matching."""
    # 100 year slots x 27 title classes (assignNum buckets).
    yearTitle = [[[] for _ in range(27)] for _ in range(100)]
    yearList = []
    selectResult = getResult(
        'select id,time,title from paper where paperid is not null', cur)
    print('read completed, the total is ' + str(len(selectResult)))
    num = 0
    for sr in selectResult:
        # Rows without both a title and a positive year are unusable.
        if not (len(sr['title']) > 0 and sr['time'] > 0):
            continue
        if sr['time'] not in yearList:
            yearList.append(sr['time'])
        yidx = yearList.index(sr['time'])
        nidx = assignNum(sr['title'])
        yearTitle[yidx][nidx].append((sr['id'], sr['time'], sr['title']))
        print('now is ' + str(num) + ' id is ' + str(sr['id']))
        num += 1
    print('begin saving')
    constructSeriz(yearList_pickle, yearList)
    for i in range(len(yearList)):
        for j in range(27):
            path = idyeartitle_path + str(i) + '_' + str(j) + '.pickle'
            # An empty bucket is simply [], so it serializes the same way.
            constructSeriz(path, yearTitle[i][j])
def dlInfo(html, soup):
    """Match the CDATA fullpath URL to a unique dlurl1 row, extract its
    advisor-CSV URL and append [id, name, csv] to the output file."""
    marker = '<![CDATA['
    start = html.find(marker)
    end = html.find(']]></fullpath>')
    if start > 0:
        url, userid = extractUserID(html[(start + len(marker)):end])
        result = getResult(
            'select t.id,t.name from (select id,name,url from dlurl1 where userid=' + str(
                userid) + ') t where t.url="' + url + '"', cur)
        if len(result) == 1:
            rec_id = int(result[0]['id'])  # renamed: was shadowing builtin id
            name = result[0]['name']
            advisorcsv = getCsvUrl(soup, html, rec_id)
            writeMetrx(writePath, [[rec_id, name, advisorcsv]])  # expects a 2-D list
            print('complete:' + str(rec_id))
        else:
            print('error or exist')