def getYear_DBIS_filetered(self, paperDict, PaperRawdict, filterDict):
    """Look up publication years in the DBIS DB for papers not in filterDict.

    Titles matching a row (via self.selectStr_easy) are written with their
    year to paperwithyear_dbis.csv; unmatched titles go to
    paperwithnoresults.csv.  Progress is printed per paper.

    NOTE(review): the query is assembled with % interpolation after
    self.connect.escape(); assumes escape() returns a fully quoted literal.
    """
    outputlist = []
    outputlist_null = []
    counter = 0
    counter_total = 0
    for key in paperDict:
        if key in filterDict:
            continue
        papername = PaperRawdict[key]
        # Strip one pair of surrounding double quotes.  Length guard fixes
        # the original's IndexError on an empty title (papername[0]).
        if len(papername) >= 2 and papername[0] == '"' and papername[-1] == '"':
            papername = papername[1:-1]
        papername = self.connect.escape(papername)
        self.cursor.execute(self.selectStr_easy % (papername))
        resultlist = self.cursor.fetchall()
        if len(resultlist) > 0:
            print(resultlist)
            year = resultlist[0][1]  # year is the second column of the first match
            outputlist.append([key, year])
            counter += 1
        else:
            outputlist_null.append([key, papername])
        counter_total += 1
        print(str(counter_total) + '/' + str(len(paperDict) - len(filterDict)) + ' ' + papername)
    print(counter)
    util.write_csv_inlist('paperwithyear_dbis.csv', outputlist)
    util.write_csv_inlist('paperwithnoresults.csv', outputlist_null)
def output(self):
    """Dump per-year conference paper counts to statistic_conf_byyear.csv.

    Each row is [year, conf key, self.confDict[conf], count for that year].
    """
    rows = []
    for year, confs_this_year in self.yearDict.items():
        for conf, paper_count in confs_this_year.items():
            rows.append([year, conf, self.confDict[conf], paper_count])
    util.write_csv_inlist('statistic_conf_byyear.csv', rows)
def output(self, keyList, labels, name):
    """Write one [key, author name, cluster label] row per entry of keyList
    to ./authorCluster_<name>.csv, pairing keyList[i] with labels[i]."""
    rows = []
    for idx, author_key in enumerate(keyList):
        cluster = labels[idx]
        rows.append([author_key, self.authorObjDict[author_key].name, cluster])
    util.write_csv_inlist('./authorCluster_' + name + '.csv', rows)
def selectData_onekey(self, outputfile, key):
    """Select every paper whose conference matches `key` (SQL LIKE) and
    dump the rows to `outputfile`, utf-8-encoding the two text columns.

    NOTE(review): the statement is built by % interpolation after
    self.connect.escape(); verify escape() yields a fully quoted literal.
    """
    selectStr = """SELECT * FROM paper WHERE conference like %s """
    escaped_key = self.connect.escape(key)
    self.cursor.execute(selectStr % (escaped_key,))
    rows = []
    for record in self.cursor.fetchall():
        rows.append([record[0].encode('utf-8'), record[1], record[2].encode('utf-8')])
    util.write_csv_inlist(outputfile, rows)
def findDBIS_paper_withfilter(self, paper_confdict, paperdict, DBIS_cleaned, myfilter=None):
    """Collect papers whose conference is in DBIS_cleaned, skipping keys
    present in myfilter, and write [key, title] rows to notmatched.csv.

    Fix: the original evaluated `key not in myfilter` even with the
    default myfilter=None, raising TypeError; None now means "no filter".
    Prints the total number of DBIS-conference papers seen.
    """
    if myfilter is None:
        myfilter = ()  # empty container: nothing is filtered out
    DBIS_paperlist = list()
    counter = 0
    for key in paperdict:
        confkey = paper_confdict[key]
        if confkey in DBIS_cleaned:
            counter += 1
            if key not in myfilter:
                DBIS_paperlist.append([key, paperdict[key]])
    print(counter)
    util.write_csv_inlist('notmatched.csv', DBIS_paperlist)
def countAuthors(self):
    """Count how many focused papers each author appears on.

    Returns a mapping author key -> paper count (a collections.Counter,
    a dict subclass, so callers treating it as a dict are unaffected) and
    dumps it to authorCount.csv via util.dict2list.
    """
    print(len(self.paperDict_obj))
    # Counter replaces the manual "first occurrence = 1, else += 1"
    # bookkeeping of the original.
    focusedAuthors = collections.Counter()
    for key in self.paperDict_obj:
        paper = self.paperDict_obj[key]
        focusedAuthors.update(paper.authors)
    util.write_csv_inlist('authorCount.csv', util.dict2list(focusedAuthors))
    return focusedAuthors
def clustering(self, k):
    """Run k-means (k clusters, fixed random_state=0) over self.F and write
    [author key, author name, year key, cluster] rows, one per author per
    year snapshot, to ./temporal/authorCluster.csv.

    self.F rows are assumed to be laid out year-by-year in the iteration
    order of self.nodeIndexDict, hence the running offset.
    """
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(self.F)
    rows = []
    offset = 0
    for year_key in self.nodeIndexDict:
        author_keys = self.nodeIndexDict[year_key]
        for pos, author_key in enumerate(author_keys):
            rows.append([author_key,
                         self.authorObjDict[author_key].name,
                         year_key,
                         labels[offset + pos]])
        offset += len(author_keys)
    util.write_csv_inlist('./temporal/authorCluster.csv', rows)
def filterFeatureName(self, folder):
    """Union the feature names from <folder><year>-featureNames.csv for
    every year in [self.start, self.end], then write them as a single
    comma-joined line (trailing comma kept, matching the legacy format)
    to <folder>out-featureNames.csv.

    Fixes: file handles are now closed deterministically (the original
    never closed them) and the manual `while 1` readline loop is replaced
    by plain file iteration.
    """
    featureNameSet = set()
    for year in range(self.start, self.end + 1):
        # `with` guarantees the handle is closed even if parsing raises.
        with open(folder + str(year) + '-featureNames.csv', 'r') as fp:
            for line in fp:
                for val in line.strip().split(','):
                    featureNameSet.add(val)
    # Preserve the original output shape: every name followed by a comma.
    output = ''
    for key in featureNameSet:
        output += key + ','
    print(len(featureNameSet))
    util.write_csv_inlist(folder + 'out-featureNames.csv', [output])
def getYear_DBIS_notmatched(self, paperDict, notmatchDict):
    """Retry the DBIS year lookup for previously unmatched papers.

    For each key in notmatchDict the (possibly quote-wrapped) title from
    paperDict is escaped and queried via self.selectStr_easy; hits are
    collected as [key, year] and written to paperwithyear_dbis.csv.

    NOTE(review): query built by % interpolation after
    self.connect.escape(); assumes escape() fully quotes the value.
    """
    outputlist = []
    counter = 0
    for key in notmatchDict:
        papername = paperDict[key]
        # Strip one pair of surrounding double quotes.  Length guard fixes
        # the original's IndexError on an empty title (papername[0]).
        if len(papername) >= 2 and papername[0] == '"' and papername[-1] == '"':
            papername = papername[1:-1]
        papername = self.connect.escape(papername)
        # print papername
        self.cursor.execute(self.selectStr_easy % (papername))
        resultlist = self.cursor.fetchall()
        if len(resultlist) > 0:
            print(resultlist)
            year = resultlist[0][1]  # year is the second column of the first match
            outputlist.append([key, year])
            counter += 1
    print(counter)
    util.write_csv_inlist('paperwithyear_dbis.csv', outputlist)
def matchAminerAndDBLP(self, focusedPaperDict, filterDict=None):
    """Match Aminer papers against DBLP by normalised title.

    Titles are HTML-unescaped and passed through util.simplifyStr before
    lookup in self.reverseDict()'s title -> dblp-key map.  Matches go to
    havelist.csv (aminerKey, dblpKey, title); misses go to nolist.csv.
    The running match count is printed on each hit, then totals at the end.
    (filterDict is accepted for interface compatibility but unused.)
    """
    html_parser = HTMLParser.HTMLParser()
    paperTitleDict = self.reverseDict()
    havelist, nolist = [], []
    haveCounter = 0
    for key in focusedPaperDict:
        raw_title = focusedPaperDict[key].title
        normalized = util.simplifyStr(html_parser.unescape(raw_title))
        # print normalized
        if normalized in paperTitleDict:
            havelist.append([paperTitleDict[normalized], key, raw_title])
            haveCounter += 1
            print(haveCounter)
        else:
            nolist.append([key, raw_title])
    util.write_csv_inlist('havelist.csv', havelist, ['aminerKey', 'dblpKey', 'title'])
    util.write_csv_inlist('nolist.csv', nolist, ['aminerKey', 'title'])
    print(haveCounter)
    print(len(focusedPaperDict))
def generateEdge(self):
    """Build, per year snapshot, a time-decayed co-authorship matrix.

    For each year key in self.author_nodes, the directed weight from
    author i to co-author a accumulates exp(-self.rou * (year - paperYear))
    over their shared papers published no later than that year.  Each
    matrix and its node-key list are dumped under ./proces/temporal/.

    Returns (edgeDict, edgeIndexDict): year -> matrix, year -> key list,
    both insertion-ordered.
    """
    edgeDict = collections.OrderedDict()
    edgeIndexDict = collections.OrderedDict()
    for yearkey in self.author_nodes:
        author_node = self.author_nodes[yearkey]
        key_list = list(author_node)
        # O(1) index lookup replaces the original's O(n) `in key_list` +
        # key_list.index() scan per co-author; keys are unique (they come
        # from dict iteration) so the resulting indices are identical.
        index_of = {k: i for i, k in enumerate(key_list)}
        mlength = len(key_list)
        ajmatrix = np.zeros((mlength, mlength))
        for i in range(mlength):
            ikey = key_list[i]
            iauthor = self.authorObjDict[ikey]
            for pkey in iauthor.papers:
                thePaper = self.paperObjDict[pkey]
                theYear = thePaper.year
                if theYear > yearkey:
                    continue  # only papers up to the snapshot year count
                for akey in thePaper.authors:
                    if akey == ikey:
                        continue  # no self-loops
                    akey_index = index_of.get(akey)
                    if akey_index is not None:
                        val = math.exp(-self.rou * (yearkey - theYear))
                        ajmatrix[i][akey_index] = ajmatrix[i][akey_index] + val
                    # ajmatrix[akey_index][i]=ajmatrix[i][akey_index]
        # b = np.nonzero(ajmatrix)
        # print(np.array(b).ndim)
        # NOTE(review): fmt='%d' truncates the float decay weights in the
        # dumped CSV; kept as-is to preserve the existing file format.
        np.savetxt('./proces/temporal/year_' + str(yearkey) + '.csv',
                   ajmatrix, fmt='%d', delimiter=',')
        util.write_csv_inlist(
            './proces/temporal/nodeslist_' + str(yearkey) + '.csv', key_list)
        edgeDict[yearkey] = ajmatrix
        edgeIndexDict[yearkey] = key_list
    return edgeDict, edgeIndexDict
def toCSV(self,filename):
    """Write the rows accumulated in self.outputList to `filename` as CSV."""
    util.write_csv_inlist(filename,self.outputList)
def buildNetwork(self):
    """Rebuild each yearly co-authorship graph with snap, keep only its
    largest weakly-connected component, and return the reduced data.

    For every year key in self.nodeIndexDict a directed snap.TNEANet is
    built from the weight matrix self.edgeDict[key]; its edge lists are
    dumped to <key>.csv / <key>_line.txt / <key>_dw.txt.  The max WCC is
    appended to self.graphList, and its reindexed weight matrix and
    node-key list are returned as (edgeDict_new, edgeIndexDict_new),
    keyed by year.
    """
    edgeDict_new = collections.OrderedDict()
    edgeIndexDict_new = collections.OrderedDict()
    for key in self.nodeIndexDict:
        myGraph = snap.TNEANet.New()
        keyList = self.nodeIndexDict[key]
        length = len(keyList)
        # One graph node per author; the author key is stored as a node
        # attribute so it survives the WCC extraction below.
        for i in range(length):
            theKey = keyList[i]
            nid = myGraph.AddNode(i)
            myGraph.AddStrAttrDatN(nid, theKey, 'key')
        # there is only the first order!!!!!!!!!!!!!!!!!!
        A = self.edgeDict[key]
        # B=np.dot(A,A)
        # B=np.dot(A,np.dot(A,A))
        # B_sim=cosine_similarity(A)
        C = A
        outputList = []
        outputList_line = []
        outputList_dw = []
        # Add both directions for every positive upper-triangle weight.
        # NOTE(review): attribute name 'weigth' is a typo preserved for
        # compatibility with whatever reads these attributes downstream.
        for i in range(length):
            for j in range(i + 1, length):
                if C[i, j] > 0:
                    eid = myGraph.AddEdge(i, j)
                    myGraph.AddFltAttrDatE(eid, A[i, j], 'weigth')
                    eid = myGraph.AddEdge(j, i)
                    myGraph.AddFltAttrDatE(eid, A[j, i], 'weigth')
                    outputList.append([keyList[i], keyList[j], A[i, j]])
                    outputList_line.append(
                        [keyList[j], keyList[i], A[j, i]])
                    outputList_line.append(
                        [keyList[i], keyList[j], A[i, j]])
                    outputList_dw.append([keyList[i], keyList[j]])
                    outputList_dw.append([keyList[j], keyList[i]])
        util.write_csv_inlist(str(key) + '.csv', outputList)
        util.write_csv_inlist(str(key) + '_line.txt', outputList_line)
        util.write_csv_inlist(str(key) + '_dw.txt', outputList_dw)
        print str(key) + '-original: ' + str(
            myGraph.GetEdges()) + ' ' + str(myGraph.GetNodes())
        # Keep only the largest weakly connected component.
        MxWcc = snap.GetMxWcc(myGraph)
        print str(key) + '-mxWcc: ' + str(MxWcc.GetEdges()) + ' ' + str(
            MxWcc.GetNodes())
        # labels = snap.TIntStrH()
        # for NI in MxWcc.Nodes():
        #     labels[NI.GetId()] = str(NI.GetId())
        # snap.DrawGViz(MxWcc, snap.gvlSfdp, './graph/'+str(key)+".gif", " ", labels)
        # Node ids in MxWcc are the positions into the original keyList.
        keyList_new = []
        for node in MxWcc.Nodes():
            keyList_new.append(keyList[int(node.GetId())])
        # Re-extract the weight submatrix restricted to the WCC's nodes,
        # in MxWcc's own node iteration order.
        ajmatrix = np.zeros((MxWcc.GetNodes(), MxWcc.GetNodes()))
        counter_out = 0
        for node_out in MxWcc.Nodes():
            counter_in = 0
            for node_in in MxWcc.Nodes():
                ajmatrix[counter_out, counter_in] = A[int(node_out.GetId()),
                                                      int(node_in.GetId())]
                counter_in += 1
            counter_out += 1
        edgeDict_new[key] = ajmatrix
        edgeIndexDict_new[key] = keyList_new
        self.graphList.append(MxWcc)
    return edgeDict_new, edgeIndexDict_new
def getNodeAttributes(self):
    """Compute per-node structural features for each graph in self.graphList.

    Every node gets, in order: PageRank, HITS hub score, HITS authority
    score, betweenness centrality, closeness centrality, farness
    centrality, and eccentricity (seven values per node).  Returns one
    numpy matrix per graph and also dumps the raw nested lists to
    attributeslist.csv.

    NOTE(review): the snap hash tables (PRankH, NIdHubH, Nodes) are
    assumed to iterate in the same order as UGraph.Nodes(), since
    `counter` indexes attriList positionally -- confirm against snap docs.
    """
    attributeslist = []
    outputList = []
    for UGraph in self.graphList:
        attriList = []
        # One (initially empty) feature row per node.
        for index in range(UGraph.GetNodes()):
            nodelist = []
            attriList.append(nodelist)
        #page rank
        PRankH = snap.TIntFltH()
        snap.GetPageRank(UGraph, PRankH)
        counter = 0
        for item in PRankH:
            attriList[counter].append(PRankH[item])
            counter += 1
        #HIN
        counter = 0
        NIdHubH = snap.TIntFltH()
        NIdAuthH = snap.TIntFltH()
        snap.GetHits(UGraph, NIdHubH, NIdAuthH)
        for item in NIdHubH:
            attriList[counter].append(NIdHubH[item])
            attriList[counter].append(NIdAuthH[item])
            counter += 1
        # Betweenness Centrality
        counter = 0
        Nodes = snap.TIntFltH()
        Edges = snap.TIntPrFltH()
        snap.GetBetweennessCentr(UGraph, Nodes, Edges, 1.0)
        for node in Nodes:
            attriList[counter].append(Nodes[node])
            counter += 1
        # closeness centrality
        counter = 0
        for NI in UGraph.Nodes():
            CloseCentr = snap.GetClosenessCentr(UGraph, NI.GetId())
            attriList[counter].append(CloseCentr)
            counter += 1
        # farness centrality
        counter = 0
        for NI in UGraph.Nodes():
            FarCentr = snap.GetFarnessCentr(UGraph, NI.GetId())
            attriList[counter].append(FarCentr)
            counter += 1
        # node eccentricity
        counter = 0
        for NI in UGraph.Nodes():
            attriList[counter].append(
                snap.GetNodeEcc(UGraph, NI.GetId(), True))
            counter += 1
        atrriMarix = np.array(attriList)
        attributeslist.append(atrriMarix)
        outputList.append(attriList)
        # convert to undirected graph
        # GOut = snap.ConvertGraph(snap.PUNGraph, UGraph)
        # for NI in UGraph.Nodes():
        #     DegCentr = snap.GetDegreeCentr(UGraph, NI.GetId())
        #     print "node: %d centrality: %f" % (NI.GetId(), DegCentr)
    util.write_csv_inlist('attributeslist.csv', outputList)
    return attributeslist