def setUp(self):
    """
      Build the template graph shared by the tests and keep references to its
      nodes (two authors, two conferences, three papers) on the test object.
    """
    self.maxDiff = None

    # Construct template graph for tests
    graph = GraphFactory.createInstance()

    # Put references to graph objects on test object
    self.author = Author(0, 'author')
    self.coauthor = Author(1, 'coauthor')
    self.conference1 = Conference(0, 'conference1')
    self.conference2 = Conference(1, 'conference2')
    self.paper1 = Paper(0, 'paper1')
    self.paper2 = Paper(1, 'paper2')
    self.paper3 = Paper(2, 'paper3')

    # Construct graph. The co-author is registered explicitly like every other
    # node instead of only appearing as an edge endpoint; it is listed last so
    # the node insertion order stays what an implicit add-on-edge would give.
    graph.addNodes([
        self.author,
        self.conference1,
        self.conference2,
        self.paper1,
        self.paper2,
        self.paper3,
        self.coauthor
    ])

    # Authorship: 'author' wrote all three papers, 'coauthor' only paper3
    for paper in (self.paper1, self.paper2, self.paper3):
        graph.addBothEdges(paper, self.author, Authorship())
    graph.addBothEdges(self.paper3, self.coauthor, Authorship())

    # Publication: paper1 & paper2 in conference1, paper3 in conference2
    publications = (
        (self.paper1, self.conference1),
        (self.paper2, self.conference1),
        (self.paper3, self.conference2)
    )
    for paper, conference in publications:
        graph.addBothEdges(paper, conference, Publication())

    # Citations: paper1 -> paper2 is one-way, paper2 <-> paper3 is mutual
    graph.addEdge(self.paper1, self.paper2, Citation())
    graph.addBothEdges(self.paper2, self.paper3, Citation())

    self.templateGraph = graph
    self.metaPathUtility = self._getImplementation()
def testCoAuthorsGraph(self):
    """
      Two papers with the same title published at different conferences: the
      first is written by two co-authors, the second by a third author. Checks
      the graph the data importer builds from this input.
    """

    # Build sample data & expected output
    parsedData = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One', 'Author Three'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [],
            'title': 'Databases',
            'year': 1999
        }
    }

    expectedGraph = GraphFactory.createInstance()

    # Expect unspecified ids to auto-increment: 'Author Three' is listed on
    # paper 0, so it is expected to receive id 1 ahead of 'Author Two'
    firstAuthor = Author(0, 'Author One')
    secondAuthor = Author(2, 'Author Two')
    thirdAuthor = Author(1, 'Author Three')
    firstPaper = Paper(0, 'Databases')
    secondPaper = Paper(1, 'Databases')
    sharedTopic = Topic(0, ['databas'])
    firstConference = Conference(0, 'Conference One')
    secondConference = Conference(1, 'Conference Two')

    expectedNodes = (
        firstAuthor, secondAuthor, thirdAuthor,
        firstPaper, secondPaper,
        sharedTopic,
        firstConference, secondConference
    )
    for node in expectedNodes:
        expectedGraph.addNode(node)

    # Both co-authors are linked to the first paper
    for author, paper in ((firstAuthor, firstPaper), (thirdAuthor, firstPaper), (secondAuthor, secondPaper)):
        expectedGraph.addBothEdges(author, paper, Authorship())

    # Both papers mention the single shared topic
    for paper in (firstPaper, secondPaper):
        expectedGraph.addBothEdges(paper, sharedTopic, Mention())

    for paper, conference in ((firstPaper, firstConference), (secondPaper, secondConference)):
        expectedGraph.addBothEdges(paper, conference, Publication())

    actualGraph = self.dataImporter.buildGraph(parsedData)

    self.assertGraphsEqual(actualGraph, expectedGraph)
def testMutualCitationGraph(self):
    """
      Two single-author papers that reference each other. Checks the graph the
      data importer builds, including the citation edges in both directions.
    """

    # Build sample data & expected output
    parsedData = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [1],
            'title': 'Databases',
            'year': 1999
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [0],
            'title': 'Databases',
            'year': 1999
        }
    }

    expectedGraph = GraphFactory.createInstance()

    # Expect unspecified ids to auto-increment
    firstAuthor = Author(0, 'Author One')
    secondAuthor = Author(1, 'Author Two')
    firstPaper = Paper(0, 'Databases')
    secondPaper = Paper(1, 'Databases')
    sharedTopic = Topic(0, ['databas'])
    firstConference = Conference(0, 'Conference One')
    secondConference = Conference(1, 'Conference Two')

    expectedNodes = (
        firstAuthor, secondAuthor,
        firstPaper, secondPaper,
        sharedTopic,
        firstConference, secondConference
    )
    for node in expectedNodes:
        expectedGraph.addNode(node)

    for author, paper in ((firstAuthor, firstPaper), (secondAuthor, secondPaper)):
        expectedGraph.addBothEdges(author, paper, Authorship())

    for paper in (firstPaper, secondPaper):
        expectedGraph.addBothEdges(paper, sharedTopic, Mention())

    for paper, conference in ((firstPaper, firstConference), (secondPaper, secondConference)):
        expectedGraph.addBothEdges(paper, conference, Publication())

    # Symmetric in this case only! (each paper lists the other as a reference)
    expectedGraph.addBothEdges(firstPaper, secondPaper, Citation())

    actualGraph = self.dataImporter.buildGraph(parsedData)

    self.assertGraphsEqual(actualGraph, expectedGraph)
def testSeparatePapersAuthorsTopicSharedConferenceGraph(self):
    """
      Two papers with different authors and different titles (hence different
      expected topics) that were published at the same conference.
    """

    # Build sample data & expected output
    parsedData = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference One',
            'references': [],
            'title': 'All The Knowledge',
            'year': 1999
        }
    }

    expectedGraph = GraphFactory.createInstance()

    # Expect unspecified ids to auto-increment
    firstAuthor = Author(0, 'Author One')
    secondAuthor = Author(1, 'Author Two')
    firstPaper = Paper(0, 'Databases')
    secondPaper = Paper(1, 'All The Knowledge')
    firstTopic = Topic(0, ['databas'])
    secondTopic = Topic(1, ['knowledg'])
    sharedConference = Conference(0, 'Conference One')

    expectedNodes = (
        firstAuthor, secondAuthor,
        firstPaper, secondPaper,
        firstTopic, secondTopic,
        sharedConference
    )
    for node in expectedNodes:
        expectedGraph.addNode(node)

    for author, paper in ((firstAuthor, firstPaper), (secondAuthor, secondPaper)):
        expectedGraph.addBothEdges(author, paper, Authorship())

    # Each paper mentions only its own topic
    for paper, topic in ((firstPaper, firstTopic), (secondPaper, secondTopic)):
        expectedGraph.addBothEdges(paper, topic, Mention())

    # Both papers share the one conference
    for paper in (firstPaper, secondPaper):
        expectedGraph.addBothEdges(paper, sharedConference, Publication())

    actualGraph = self.dataImporter.buildGraph(parsedData)

    self.assertGraphsEqual(actualGraph, expectedGraph)
def addCitationPaper(citedPaper, citedAuthor):
    """
      Helper method to add a 'citation' paper: a new paper connected only to the
      conference and to the paper it cites. Also increments the citation tally
      recorded for the cited author.
    """
    paperId = SampleGraphUtility.__getNextId()
    newPaper = Paper(paperId, "Citing Paper %d" % paperId)

    graph.addNode(newPaper)
    # NOTE(review): no edge object is passed to these two calls -- presumably the
    # graph supplies a default; confirm this is intended
    graph.addBothEdges(newPaper, conference)
    graph.addEdge(newPaper, citedPaper)

    # Tallies are stored as (citations, publications) tuples keyed by author
    citations, publications = actualCitationsPublications[citedAuthor]
    actualCitationsPublications[citedAuthor] = (citations + 1, publications)
def addPublicationPaper(author):
    """
      Helper method to add a 'publication' paper, connected to both the given
      author and the conference. Also increments the publication tally recorded
      for the author, and returns the new paper.
    """
    paperId = SampleGraphUtility.__getNextId()
    newPaper = Paper(paperId, "%s's Paper %d" % (author.name, paperId))

    graph.addNode(newPaper)
    # NOTE(review): no edge object is passed to these two calls -- presumably the
    # graph supplies a default; confirm this is intended
    graph.addBothEdges(author, newPaper)
    graph.addBothEdges(newPaper, conference)

    # Tallies are stored as (citations, publications) tuples keyed by author
    citations, publications = actualCitationsPublications[author]
    actualCitationsPublications[author] = (citations, publications + 1)

    return newPaper
def __addSimilarAuthorsPapers(graph, author, firstConference, secondConference, authorConferencePaperMap):
    """
      Helper function to construct the papers & edges associated with the very
      similar authors in example 3 (i.e. Mike, Mary, and Bob).

      Always creates three papers for the author: 'Paper 1' and 'Paper 2' in the
      first conference and 'Paper 3' in the second conference. Differences
      between authors come only from which conferences the caller passes in.

      :param graph: Graph that receives the new paper nodes and edges
      :param author: Author node the three papers are attributed to
      :param firstConference: Conference the first two papers are published in
      :param secondConference: Conference the third paper is published in
      :param authorConferencePaperMap: author -> conference -> list of papers index,
        updated in place with the new papers
    """

    # Two papers in the first conference
    paper1 = Paper(SampleGraphUtility.__getNextId(), 'Paper 1')
    paper2 = Paper(SampleGraphUtility.__getNextId(), 'Paper 2')
    graph.addNode(paper1)
    graph.addNode(paper2)
    graph.addBothEdges(author, paper1, Authorship())
    graph.addBothEdges(author, paper2, Authorship())
    graph.addBothEdges(paper1, firstConference, Publication())
    graph.addBothEdges(paper2, firstConference, Publication())
    authorConferencePaperMap[author][firstConference].append(paper1)
    authorConferencePaperMap[author][firstConference].append(paper2)

    # One paper in the second conference (created unconditionally)
    paper3 = Paper(SampleGraphUtility.__getNextId(), 'Paper 3')
    graph.addNode(paper3)
    graph.addBothEdges(author, paper3, Authorship())
    graph.addBothEdges(paper3, secondConference, Publication())
    authorConferencePaperMap[author][secondConference].append(paper3)
def testCreateDBLPNode(self):
    """
      The factory should turn a dictionary carrying a 'type' key into the
      matching node object, for papers, authors and conferences.
    """

    # (input dictionary, node the factory is expected to produce)
    cases = [
        (
            {'type': 'Paper', 'id': 68, 'title': 'VLDB Paper 57'},
            Paper(id=68, title='VLDB Paper 57')
        ),
        (
            {'type': 'Author', 'id': 0, 'name': 'Mike'},
            Author(id=0, name='Mike')
        ),
        (
            {'type': 'Conference', 'id': 6, 'name': 'VLDB'},
            Conference(id=6, name='VLDB')
        )
    ]

    for nodeDict, expectedNode in cases:
        actualNode = GraphObjectFactory.createDBLPNode(nodeDict)
        self.assertEqual(actualNode, expectedNode)
def buildGraph(self, parsedData):
    """
      Form the DBLP graph structure from the parsed data

      :param parsedData: Mapping of paper id to a record providing the keys
        'title', 'conference', 'authors' and 'references'
      :return: Graph containing paper, author, conference and topic nodes, with
        authorship / publication / mention edges in both directions and one-way
        citation edges from each paper to the papers it references
      :raises KeyError: If a record references a paper id that is not itself a
        key of the parsed data
    """

    graph = GraphFactory.createInstance()

    # Indexes of the nodes created so far
    authorsByName = {}
    papersById = {}
    topicsByKeyword = {}
    conferencesByName = {}
    referencesByPaperId = {}  # Map of paper id to referenced paper ids

    # Construct everything except reference edges
    for paperId in parsedData:
        record = parsedData[paperId]
        paper = Paper(paperId, record['title'])
        referencesByPaperId[paperId] = record['references']

        # Create or get conference for this paper (new ids follow creation order)
        conferenceName = record['conference']
        conference = conferencesByName.get(conferenceName)
        if conference is None:
            conference = Conference(len(conferencesByName), conferenceName)
            conferencesByName[conferenceName] = conference
            graph.addNode(conference)

        # Create or get authors for this paper
        paperAuthors = []
        for authorName in record['authors']:
            author = authorsByName.get(authorName)
            if author is None:
                author = Author(len(authorsByName), authorName)
                authorsByName[authorName] = author
                graph.addNode(author)
            paperAuthors.append(author)

        # Extract keywords from title, and use as topics
        for keyword in self.__extractKeywords(record['title']):
            topic = topicsByKeyword.get(keyword)
            if topic is None:
                topic = Topic(len(topicsByKeyword), [keyword])
                topicsByKeyword[keyword] = topic
                graph.addNode(topic)
            # Note that these edges are added before the paper node itself is
            graph.addEdge(topic, paper, Mention())
            graph.addEdge(paper, topic, Mention())

        # Add new paper to the graph
        papersById[paperId] = paper
        graph.addNode(paper)

        # Add corresponding edges in the graph
        for author in paperAuthors:
            graph.addEdge(paper, author, Authorship())
            graph.addEdge(author, paper, Authorship())
        graph.addEdge(paper, conference, Publication())
        graph.addEdge(conference, paper, Publication())

    # Add citations to the graph, now that every paper node exists
    for paperId in referencesByPaperId:
        citingPaper = papersById[paperId]
        for citedPaperId in referencesByPaperId[paperId]:
            graph.addEdge(citingPaper, papersById[citedPaperId], Citation())

    return graph
def paperLineParser(line):
    """
      Parse one whitespace-separated line into a (paper id, paper) tuple: the
      first token is the integer id (control characters are removed from it
      first), and the remaining tokens joined by single spaces form the title.
    """
    tokens = line.split()
    paperId = int(self.__removeControlCharacters(tokens[0]))
    return paperId, Paper(paperId, ' '.join(tokens[1:]))
def constructMultiDisciplinaryAuthorExample(indirectAuthor=False, uneven=False):
    """
      Construct example DBLP graph with two research areas -- data mining (KDD)
      and databases (VLDB) -- in which only the authors listed in both areas
      (D, E and I, plus J when requested) are multi disciplinary.

      Every author gets one paper per conference of each of their areas, and a
      fabricated number of citing papers from the other eligible authors of
      that area.

      :param indirectAuthor: If true, add a tenth author 'J' who publishes in both
        areas and is cited only by 'E' and 'I'
      :param uneven: If true, additionally add one more paper per author per
        conference, each cited a random number (0 to 3) of times by every other
        author of the same area
      :return: Tuple of (graph, author name -> author map, conference name ->
        conference map, author name -> total citation count map)
    """

    graph = GraphFactory.createInstance()
    authorMap = {}
    conferenceMap = {}

    # Add authors (ids are drawn in label order)
    authorLabels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
    if indirectAuthor:
        authorLabels.append('J')
    authors = [Author(SampleGraphUtility.__getNextId(), label) for label in authorLabels]
    graph.addNodes(authors)

    # Add conferences
    vldb = Conference(SampleGraphUtility.__getNextId(), 'VLDB')  # Databases
    kdd = Conference(SampleGraphUtility.__getNextId(), 'KDD')  # Data mining
    conferences = [vldb, kdd]
    graph.addNodes(conferences)

    # Add author / conference index
    for author in authors:
        authorMap[author.name] = author
    for conference in conferences:
        conferenceMap[conference.name] = conference

    # Helper dictionary of citation counts to fabricate for each author, per
    # research area the author belongs to
    citationCounts = {'A': 100, 'B': 80, 'C': 10, 'D': 60, 'E': 45, 'F': 100, 'G': 80, 'H': 10, 'I': 12, 'J': 60}

    # Authors & conferences of each research area
    dmAuthorNames = ['D', 'E', 'F', 'G', 'H', 'I']
    dbAuthorNames = ['A', 'B', 'C', 'D', 'E', 'I']
    if indirectAuthor:
        dmAuthorNames += ['J']
        dbAuthorNames += ['J']
    duplicateNames = set(dmAuthorNames).intersection(set(dbAuthorNames))
    dmConferenceNames = ['KDD']
    dbConferenceNames = ['VLDB']
    researchAreas = [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]

    def addCitedPaper(authorName, conferenceName):
        # Add a paper by the named author, published in the named conference
        citedPaper = Paper(SampleGraphUtility.__getNextId(), '%sPaperIn%s' % (authorName, conferenceName))
        graph.addNode(citedPaper)
        graph.addBothEdges(citedPaper, conferenceMap[conferenceName], Publication())
        graph.addBothEdges(citedPaper, authorMap[authorName], Authorship())
        return citedPaper

    def addCitingPaper(index, citingAuthorName, conferenceName, citedPaper):
        # Add a fake paper by the citing author, in the named conference, citing the given paper
        citingPaper = Paper(
            SampleGraphUtility.__getNextId(),
            'Citation%d%sPaperIn%s' % (index, citingAuthorName, conferenceName)
        )
        graph.addNode(citingPaper)
        graph.addBothEdges(authorMap[citingAuthorName], citingPaper, Authorship())
        graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())
        graph.addEdge(citingPaper, citedPaper, Citation())

    # Start every author at zero. This is an explicit loop rather than map()
    # for side effects, which silently does nothing where map() is lazy.
    totalCitationCount = defaultdict(int)
    for authorName in set(dmAuthorNames).union(set(dbAuthorNames)):
        totalCitationCount[authorName] = 0

    # Create equal number of citations from each other paper in the research area for each author's papers
    for authorNames, conferenceNames in researchAreas:
        for authorName in authorNames:

            # Add paper to be cited for author in each conference of the area
            citedPaperMap = {}
            for conferenceName in conferenceNames:
                citedPaperMap[conferenceName] = addCitedPaper(authorName, conferenceName)

            # Figure out the number of incoming citation for this author from each other eligible authors:
            # multi disciplinary authors are cited only by the single-area authors of the area
            if authorName in duplicateNames:
                citingAuthors = set(authorNames).difference(duplicateNames)
            else:
                citingAuthors = set(authorNames)
                citingAuthors.remove(authorName)
            # Floor division: the result is used as a loop bound and must stay an integer
            citationsPerAuthor = citationCounts[authorName] // len(citingAuthors)

            # Make sure J is cited by the two non-D multi-disciplinary authors
            if authorName == 'J':
                citationsPerAuthor = citationCounts[authorName] // 2
                citingAuthors = ['E', 'I']

            # Loop through papers of all other authors
            for otherAuthorName in citingAuthors:
                if authorName == otherAuthorName:
                    continue
                for conferenceName in conferenceNames:
                    for citationIndex in range(0, citationsPerAuthor):
                        addCitingPaper(citationIndex, otherAuthorName, conferenceName, citedPaperMap[conferenceName])
                        totalCitationCount[authorName] += 1

    if not uneven:
        return graph, authorMap, conferenceMap, totalCitationCount

    # If this flag is set, add one more paper per author per conference in each
    # area, and a random number of citations to it from every other author
    for authorNamesList, conferenceNamesList in researchAreas:

        # Add publications
        extraPapers = []
        for authorName in authorNamesList:
            for conferenceName in conferenceNamesList:
                extraPapers.append((authorName, addCitedPaper(authorName, conferenceName)))

        random.seed()

        # Add randomized citations from authors to these papers
        for citingAuthorName in authorNamesList:
            for conferenceName in conferenceNamesList:
                for citedAuthorName, citedPaper in extraPapers:

                    # Skip papers authored by this author
                    if citedAuthorName == citingAuthorName:
                        continue

                    # Randomly add a number of citations to this paper
                    for citationIndex in range(0, random.randint(0, 3)):
                        addCitingPaper(citationIndex, citingAuthorName, conferenceName, citedPaper)
                        totalCitationCount[citedAuthorName] += 1

    return graph, authorMap, conferenceMap, totalCitationCount
def constructPathSimExampleThree(extraAuthorsAndCitations=False, citationMap=None):
    """
      Constructs "Example 3" from PathSim publication, ignoring topic nodes

      @see http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.220.2455

      :param extraAuthorsAndCitations: If true, also add the authors Joe and Nancy
        (with the same paper layout as Mike and Bob) and construct citations
      :param citationMap: Passed through to the citation construction step; only
        used when extraAuthorsAndCitations is true
      :return: Tuple of (graph, author name -> author map, conference name ->
        conference map)
    """

    graph = GraphFactory.createInstance()
    authorMap = {}
    conferenceMap = {}

    # Add authors
    mike = Author(SampleGraphUtility.__getNextId(), 'Mike')
    jim = Author(SampleGraphUtility.__getNextId(), 'Jim')
    mary = Author(SampleGraphUtility.__getNextId(), 'Mary')
    bob = Author(SampleGraphUtility.__getNextId(), 'Bob')
    ann = Author(SampleGraphUtility.__getNextId(), 'Ann')
    authors = [mike, jim, mary, bob, ann]
    if extraAuthorsAndCitations:
        joe = Author(SampleGraphUtility.__getNextId(), 'Joe')
        nancy = Author(SampleGraphUtility.__getNextId(), 'Nancy')
        authors += [joe, nancy]
    else:
        joe, nancy = None, None
    graph.addNodes(authors)

    # Add conferences
    sigmod = Conference(SampleGraphUtility.__getNextId(), 'SIGMOD')
    vldb = Conference(SampleGraphUtility.__getNextId(), 'VLDB')
    icde = Conference(SampleGraphUtility.__getNextId(), 'ICDE')
    kdd = Conference(SampleGraphUtility.__getNextId(), 'KDD')
    conferences = [sigmod, vldb, icde, kdd]
    graph.addNodes(conferences)

    # Add author / conference index
    for author in authors:
        authorMap[author.name] = author
    for conference in conferences:
        conferenceMap[conference.name] = conference

    # Add author / conference / papers index
    authorConferencePaperMap = defaultdict(lambda: defaultdict(list))

    # Add jim's papers: the first 50 in SIGMOD, the remaining 20 in VLDB
    for i in range(0, 70):
        conference = sigmod if i < 50 else vldb
        paper = Paper(SampleGraphUtility.__getNextId(), '%s Paper %d' % (conference.name, i + 1))
        graph.addNode(paper)
        graph.addBothEdges(jim, paper, Authorship())
        graph.addBothEdges(paper, conference, Publication())
        authorConferencePaperMap[jim][conference].append(paper)

    # Add ann's papers. They are registered as nodes explicitly, like every
    # other paper here, rather than only appearing as edge endpoints.
    annsPaper1 = Paper(SampleGraphUtility.__getNextId(), 'ICDE Paper')
    annsPaper2 = Paper(SampleGraphUtility.__getNextId(), 'KDD Paper')
    graph.addNode(annsPaper1)
    graph.addNode(annsPaper2)
    graph.addBothEdges(ann, annsPaper1, Authorship())
    graph.addBothEdges(ann, annsPaper2, Authorship())
    graph.addBothEdges(annsPaper1, icde, Publication())
    graph.addBothEdges(annsPaper2, kdd, Publication())
    authorConferencePaperMap[ann][icde].append(annsPaper1)
    authorConferencePaperMap[ann][kdd].append(annsPaper2)

    # Auto-add remaining authors (2,1) paper numbers
    SampleGraphUtility.__addSimilarAuthorsPapers(graph, mike, sigmod, vldb, authorConferencePaperMap)
    SampleGraphUtility.__addSimilarAuthorsPapers(graph, mary, sigmod, icde, authorConferencePaperMap)
    SampleGraphUtility.__addSimilarAuthorsPapers(graph, bob, sigmod, vldb, authorConferencePaperMap)

    # Add extra authors & citation data
    if extraAuthorsAndCitations:
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, joe, sigmod, vldb, authorConferencePaperMap)
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, nancy, sigmod, vldb, authorConferencePaperMap)
        SampleGraphUtility.__constructCitations(
            graph, authorMap, conferenceMap, authorConferencePaperMap, citationMap
        )

    return graph, authorMap, conferenceMap