def testSeparatePapersAuthorsTopicSharedConferenceGraph(self):
    # Scenario: two papers with distinct authors and distinct title
    # keywords, both published at the same conference.
    parsedData = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995,
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference One',
            'references': [],
            'title': 'All The Knowledge',
            'year': 1999,
        },
    }

    expectedGraph = GraphFactory.createInstance()

    # Ids that the input does not specify are expected to auto-increment
    author1 = Author(0, 'Author One')
    author2 = Author(1, 'Author Two')
    paper1 = Paper(0, 'Databases')
    paper2 = Paper(1, 'All The Knowledge')
    topic1 = Topic(0, ['databas'])
    topic2 = Topic(1, ['knowledg'])
    conference = Conference(0, 'Conference One')

    expectedNodes = (
        author1, author2,
        paper1, paper2,
        topic1, topic2,
        conference,
    )
    for node in expectedNodes:
        expectedGraph.addNode(node)

    # (endpoint, endpoint, edge class); a fresh edge object is built per pair
    expectedEdges = (
        (author1, paper1, Authorship),
        (author2, paper2, Authorship),
        (paper1, topic1, Mention),
        (paper2, topic2, Mention),
        (paper1, conference, Publication),
        (paper2, conference, Publication),
    )
    for first, second, edgeClass in expectedEdges:
        expectedGraph.addBothEdges(first, second, edgeClass())

    actualGraph = self.dataImporter.buildGraph(parsedData)

    self.assertGraphsEqual(actualGraph, expectedGraph)
def testCoAuthorsGraph(self):
    """
    Simple two-paper scenario with three authors, two of whom co-author
    the first paper. Both titles are identical, so one topic is shared.
    """

    parsedData = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One', 'Author Three'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995,
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [],
            'title': 'Databases',
            'year': 1999,
        },
    }

    expectedGraph = GraphFactory.createInstance()

    # Ids that the input does not specify are expected to auto-increment;
    # 'Author Three' is listed on the first paper, hence it gets id 1.
    author1 = Author(0, 'Author One')
    author2 = Author(2, 'Author Two')
    author3 = Author(1, 'Author Three')
    paper1 = Paper(0, 'Databases')
    paper2 = Paper(1, 'Databases')
    topic = Topic(0, ['databas'])
    conference1 = Conference(0, 'Conference One')
    conference2 = Conference(1, 'Conference Two')

    expectedNodes = (
        author1, author2, author3,
        paper1, paper2,
        topic,
        conference1, conference2,
    )
    for node in expectedNodes:
        expectedGraph.addNode(node)

    # (endpoint, endpoint, edge class); a fresh edge object is built per pair
    expectedEdges = (
        (author1, paper1, Authorship),
        (author3, paper1, Authorship),
        (author2, paper2, Authorship),
        (paper1, topic, Mention),
        (paper2, topic, Mention),
        (paper1, conference1, Publication),
        (paper2, conference2, Publication),
    )
    for first, second, edgeClass in expectedEdges:
        expectedGraph.addBothEdges(first, second, edgeClass())

    actualGraph = self.dataImporter.buildGraph(parsedData)

    self.assertGraphsEqual(actualGraph, expectedGraph)
def testMutualCitationGraph(self):
    # Scenario: two papers, each listing the other in its references.
    parsedData = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [1],
            'title': 'Databases',
            'year': 1999,
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [0],
            'title': 'Databases',
            'year': 1999,
        },
    }

    expectedGraph = GraphFactory.createInstance()

    # Ids that the input does not specify are expected to auto-increment
    author1 = Author(0, 'Author One')
    author2 = Author(1, 'Author Two')
    paper1 = Paper(0, 'Databases')
    paper2 = Paper(1, 'Databases')
    topic = Topic(0, ['databas'])
    conference1 = Conference(0, 'Conference One')
    conference2 = Conference(1, 'Conference Two')

    expectedNodes = (
        author1, author2,
        paper1, paper2,
        topic,
        conference1, conference2,
    )
    for node in expectedNodes:
        expectedGraph.addNode(node)

    # (endpoint, endpoint, edge class); a fresh edge object is built per pair
    expectedEdges = (
        (author1, paper1, Authorship),
        (author2, paper2, Authorship),
        (paper1, topic, Mention),
        (paper2, topic, Mention),
        (paper1, conference1, Publication),
        (paper2, conference2, Publication),
        # Citations are symmetric in this case only, because the two
        # papers reference each other.
        (paper1, paper2, Citation),
    )
    for first, second, edgeClass in expectedEdges:
        expectedGraph.addBothEdges(first, second, edgeClass())

    actualGraph = self.dataImporter.buildGraph(parsedData)

    self.assertGraphsEqual(actualGraph, expectedGraph)
def termLineParser(line):
    """
    Parse one '<topic id> <term>' line into a (topic id, topic) pair.

    The term is stemmed before lookup. A stem already present in
    stemmedTermMap reuses its existing topic; a stem found in the stop
    words yields None as the topic; any other stem gets a new Topic that
    is recorded in stemmedTermMap.
    """

    rawId, rawTerm = line.split()
    topicId = int(self.__removeControlCharacters(rawId))
    stem = self.stemmer.stemWord(rawTerm)

    # Already-seen stem: hand back the topic created for it earlier
    if stem in stemmedTermMap:
        return topicId, stemmedTermMap[stem]

    # Stop words never get a topic, and are not remembered
    if stem in self.stopWords:
        return topicId, None

    newTopic = Topic(topicId, [stem])
    stemmedTermMap[stem] = newTopic
    return topicId, newTopic
def buildGraph(self, parsedData):
    """
    Assemble the DBLP graph from parsed paper records.

    parsedData maps a paper id to a record from which the 'title',
    'references', 'conference' and 'authors' entries are read. Conferences,
    authors and topics (keywords extracted from the title) are created on
    first sight, with ids assigned in order of appearance. Citation edges
    are added in a second pass, directed from the citing paper to the
    cited one.

    Returns the populated graph.
    """

    graph = GraphFactory.createInstance()

    # Lookup tables for nodes created so far
    authorsByName = {}
    papersById = {}
    topicsByKeyword = {}
    conferencesByName = {}
    referencesByPaperId = {}  # paper id -> ids of the papers it cites

    # Pass 1: every node, and every edge except citations
    for paperId, record in parsedData.items():
        paper = Paper(paperId, record['title'])
        referencesByPaperId[paperId] = record['references']

        # Conference: reuse the node if this name was seen before
        venueName = record['conference']
        if venueName in conferencesByName:
            venue = conferencesByName[venueName]
        else:
            venue = Conference(len(conferencesByName), venueName)
            conferencesByName[venueName] = venue
            graph.addNode(venue)

        # Authors: reuse nodes by name, collecting this paper's authors
        writers = []
        for name in record['authors']:
            if name in authorsByName:
                writer = authorsByName[name]
            else:
                writer = Author(len(authorsByName), name)
                authorsByName[name] = writer
                graph.addNode(writer)
            writers.append(writer)

        # Topics: one per title keyword, linked to the paper both ways
        for word in self.__extractKeywords(record['title']):
            if word in topicsByKeyword:
                subject = topicsByKeyword[word]
            else:
                subject = Topic(len(topicsByKeyword), [word])
                topicsByKeyword[word] = subject
                graph.addNode(subject)
            graph.addEdge(subject, paper, Mention())
            graph.addEdge(paper, subject, Mention())

        # Register the paper itself
        papersById[paperId] = paper
        graph.addNode(paper)

        # Authorship and publication edges, in both directions
        for writer in writers:
            graph.addEdge(paper, writer, Authorship())
            graph.addEdge(writer, paper, Authorship())
        graph.addEdge(paper, venue, Publication())
        graph.addEdge(venue, paper, Publication())

    # Pass 2: citations, once every paper node exists
    for citingId, citedIds in referencesByPaperId.items():
        citingPaper = papersById[citingId]
        for citedId in citedIds:
            graph.addEdge(citingPaper, papersById[citedId], Citation())

    return graph