def setUp(self):
    """
    Build the small template graph shared by the tests (two authors, two conferences,
    three papers) and keep a reference to every node on the test object
    """
    self.maxDiff = None

    templateGraph = GraphFactory.createInstance()

    # Nodes are stored on the test object so that individual tests can refer to them
    self.author = Author(0, 'author')
    self.coauthor = Author(1, 'coauthor')
    self.conference1 = Conference(0, 'conference1')
    self.conference2 = Conference(1, 'conference2')
    self.paper1 = Paper(0, 'paper1')
    self.paper2 = Paper(1, 'paper2')
    self.paper3 = Paper(2, 'paper3')

    # NOTE(review): self.coauthor is not part of this list; it only reaches the graph
    # through the authorship edge added below -- confirm that this is intended
    templateGraph.addNodes([
        self.author,
        self.conference1,
        self.conference2,
        self.paper1,
        self.paper2,
        self.paper3,
    ])

    # The main author wrote every paper, the co-author only the third one
    for paper in (self.paper1, self.paper2, self.paper3):
        templateGraph.addBothEdges(paper, self.author, Authorship())
    templateGraph.addBothEdges(self.paper3, self.coauthor, Authorship())

    # Papers one and two share a conference, paper three is published elsewhere
    publications = (
        (self.paper1, self.conference1),
        (self.paper2, self.conference1),
        (self.paper3, self.conference2),
    )
    for paper, conference in publications:
        templateGraph.addBothEdges(paper, conference, Publication())

    # Single edge from paper one to paper two; papers two and three get edges both ways
    templateGraph.addEdge(self.paper1, self.paper2, Citation())
    templateGraph.addBothEdges(self.paper2, self.paper3, Citation())

    self.templateGraph = templateGraph
    self.metaPathUtility = self._getImplementation()
def setUp(self):
    """
    Build the small template graph shared by the tests (two authors, two conferences,
    three papers) and keep a reference to every node on the test object
    """
    self.maxDiff = None

    templateGraph = GraphFactory.createInstance()

    # Nodes are stored on the test object so that individual tests can refer to them
    self.author = Author(0, "author")
    self.coauthor = Author(1, "coauthor")
    self.conference1 = Conference(0, "conference1")
    self.conference2 = Conference(1, "conference2")
    self.paper1 = Paper(0, "paper1")
    self.paper2 = Paper(1, "paper2")
    self.paper3 = Paper(2, "paper3")

    # NOTE(review): self.coauthor is not part of this list; it only reaches the graph
    # through the authorship edge added below -- confirm that this is intended
    templateGraph.addNodes(
        [self.author, self.conference1, self.conference2, self.paper1, self.paper2, self.paper3]
    )

    # The main author wrote every paper, the co-author only the third one
    for paper in (self.paper1, self.paper2, self.paper3):
        templateGraph.addBothEdges(paper, self.author, Authorship())
    templateGraph.addBothEdges(self.paper3, self.coauthor, Authorship())

    # Papers one and two share a conference, paper three is published elsewhere
    publications = (
        (self.paper1, self.conference1),
        (self.paper2, self.conference1),
        (self.paper3, self.conference2),
    )
    for paper, conference in publications:
        templateGraph.addBothEdges(paper, conference, Publication())

    # Single edge from paper one to paper two; papers two and three get edges both ways
    templateGraph.addEdge(self.paper1, self.paper2, Citation())
    templateGraph.addBothEdges(self.paper2, self.paper3, Citation())

    self.templateGraph = templateGraph
    self.metaPathUtility = self._getImplementation()
def testCoAuthorsGraph(self):
    """
    Two papers with the same title at different conferences, written by three authors
    in total: the first paper has two co-authors, the second a single other author.
    """

    # Input records, keyed by paper id
    inputRecords = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One', 'Author Three'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [],
            'title': 'Databases',
            'year': 1999
        }
    }

    expected = GraphFactory.createInstance()

    # Expect unspecified ids to auto-increment; 'Author Three' is listed on the first
    # paper, so it is expected to receive a lower id than 'Author Two'
    authorOne = Author(0, 'Author One')
    authorTwo = Author(2, 'Author Two')
    authorThree = Author(1, 'Author Three')
    firstPaper = Paper(0, 'Databases')
    secondPaper = Paper(1, 'Databases')
    sharedTopic = Topic(0, ['databas'])
    firstConference = Conference(0, 'Conference One')
    secondConference = Conference(1, 'Conference Two')

    expectedNodes = (
        authorOne, authorTwo, authorThree, firstPaper, secondPaper, sharedTopic, firstConference, secondConference
    )
    for node in expectedNodes:
        expected.addNode(node)

    for author, paper in ((authorOne, firstPaper), (authorThree, firstPaper), (authorTwo, secondPaper)):
        expected.addBothEdges(author, paper, Authorship())
    for paper in (firstPaper, secondPaper):
        expected.addBothEdges(paper, sharedTopic, Mention())
    for paper, conference in ((firstPaper, firstConference), (secondPaper, secondConference)):
        expected.addBothEdges(paper, conference, Publication())

    actual = self.dataImporter.buildGraph(inputRecords)

    self.assertGraphsEqual(actual, expected)
def testMutualCitationGraph(self):
    """
    Two papers by different authors at different conferences, each listing the other
    in its references.
    """

    # Input records, keyed by paper id
    inputRecords = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [1],
            'title': 'Databases',
            'year': 1999
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [0],
            'title': 'Databases',
            'year': 1999
        }
    }

    expected = GraphFactory.createInstance()

    # Expect unspecified ids to auto-increment
    authorOne = Author(0, 'Author One')
    authorTwo = Author(1, 'Author Two')
    firstPaper = Paper(0, 'Databases')
    secondPaper = Paper(1, 'Databases')
    sharedTopic = Topic(0, ['databas'])
    firstConference = Conference(0, 'Conference One')
    secondConference = Conference(1, 'Conference Two')

    expectedNodes = (
        authorOne, authorTwo, firstPaper, secondPaper, sharedTopic, firstConference, secondConference
    )
    for node in expectedNodes:
        expected.addNode(node)

    for author, paper in ((authorOne, firstPaper), (authorTwo, secondPaper)):
        expected.addBothEdges(author, paper, Authorship())
    for paper in (firstPaper, secondPaper):
        expected.addBothEdges(paper, sharedTopic, Mention())
    for paper, conference in ((firstPaper, firstConference), (secondPaper, secondConference)):
        expected.addBothEdges(paper, conference, Publication())

    # Citation edges are symmetric in this case only, because the papers cite each other
    expected.addBothEdges(firstPaper, secondPaper, Citation())

    actual = self.dataImporter.buildGraph(inputRecords)

    self.assertGraphsEqual(actual, expected)
def parseNodeContent(self, nodeIndex):
    """
      Parse the node content from the input files

      Authors, conferences and papers are read from "author.txt", "conf.txt" and
      "paper.txt" (one "<id> <name words...>" entry per line); topics are read from
      "term.txt" (one "<id> <term>" entry per line).

      @param  nodeIndex   Index handed to every node type parse, and handed back to the caller
      @return Tuple of (graph, nodeIndex)
    """

    graph = GraphFactory.createInstance()

    def buildIdNameParser(nodeType):
        """
          Build a line parser that turns "<id> <name words...>" into (id, nodeType(id, name))
        """
        def parseLine(line):
            fields = line.split()
            nodeId = int(self.__removeControlCharacters(fields[0]))
            return nodeId, nodeType(nodeId, " ".join(fields[1:]))
        return parseLine

    # Authors, conferences and papers all share the same line layout
    namedNodeSources = (
        (Author, "author", "author.txt"),
        (Conference, "conference", "conf.txt"),
        (Paper, "paper", "paper.txt"),
    )
    for nodeType, typeName, fileName in namedNodeSources:
        self.__parseNodeType(buildIdNameParser(nodeType), typeName, fileName, graph, nodeIndex)

    # Terms that stem to the same word share one topic object
    topicsByStem = {}

    def termLineParser(line):
        rawTopicId, rawTerm = line.split()
        topicId = int(self.__removeControlCharacters(rawTopicId))
        stem = self.stemmer.stemWord(rawTerm)

        # Stemmer collision: hand back the topic created for the earlier term
        if stem in topicsByStem:
            return topicId, topicsByStem[stem]

        # Stop words produce no topic at all
        if stem in self.stopWords:
            return topicId, None

        topic = Topic(topicId, [stem])
        topicsByStem[stem] = topic
        return topicId, topic

    self.__parseNodeType(termLineParser, "topic", "term.txt", graph, nodeIndex)

    return graph, nodeIndex
def buildGraph(self, coMoToData):
    """
      Build a graph from the given CoMoTo data

      @param  coMoToData  Data handed to both construction steps below
      @return The newly created graph
    """

    coMoToGraph = GraphFactory.createInstance()

    # Semesters & assignments go in first; the two maps returned are needed by the next step
    semesterAndAssignmentMaps = self.__addSemestersAndAssignmentsToGraph(coMoToData, coMoToGraph)
    assignmentsByAnalysisId, semestersByOfferingId = semesterAndAssignmentMaps

    # Add submissions & students to graph, connect submissions with students and assignment, and
    # students with assignments
    self.addStudentsAndSemestersToGraph(assignmentsByAnalysisId, coMoToData, coMoToGraph, semestersByOfferingId)

    return coMoToGraph
def parseNodeContent(self, nodeIndex):
    """
      Parse the node content from the input files

      Authors, conferences and papers are read from 'author.txt', 'conf.txt' and
      'paper.txt' (one '<id> <name words...>' entry per line); topics are read from
      'term.txt' (one '<id> <term>' entry per line).

      @param  nodeIndex   Index handed to every node type parse, and handed back to the caller
      @return Tuple of (graph, nodeIndex)
    """

    graph = GraphFactory.createInstance()

    def buildIdNameParser(nodeType):
        """
          Build a line parser that turns '<id> <name words...>' into (id, nodeType(id, name))
        """
        def parseLine(line):
            fields = line.split()
            nodeId = int(self.__removeControlCharacters(fields[0]))
            return nodeId, nodeType(nodeId, ' '.join(fields[1:]))
        return parseLine

    # Authors, conferences and papers all share the same line layout
    namedNodeSources = (
        (Author, 'author', 'author.txt'),
        (Conference, 'conference', 'conf.txt'),
        (Paper, 'paper', 'paper.txt'),
    )
    for nodeType, typeName, fileName in namedNodeSources:
        self.__parseNodeType(buildIdNameParser(nodeType), typeName, fileName, graph, nodeIndex)

    # Terms that stem to the same word share one topic object
    topicsByStem = {}

    def termLineParser(line):
        rawTopicId, rawTerm = line.split()
        topicId = int(self.__removeControlCharacters(rawTopicId))
        stem = self.stemmer.stemWord(rawTerm)

        # Stemmer collision: hand back the topic created for the earlier term
        if stem in topicsByStem:
            return topicId, topicsByStem[stem]

        # Stop words produce no topic at all
        if stem in self.stopWords:
            return topicId, None

        topic = Topic(topicId, [stem])
        topicsByStem[stem] = topic
        return topicId, topic

    self.__parseNodeType(termLineParser, 'topic', 'term.txt', graph, nodeIndex)

    return graph, nodeIndex
def testSeparatePapersAuthorsTopicSharedConferenceGraph(self):
    """
    Two papers with different authors and different titles (hence different topics)
    that were published at the same conference.
    """

    # Input records, keyed by paper id
    inputRecords = {
        0: {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        },
        1: {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference One',
            'references': [],
            'title': 'All The Knowledge',
            'year': 1999
        }
    }

    expected = GraphFactory.createInstance()

    # Expect unspecified ids to auto-increment
    authorOne = Author(0, 'Author One')
    authorTwo = Author(1, 'Author Two')
    firstPaper = Paper(0, 'Databases')
    secondPaper = Paper(1, 'All The Knowledge')
    firstTopic = Topic(0, ['databas'])
    secondTopic = Topic(1, ['knowledg'])
    sharedConference = Conference(0, 'Conference One')

    expectedNodes = (
        authorOne, authorTwo, firstPaper, secondPaper, firstTopic, secondTopic, sharedConference
    )
    for node in expectedNodes:
        expected.addNode(node)

    for author, paper in ((authorOne, firstPaper), (authorTwo, secondPaper)):
        expected.addBothEdges(author, paper, Authorship())
    for paper, topic in ((firstPaper, firstTopic), (secondPaper, secondTopic)):
        expected.addBothEdges(paper, topic, Mention())
    for paper in (firstPaper, secondPaper):
        expected.addBothEdges(paper, sharedConference, Publication())

    actual = self.dataImporter.buildGraph(inputRecords)

    self.assertGraphsEqual(actual, expected)
def testSolutionMatchAnalysis(self):
    """
      Tests that the graph is built correctly given some simple test analysis. This test case
      considers the case of:
        * Single assignment, single analysis, single semester
        * Three submissions
        * One (same semester) solution match
    """

    # CoMoTo fixture data used as the importer input
    analysisData = self.solutionMatchAnalysis

    # Nodes expected in the resulting graph
    students = [
        Student(10001, 'Smith, John', 'johnsmith'),
        Student(10002, 'Doe, Jane', 'janedoe'),
        Student(10003, 'Smith, Joe', 'joesmith'),
    ]
    submissions = [Submission(5001), Submission(5002), Submission(5003)]
    solutionSubmission = Submission(5004, None, True)
    assignment = Assignment(1, 'MP1')
    semester = Semester(7, 'Spring', 2011)

    expected = GraphFactory.createInstance()
    for node in students + submissions + [solutionSubmission, assignment, semester]:
        expected.addNode(node)

    # Every submission (the solution included) belongs to the single assignment
    for submission in submissions + [solutionSubmission]:
        expected.addBothEdges(submission, assignment, AssignmentSubmission())

    # Each student authored exactly one submission, and is enrolled in the single semester
    for submission, student in zip(submissions, students):
        expected.addBothEdges(submission, student, Authorship())
    for student in students:
        expected.addBothEdges(student, semester, Enrollment())

    # The first submission matches the solution
    expected.addBothEdges(submissions[0], solutionSubmission, SolutionMatch(5000, 80))
    expected.addBothEdges(semester, assignment, SemesterAssignment())

    actual = self.dataImporter.buildGraph(analysisData)

    self.assertGraphsEqual(expected, actual)
def buildGraph(self, coMoToData):
    """
      Build a graph from the given CoMoTo data

      @param  coMoToData  Data handed to both construction steps below
      @return The newly created graph
    """

    coMoToGraph = GraphFactory.createInstance()

    # Semesters & assignments go in first; the two maps returned are needed by the next step
    semesterAndAssignmentMaps = self.__addSemestersAndAssignmentsToGraph(coMoToData, coMoToGraph)
    assignmentsByAnalysisId, semestersByOfferingId = semesterAndAssignmentMaps

    # Add submissions & students to graph, connect submissions with students and assignment, and
    # students with assignments
    self.addStudentsAndSemestersToGraph(assignmentsByAnalysisId, coMoToData, coMoToGraph, semestersByOfferingId)

    return coMoToGraph
def testInvalidSameSemesterMatchAnalysis(self):
    """
      Tests that the graph is built correctly given some simple test analysis. This test case
      considers the case of:
        * Single assignment, single analysis, single semester
        * Three submissions
        * One (same semester) submission pair match

      Except, also includes extraneous data that should be discarded
    """

    # CoMoTo fixture data used as the importer input
    analysisData = self.invalidSameSemesterMatchAnalysis

    # Nodes expected in the resulting graph
    students = [
        Student(10001, 'Smith, John', 'johnsmith'),
        Student(10002, 'Doe, Jane', 'janedoe'),
        Student(10003, 'Smith, Joe', 'joesmith'),
    ]
    submissions = [Submission(5001), Submission(5002), Submission(5003)]
    assignment = Assignment(1, 'MP1')
    semester = Semester(7, 'Spring', 2011)

    expected = GraphFactory.createInstance()
    for node in students + submissions + [assignment, semester]:
        expected.addNode(node)

    # Every submission belongs to the single assignment
    for submission in submissions:
        expected.addBothEdges(submission, assignment, AssignmentSubmission())

    # Each student authored exactly one submission, and is enrolled in the single semester
    for submission, student in zip(submissions, students):
        expected.addBothEdges(submission, student, Authorship())
    for student in students:
        expected.addBothEdges(student, semester, Enrollment())

    # The only match expected to survive is between the first and third submissions
    expected.addBothEdges(submissions[0], submissions[2], SameSemesterMatch(5000, 72.0))
    expected.addBothEdges(semester, assignment, SemesterAssignment())

    actual = self.dataImporter.buildGraph(analysisData)

    self.assertGraphsEqual(expected, actual)
def constructMultiDisciplinaryAuthorExample(indirectAuthor=False, uneven=False):
    """
      Construct example DBLP graph with two research areas (data mining at 'KDD', databases at
      'VLDB') in which only a few authors ('D', 'E' and 'I') publish in both areas

      @param  indirectAuthor  If set, an extra author 'J' is added to both areas, whose papers are
                              cited only by 'E' and 'I'
      @param  uneven          If set, every author gets one extra paper per area, which receives a
                              random number (0 to 3) of citations from each other author in that area
      @return Tuple of (graph, authorMap, conferenceMap, totalCitationCount), where the maps are
              indexed by name and totalCitationCount maps author name to citations fabricated
    """

    graph = GraphFactory.createInstance()

    # Add authors (ids are drawn in this order)
    authorNames = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
    if indirectAuthor:
        authorNames.append('J')
    authors = [Author(SampleGraphUtility.__getNextId(), name) for name in authorNames]
    graph.addNodes(authors)

    # Add conferences
    vldb = Conference(SampleGraphUtility.__getNextId(), 'VLDB')  # Databases
    kdd = Conference(SampleGraphUtility.__getNextId(), 'KDD')  # Data mining
    conferences = [vldb, kdd]
    graph.addNodes(conferences)

    # Add author / conference index
    authorMap = {author.name: author for author in authors}
    conferenceMap = {conference.name: conference for conference in conferences}

    # Helper dictionary of citation counts to fabricate for each author, per research area. Authors
    # in both areas receive their count once per area, so with both flags off the totals are:
    # {'A':100, 'B':80, 'C':10, 'D':120, 'E':90, 'F':100, 'G':80, 'H':10, 'I':24}
    citationCounts = {'A': 100, 'B': 80, 'C': 10, 'D': 60, 'E': 45, 'F': 100, 'G': 80, 'H': 10, 'I': 12, 'J': 60}

    # Author names & conference names of each research area
    dmAuthorNames = ['D', 'E', 'F', 'G', 'H', 'I']
    dbAuthorNames = ['A', 'B', 'C', 'D', 'E', 'I']
    if indirectAuthor:
        dmAuthorNames += ['J']
        dbAuthorNames += ['J']
    duplicateNames = set(dmAuthorNames).intersection(set(dbAuthorNames))
    dmConferenceNames = ['KDD']
    dbConferenceNames = ['VLDB']
    researchAreas = [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]

    # Make sure every author has an entry, even if no citations end up being created for them.
    # (An explicit loop is used because map() must not be relied upon for side effects.)
    totalCitationCount = defaultdict(int)
    for name in set(dmAuthorNames).union(set(dbAuthorNames)):
        totalCitationCount[name] = 0

    def addCitedPaper(authorName, conferenceName):
        """ Add a paper by the given author at the given conference, which citations will point to """
        citedPaper = Paper(SampleGraphUtility.__getNextId(), '%sPaperIn%s' % (authorName, conferenceName))
        graph.addNode(citedPaper)
        graph.addBothEdges(citedPaper, conferenceMap[conferenceName], Publication())
        graph.addBothEdges(citedPaper, authorMap[authorName], Authorship())
        return citedPaper

    def addCitingPaper(index, citingAuthorName, conferenceName, citedPaper):
        """ Add a fake paper by the citing author at the given conference, citing the given paper """
        citingPaper = Paper(
            SampleGraphUtility.__getNextId(), 'Citation%d%sPaperIn%s' % (index, citingAuthorName, conferenceName)
        )
        graph.addNode(citingPaper)
        graph.addBothEdges(authorMap[citingAuthorName], citingPaper, Authorship())
        graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())
        graph.addEdge(citingPaper, citedPaper, Citation())

    # Create equal number of citations from each other author in the research area for each author's papers
    for areaAuthorNames, areaConferenceNames in researchAreas:
        for authorName in areaAuthorNames:

            # Add one paper to be cited for this author in each conference of the area
            citedPaperMap = {}
            for conferenceName in areaConferenceNames:
                citedPaper = addCitedPaper(authorName, conferenceName)
                citedPaperMap[conferenceName] = citedPaper

            # Authors in both areas are only cited by the single-area authors of this area,
            # everyone else is cited by all other authors of the area
            if authorName in duplicateNames:
                citingAuthors = set(areaAuthorNames).difference(duplicateNames)
            else:
                citingAuthors = set(areaAuthorNames)
                citingAuthors.remove(authorName)

            # Floor division: the result is used as a loop count, so it has to stay an integer
            citationsPerAuthor = citationCounts[authorName] // len(citingAuthors)

            # Make sure J is cited by the two non-D multi-disciplinary authors
            if authorName == 'J':
                citationsPerAuthor = citationCounts[authorName] // 2
                citingAuthors = ['E', 'I']

            # Loop through papers of all other authors
            for otherAuthorName in citingAuthors:
                if authorName == otherAuthorName:
                    continue
                for conferenceName in areaConferenceNames:
                    for citationIndex in range(0, citationsPerAuthor):
                        addCitingPaper(citationIndex, otherAuthorName, conferenceName, citedPaperMap[conferenceName])
                        totalCitationCount[authorName] += 1

    if not uneven:
        return graph, authorMap, conferenceMap, totalCitationCount

    # If this flag is set, add one more paper per author in each area, with a random number of
    # citations from every other author in that area
    for areaAuthorNames, areaConferenceNames in researchAreas:

        # Add publications
        extraPapers = []
        for authorName in areaAuthorNames:
            for conferenceName in areaConferenceNames:
                extraPapers.append((authorName, addCitedPaper(authorName, conferenceName)))

        random.seed()

        # Add randomized citations from authors to these papers
        for citingAuthorName in areaAuthorNames:
            for conferenceName in areaConferenceNames:
                for citedAuthorName, citedPaper in extraPapers:

                    # Skip papers authored by this author
                    if citedAuthorName == citingAuthorName:
                        continue

                    # Randomly add a number of citations to this paper
                    for citationIndex in range(0, random.randint(0, 3)):
                        addCitingPaper(citationIndex, citingAuthorName, conferenceName, citedPaper)
                        totalCitationCount[citedAuthorName] += 1

    return graph, authorMap, conferenceMap, totalCitationCount
def constructPathSimExampleThree(extraAuthorsAndCitations=False, citationMap=None):
    """
      Constructs "Example 3" from PathSim publication, ignoring topic nodes

      @see http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.220.2455

      @param  extraAuthorsAndCitations  If set, authors "Joe" and "Nancy" are added as well, and
                                        citations are constructed from citationMap
      @param  citationMap               Handed to the citation construction step as is
      @return Tuple of (graph, authorMap, conferenceMap), where the maps are indexed by name
    """

    graph = GraphFactory.createInstance()

    # Add authors (ids are drawn in this order)
    coreAuthorNames = ("Mike", "Jim", "Mary", "Bob", "Ann")
    authors = [Author(SampleGraphUtility.__getNextId(), name) for name in coreAuthorNames]
    mike, jim, mary, bob, ann = authors
    joe = nancy = None
    if extraAuthorsAndCitations:
        joe = Author(SampleGraphUtility.__getNextId(), "Joe")
        nancy = Author(SampleGraphUtility.__getNextId(), "Nancy")
        authors += [joe, nancy]
    graph.addNodes(authors)

    # Add conferences
    conferenceNames = ("SIGMOD", "VLDB", "ICDE", "KDD")
    conferences = [Conference(SampleGraphUtility.__getNextId(), name) for name in conferenceNames]
    sigmod, vldb, icde, kdd = conferences
    graph.addNodes(conferences)

    # Add author / conference index
    authorMap = {author.name: author for author in authors}
    conferenceMap = {conference.name: conference for conference in conferences}

    # Add author / conference / papers index
    authorConferencePaperMap = defaultdict(lambda: defaultdict(list))

    # Add jim's papers: numbers 1 to 50 appear in SIGMOD, numbers 51 to 70 in VLDB
    for paperNumber in xrange(1, 71):
        venue = sigmod if paperNumber <= 50 else vldb
        jimsPaper = Paper(SampleGraphUtility.__getNextId(), "%s Paper %d" % (venue.name, paperNumber))
        graph.addNode(jimsPaper)
        graph.addBothEdges(jim, jimsPaper, Authorship())
        graph.addBothEdges(jimsPaper, venue, Publication())
        authorConferencePaperMap[jim][venue].append(jimsPaper)

    # Add ann's papers
    # NOTE(review): unlike jim's papers, these are never passed to graph.addNode, and only reach
    # the graph through their edges -- confirm that this is intended
    annsPublications = [
        (Paper(SampleGraphUtility.__getNextId(), "ICDE Paper"), icde),
        (Paper(SampleGraphUtility.__getNextId(), "KDD Paper"), kdd),
    ]
    for annsPaper, venue in annsPublications:
        graph.addBothEdges(ann, annsPaper, Authorship())
    for annsPaper, venue in annsPublications:
        graph.addBothEdges(annsPaper, venue, Publication())
    for annsPaper, venue in annsPublications:
        authorConferencePaperMap[ann][venue].append(annsPaper)

    # Auto-add remaining authors (2,1) paper numbers
    for author, firstVenue, secondVenue in ((mike, sigmod, vldb), (mary, sigmod, icde), (bob, sigmod, vldb)):
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, author, firstVenue, secondVenue, authorConferencePaperMap)

    # Add extra authors & citation data
    if extraAuthorsAndCitations:
        for author in (joe, nancy):
            SampleGraphUtility.__addSimilarAuthorsPapers(graph, author, sigmod, vldb, authorConferencePaperMap)
        SampleGraphUtility.__constructCitations(
            graph, authorMap, conferenceMap, authorConferencePaperMap, citationMap
        )

    return graph, authorMap, conferenceMap
def constructSkewedCitationPublicationExample(introduceRandomness=True, citationsPublicationsParameter=None):
    """
      Build the graph for an example with skewed citation / publication count ratios

      NOTE: Extraneous authors are omitted

      @param  introduceRandomness             If set, each configured count is replaced by a random
                                              count within roughly 10% of it
      @param  citationsPublicationsParameter  Optional map of author name to (citation count,
                                              publication count), replacing the default configuration.
                                              Names must be among the six authors created here.
      @return Tuple of (graph, authorMap, conference, actualCitationsPublications), where authorMap is
              indexed by name, and actualCitationsPublications maps each author object to the
              (citation count, publication count) actually added to the graph
    """

    graph = GraphFactory.createInstance()
    random.seed()

    # Create the authors & conference
    alice = Author(SampleGraphUtility.__getNextId(), "Alice")
    bob = Author(SampleGraphUtility.__getNextId(), "Bob")
    carol = Author(SampleGraphUtility.__getNextId(), "Carol")
    dave = Author(SampleGraphUtility.__getNextId(), "Dave")
    ed = Author(SampleGraphUtility.__getNextId(), "Ed")
    frank = Author(SampleGraphUtility.__getNextId(), "Frank")
    authors = [alice, bob, carol, dave, ed, frank]
    authorMap = {author.name: author for author in authors}
    conference = Conference(SampleGraphUtility.__getNextId(), "KDD")

    # Citation & publication count configuration
    if citationsPublicationsParameter is not None:
        citationsPublications = citationsPublicationsParameter
    else:
        citationsPublications = {
            "Alice": (100, 10),
            "Bob": (80, 10),
            "Carol": (100, 100),
            "Dave": (50, 10),
            "Ed": (10, 10),
            "Frank": (1000, 100),
        }

    actualCitationsPublications = defaultdict(lambda: (0, 0))

    def randomInterval(x):
        """ Bounds of the interval that a configured count of x may be randomized to """
        return x + int(-0.1 * x), x + int(0.1 * x)

    def addPublicationPaper(author):
        """
          Helper method to add a 'publication' paper, connected to both an author and a conference
        """
        nextId = SampleGraphUtility.__getNextId()
        paper = Paper(nextId, "%s's Paper %d" % (author.name, nextId))
        graph.addNode(paper)
        # NOTE(review): no edge object is passed to these calls -- presumably addBothEdges / addEdge
        # have a default for it; confirm
        graph.addBothEdges(author, paper)
        graph.addBothEdges(paper, conference)

        citationCount, publicationCount = actualCitationsPublications[author]
        actualCitationsPublications[author] = (citationCount, publicationCount + 1)

        return paper

    def addCitationPaper(citedPaper, citedAuthor):
        """
          Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
        """
        nextId = SampleGraphUtility.__getNextId()
        citingPaper = Paper(nextId, "Citing Paper %d" % nextId)
        graph.addNode(citingPaper)
        graph.addBothEdges(citingPaper, conference)
        graph.addEdge(citingPaper, citedPaper)

        citationCount, publicationCount = actualCitationsPublications[citedAuthor]
        actualCitationsPublications[citedAuthor] = (citationCount + 1, publicationCount)

    # Construct the graph
    graph.addNodes(authors + [conference])
    for authorName in citationsPublications:
        citationCount, publicationCount = citationsPublications[authorName]
        author = authorMap[authorName]

        # Optionally, introduce randomness
        if introduceRandomness:
            citationCount = random.randint(*randomInterval(citationCount))
            publicationCount = random.randint(*randomInterval(publicationCount))

        # Without any papers there is nothing to attach citations to (this also avoids dividing by zero)
        if publicationCount <= 0:
            continue

        # Add publications to author
        authorPapers = [addPublicationPaper(author) for _ in range(publicationCount)]

        # Spread the citations evenly over the papers; when they do not divide evenly, the first
        # few papers receive one extra citation each, so that the total is exactly citationCount
        citationsPerPaper, extraCitations = divmod(citationCount, publicationCount)
        for paperIndex, paper in enumerate(authorPapers):
            paperCitations = citationsPerPaper + (1 if paperIndex < extraCitations else 0)
            for _ in range(paperCitations):
                addCitationPaper(paper, author)

    return graph, authorMap, conference, actualCitationsPublications
def constructMultiDisciplinaryAuthorExample(indirectAuthor=False, uneven=False):
    """
      Construct example DBLP graph with two research areas (data mining at "KDD", databases at
      "VLDB") in which only a few authors ("D", "E" and "I") publish in both areas

      @param  indirectAuthor  If set, an extra author "J" is added to both areas, whose papers are
                              cited only by "E" and "I"
      @param  uneven          If set, every author gets one extra paper per area, which receives a
                              random number (0 to 3) of citations from each other author in that area
      @return Tuple of (graph, authorMap, conferenceMap, totalCitationCount), where the maps are
              indexed by name and totalCitationCount maps author name to citations fabricated
    """

    graph = GraphFactory.createInstance()

    # Add authors (ids are drawn in this order)
    authorNames = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]
    if indirectAuthor:
        authorNames.append("J")
    authors = [Author(SampleGraphUtility.__getNextId(), name) for name in authorNames]
    graph.addNodes(authors)

    # Add conferences
    vldb = Conference(SampleGraphUtility.__getNextId(), "VLDB")  # Databases
    kdd = Conference(SampleGraphUtility.__getNextId(), "KDD")  # Data mining
    conferences = [vldb, kdd]
    graph.addNodes(conferences)

    # Add author / conference index
    authorMap = {author.name: author for author in authors}
    conferenceMap = {conference.name: conference for conference in conferences}

    # Helper dictionary of citation counts to fabricate for each author, per research area. Authors
    # in both areas receive their count once per area, so with both flags off the totals are:
    # {'A':100, 'B':80, 'C':10, 'D':120, 'E':90, 'F':100, 'G':80, 'H':10, 'I':24}
    citationCounts = {"A": 100, "B": 80, "C": 10, "D": 60, "E": 45, "F": 100, "G": 80, "H": 10, "I": 12, "J": 60}

    # Author names & conference names of each research area
    dmAuthorNames = ["D", "E", "F", "G", "H", "I"]
    dbAuthorNames = ["A", "B", "C", "D", "E", "I"]
    if indirectAuthor:
        dmAuthorNames += ["J"]
        dbAuthorNames += ["J"]
    duplicateNames = set(dmAuthorNames).intersection(set(dbAuthorNames))
    dmConferenceNames = ["KDD"]
    dbConferenceNames = ["VLDB"]
    researchAreas = [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]

    # Make sure every author has an entry, even if no citations end up being created for them.
    # (An explicit loop is used because map() must not be relied upon for side effects.)
    totalCitationCount = defaultdict(int)
    for name in set(dmAuthorNames).union(set(dbAuthorNames)):
        totalCitationCount[name] = 0

    def addCitedPaper(authorName, conferenceName):
        """ Add a paper by the given author at the given conference, which citations will point to """
        citedPaper = Paper(SampleGraphUtility.__getNextId(), "%sPaperIn%s" % (authorName, conferenceName))
        graph.addNode(citedPaper)
        graph.addBothEdges(citedPaper, conferenceMap[conferenceName], Publication())
        graph.addBothEdges(citedPaper, authorMap[authorName], Authorship())
        return citedPaper

    def addCitingPaper(index, citingAuthorName, conferenceName, citedPaper):
        """ Add a fake paper by the citing author at the given conference, citing the given paper """
        citingPaper = Paper(
            SampleGraphUtility.__getNextId(),
            "Citation%d%sPaperIn%s" % (index, citingAuthorName, conferenceName),
        )
        graph.addNode(citingPaper)
        graph.addBothEdges(authorMap[citingAuthorName], citingPaper, Authorship())
        graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())
        graph.addEdge(citingPaper, citedPaper, Citation())

    # Create equal number of citations from each other author in the research area for each author's papers
    for areaAuthorNames, areaConferenceNames in researchAreas:
        for authorName in areaAuthorNames:

            # Add one paper to be cited for this author in each conference of the area
            citedPaperMap = {}
            for conferenceName in areaConferenceNames:
                citedPaper = addCitedPaper(authorName, conferenceName)
                citedPaperMap[conferenceName] = citedPaper

            # Authors in both areas are only cited by the single-area authors of this area,
            # everyone else is cited by all other authors of the area
            if authorName in duplicateNames:
                citingAuthors = set(areaAuthorNames).difference(duplicateNames)
            else:
                citingAuthors = set(areaAuthorNames)
                citingAuthors.remove(authorName)

            # Floor division: the result is used as a loop count, so it has to stay an integer
            citationsPerAuthor = citationCounts[authorName] // len(citingAuthors)

            # Make sure J is cited by the two non-D multi-disciplinary authors
            if authorName == "J":
                citationsPerAuthor = citationCounts[authorName] // 2
                citingAuthors = ["E", "I"]

            # Loop through papers of all other authors
            for otherAuthorName in citingAuthors:
                if authorName == otherAuthorName:
                    continue
                for conferenceName in areaConferenceNames:
                    for citationIndex in range(0, citationsPerAuthor):
                        addCitingPaper(citationIndex, otherAuthorName, conferenceName, citedPaperMap[conferenceName])
                        totalCitationCount[authorName] += 1

    if not uneven:
        return graph, authorMap, conferenceMap, totalCitationCount

    # If this flag is set, add one more paper per author in each area, with a random number of
    # citations from every other author in that area
    for areaAuthorNames, areaConferenceNames in researchAreas:

        # Add publications
        extraPapers = []
        for authorName in areaAuthorNames:
            for conferenceName in areaConferenceNames:
                extraPapers.append((authorName, addCitedPaper(authorName, conferenceName)))

        random.seed()

        # Add randomized citations from authors to these papers
        for citingAuthorName in areaAuthorNames:
            for conferenceName in areaConferenceNames:
                for citedAuthorName, citedPaper in extraPapers:

                    # Skip papers authored by this author
                    if citedAuthorName == citingAuthorName:
                        continue

                    # Randomly add a number of citations to this paper
                    for citationIndex in range(0, random.randint(0, 3)):
                        addCitingPaper(citationIndex, citingAuthorName, conferenceName, citedPaper)
                        totalCitationCount[citedAuthorName] += 1

    return graph, authorMap, conferenceMap, totalCitationCount
def buildGraph(self, parsedData):
    """
    Assemble the DBLP graph from the parsed paper records.

    :param parsedData: mapping of paper id to a record that is read through its 'title', 'references',
        'conference' and 'authors' entries
    :return: graph holding paper, author, conference and topic nodes; mention, authorship and publication edges
        are added in both directions, citation edges only from the citing paper to the cited paper
    """

    dblpGraph = GraphFactory.createInstance()

    # Lookup tables for nodes that already exist
    authorsByName = {}
    papersById = {}
    topicsByKeyword = {}
    conferencesByName = {}
    referencesByPaperId = {}  # Paper id -> ids of the papers it references

    # First pass: every node and every edge except the citations
    for paperId in parsedData:
        record = parsedData[paperId]
        title = record['title']
        paper = Paper(paperId, title)
        referencesByPaperId[paperId] = record['references']

        # Reuse the conference node if this name was seen before (new ids are the running count)
        venueName = record['conference']
        if venueName in conferencesByName:
            venue = conferencesByName[venueName]
        else:
            venue = Conference(len(conferencesByName), venueName)
            conferencesByName[venueName] = venue
            dblpGraph.addNode(venue)

        # Same get-or-create treatment for each author of the paper
        writers = []
        for writerName in record['authors']:
            if writerName in authorsByName:
                writer = authorsByName[writerName]
            else:
                writer = Author(len(authorsByName), writerName)
                authorsByName[writerName] = writer
                dblpGraph.addNode(writer)
            writers.append(writer)

        # Keywords pulled from the title act as topics, linked to the paper both ways
        for keyword in self.__extractKeywords(title):
            if keyword in topicsByKeyword:
                topic = topicsByKeyword[keyword]
            else:
                topic = Topic(len(topicsByKeyword), [keyword])
                topicsByKeyword[keyword] = topic
                dblpGraph.addNode(topic)
            for source, target in ((topic, paper), (paper, topic)):
                dblpGraph.addEdge(source, target, Mention())

        # Register the paper itself
        papersById[paperId] = paper
        dblpGraph.addNode(paper)

        # Authorship and publication edges, each direction with its own edge object
        for writer in writers:
            for source, target in ((paper, writer), (writer, paper)):
                dblpGraph.addEdge(source, target, Authorship())
        for source, target in ((paper, venue), (venue, paper)):
            dblpGraph.addEdge(source, target, Publication())

    # Second pass: citations, possible only now that every paper node exists
    for citingId, citedIds in referencesByPaperId.items():
        citingPaper = papersById[citingId]
        for citedId in citedIds:
            dblpGraph.addEdge(citingPaper, papersById[citedId], Citation())

    return dblpGraph
def testRetakingStudentAnalyses(self):
    """
    Checks that the graph is built correctly for a more involved analysis fixture, described as:
      * Two assignments, two analyses, two semesters
      * Five submissions, two by a single student (in two semesters)
      * One (cross semester) match between two submissions from the same student

    The match should be removed: the expected graph below holds four submissions and no match edge.
    """

    # CoMoTo input data
    analysisData = self.retakingStudentAnalysis

    # Expected nodes (only the first student is constructed with the extra fourth argument)
    students = [
        Student(10001, 'Smith, John', 'johnsmith', True),
        Student(10002, 'Doe, Jane', 'janedoe'),
        Student(10003, 'Smith, Joe', 'joesmith'),
        Student(10004, 'Smith, Alex', 'alexsmith'),
    ]
    john, jane, joe, alex = students
    submissions = [Submission(submissionId) for submissionId in (5002, 5003, 5004, 5005)]
    janesWork, joesWork, johnsWork, alexsWork = submissions
    assignments = [Assignment(1, 'MP1'), Assignment(2, 'MP2')]
    mp1, mp2 = assignments
    semesters = [Semester(7, 'Spring', 2011), Semester(8, 'Spring', 2012)]
    spring2011, spring2012 = semesters

    expectedGraph = GraphFactory.createInstance()
    for node in students + submissions + assignments + semesters:
        expectedGraph.addNode(node)

    # Expected symmetric edges, grouped by relation type
    for submission, assignment in [(janesWork, mp1), (joesWork, mp1), (johnsWork, mp2), (alexsWork, mp2)]:
        expectedGraph.addBothEdges(submission, assignment, AssignmentSubmission())
    for submission, student in [(janesWork, jane), (joesWork, joe), (johnsWork, john), (alexsWork, alex)]:
        expectedGraph.addBothEdges(submission, student, Authorship())
    for student, semester in [(john, spring2012), (jane, spring2011), (joe, spring2011), (alex, spring2012)]:
        expectedGraph.addBothEdges(student, semester, Enrollment())
    for semester, assignment in [(spring2011, mp1), (spring2012, mp2)]:
        expectedGraph.addBothEdges(semester, assignment, SemesterAssignment())

    # Build the graph under test and compare
    actualGraph = self.dataImporter.buildGraph(analysisData)
    self.assertGraphsEqual(expectedGraph, actualGraph)
def testCrossSemesterMatchAnalyses(self):
    """
    Checks that the graph is built correctly for a more involved analysis fixture, described as:
      * Two assignments, two analyses, two semesters
      * Four submissions
      * One (cross semester) match

    The expected graph keeps the match as a single directed edge between two submissions.
    """

    # CoMoTo input data
    analysisData = self.crossSemesterMatchAnalysis

    # Expected nodes
    students = [
        Student(10001, 'Smith, John', 'johnsmith'),
        Student(10002, 'Doe, Jane', 'janedoe'),
        Student(10003, 'Smith, Joe', 'joesmith'),
        Student(10004, 'Smith, Alex', 'alexsmith'),
    ]
    submissions = [Submission(submissionId) for submissionId in (5001, 5002, 5003, 5004)]
    firstSubmission, lastSubmission = submissions[0], submissions[3]
    assignments = [Assignment(1, 'MP1'), Assignment(2, 'MP2')]
    mp1, mp2 = assignments
    semesters = [Semester(7, 'Spring', 2011), Semester(8, 'Spring', 2012)]
    spring2011, spring2012 = semesters

    expectedGraph = GraphFactory.createInstance()
    for node in students + submissions + assignments + semesters:
        expectedGraph.addNode(node)

    # Expected symmetric edges: the first three entries pair with MP1 / Spring 2011, the last with MP2 / Spring 2012
    for submission, assignment in zip(submissions, [mp1, mp1, mp1, mp2]):
        expectedGraph.addBothEdges(submission, assignment, AssignmentSubmission())
    for submission, student in zip(submissions, students):
        expectedGraph.addBothEdges(submission, student, Authorship())
    for student, semester in zip(students, [spring2011, spring2011, spring2011, spring2012]):
        expectedGraph.addBothEdges(student, semester, Enrollment())
    for semester, assignment in zip(semesters, assignments):
        expectedGraph.addBothEdges(semester, assignment, SemesterAssignment())

    # The cross-semester match is the one asymmetric edge type (current submissions can match past
    # submissions, but not vice versa), so it is added in a single direction
    expectedGraph.addEdge(firstSubmission, lastSubmission, CrossSemesterMatch(5000, 72.0))

    # Build the graph under test and compare
    actualGraph = self.dataImporter.buildGraph(analysisData)
    self.assertGraphsEqual(expectedGraph, actualGraph)
def constructSkewedCitationPublicationExample(introduceRandomness=True, citationsPublicationsParameter=None):
    """
    Build the graph for an example with skewed citation / publication count ratios

    NOTE: Extraneous authors are omitted

    :param introduceRandomness: when true, every target count ``x`` is replaced by a random integer drawn from
        ``[x + int(-0.1 * x), x + int(0.1 * x)]`` before papers are generated
    :param citationsPublicationsParameter: optional map of author name to ``(citations, publications)`` targets;
        the built-in table is used when omitted. Every name must be one of the six authors created here,
        otherwise the author lookup raises ``KeyError``
    :return: tuple ``(graph, authorMap, conference, actualCitationsPublications)``, where ``authorMap`` is keyed
        by author name and ``actualCitationsPublications`` maps each author node to the
        ``(citations, publications)`` counts that were really added to the graph
    """

    graph = GraphFactory.createInstance()
    random.seed()

    # Create the authors & conference
    authors = [
        Author(SampleGraphUtility.__getNextId(), authorName)
        for authorName in ('Alice', 'Bob', 'Carol', 'Dave', 'Ed', 'Frank')
    ]
    authorMap = {author.name: author for author in authors}
    conference = Conference(SampleGraphUtility.__getNextId(), 'KDD')

    # Citation & publication count configuration
    if citationsPublicationsParameter is not None:
        citationsPublications = citationsPublicationsParameter
    else:
        citationsPublications = {
            'Alice': (100, 10),
            'Bob': (80, 10),
            'Carol': (100, 100),
            'Dave': (50, 10),
            'Ed': (10, 10),
            'Frank': (1000, 100)
        }
    actualCitationsPublications = defaultdict(lambda: (0, 0))

    def addPublicationPaper(author):
        """
          Helper method to add a 'publication' paper, connected to both an author and a conference
          (these edges are added without an edge object)
        """
        nextId = SampleGraphUtility.__getNextId()
        paper = Paper(nextId, "%s's Paper %d" % (author.name, nextId))
        graph.addNode(paper)
        graph.addBothEdges(author, paper)
        graph.addBothEdges(paper, conference)

        citationCount, publicationCount = actualCitationsPublications[author]
        actualCitationsPublications[author] = (citationCount, publicationCount + 1)

        return paper

    def addCitationPaper(citedPaper, citedAuthor):
        """
          Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
        """
        nextId = SampleGraphUtility.__getNextId()
        citingPaper = Paper(nextId, "Citing Paper %d" % nextId)
        graph.addNode(citingPaper)
        graph.addBothEdges(citingPaper, conference)
        graph.addEdge(citingPaper, citedPaper)

        citationCount, publicationCount = actualCitationsPublications[citedAuthor]
        actualCitationsPublications[citedAuthor] = (citationCount + 1, publicationCount)

    def randomizeCount(count):
        """
          Helper method to draw a random integer within roughly +/- 10% of the given count
        """
        return random.randint(count + int(-0.1 * count), count + int(0.1 * count))

    # Construct the graph
    graph.addNodes(authors + [conference])
    for authorName in citationsPublications:
        citationCount, publicationCount = citationsPublications[authorName]

        # Optionally, introduce randomness (citations are drawn before publications)
        if introduceRandomness:
            citationCount = randomizeCount(citationCount)
            publicationCount = randomizeCount(publicationCount)

        # Add publications to author
        author = authorMap[authorName]
        authorPapers = [addPublicationPaper(author) for _ in itertools.repeat(None, publicationCount)]

        # Without any paper there is nothing to cite (and no valid divisor below)
        if not authorPapers:
            continue

        # Spread the citations evenly; divmod keeps the per-paper share an integer on Python 2 and 3 alike
        citationsPerPaper, leftoverCitations = divmod(citationCount, publicationCount)
        for paper in authorPapers:
            for _ in itertools.repeat(None, citationsPerPaper):
                addCitationPaper(paper, author)

        # Attach whatever does not divide evenly exactly once, to the last paper, so the author's total
        # matches the (possibly randomized) citation count used above
        for _ in itertools.repeat(None, leftoverCitations):
            addCitationPaper(authorPapers[-1], author)

    return graph, authorMap, conference, actualCitationsPublications
def constructPathSimExampleThree(extraAuthorsAndCitations=False, citationMap=None):
    """
    Build the graph of "Example 3" from the PathSim publication, leaving out topic nodes

    @see http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.220.2455

    :param extraAuthorsAndCitations: when true, the authors Joe and Nancy are added as well and citations are
        constructed through the citation helper
    :param citationMap: handed unchanged to the citation helper; only read when ``extraAuthorsAndCitations``
        is true
    :return: tuple ``(graph, authorMap, conferenceMap)``, both maps keyed by node name
    """

    graph = GraphFactory.createInstance()

    # Authors (ids are handed out in list order)
    authors = [Author(SampleGraphUtility.__getNextId(), name) for name in ('Mike', 'Jim', 'Mary', 'Bob', 'Ann')]
    mike, jim, mary, bob, ann = authors
    joe = nancy = None
    if extraAuthorsAndCitations:
        joe = Author(SampleGraphUtility.__getNextId(), 'Joe')
        nancy = Author(SampleGraphUtility.__getNextId(), 'Nancy')
        authors.extend([joe, nancy])
    graph.addNodes(authors)

    # Conferences
    conferences = [
        Conference(SampleGraphUtility.__getNextId(), title) for title in ('SIGMOD', 'VLDB', 'ICDE', 'KDD')
    ]
    sigmod, vldb, icde, kdd = conferences
    graph.addNodes(list(conferences))

    # Name -> node indexes returned to the caller
    authorMap = dict((author.name, author) for author in authors)
    conferenceMap = dict((conference.name, conference) for conference in conferences)

    # Author -> conference -> papers index
    authorConferencePaperMap = defaultdict(lambda: defaultdict(list))

    # Jim: papers numbered 1-50 go to SIGMOD, 51-70 to VLDB
    for paperNumber in xrange(1, 71):
        venue = sigmod if paperNumber <= 50 else vldb
        paper = Paper(SampleGraphUtility.__getNextId(), '%s Paper %d' % (venue.name, paperNumber))
        graph.addNode(paper)
        graph.addBothEdges(jim, paper, Authorship())
        graph.addBothEdges(paper, venue, Publication())
        authorConferencePaperMap[jim][venue].append(paper)

    # Ann: one paper each in ICDE and KDD (attached through edges only, there is no explicit addNode call)
    annsPapers = [
        (Paper(SampleGraphUtility.__getNextId(), 'ICDE Paper'), icde),
        (Paper(SampleGraphUtility.__getNextId(), 'KDD Paper'), kdd),
    ]
    for paper, _ in annsPapers:
        graph.addBothEdges(ann, paper, Authorship())
    for paper, venue in annsPapers:
        graph.addBothEdges(paper, venue, Publication())
    for paper, venue in annsPapers:
        authorConferencePaperMap[ann][venue].append(paper)

    # Remaining authors get their (2,1) paper numbers from the shared helper
    for author, firstVenue, secondVenue in [(mike, sigmod, vldb), (mary, sigmod, icde), (bob, sigmod, vldb)]:
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, author, firstVenue, secondVenue, authorConferencePaperMap)

    # Optional extra authors & citation data
    if extraAuthorsAndCitations:
        for author in (joe, nancy):
            SampleGraphUtility.__addSimilarAuthorsPapers(graph, author, sigmod, vldb, authorConferencePaperMap)
        SampleGraphUtility.__constructCitations(graph, authorMap, conferenceMap, authorConferencePaperMap, citationMap)

    return graph, authorMap, conferenceMap