def setUp(self):
        """
          Build the template graph shared by the tests and create the meta path
          utility under test.

          Every node object is stored on the test instance so that individual
          tests can refer to it directly; the finished graph is exposed as
          self.templateGraph.
        """

        # Show complete diffs when an assertion fails
        self.maxDiff = None

        # Construct template graph for tests
        graph = GraphFactory.createInstance()

        # Put references to graph objects on test object
        self.author = Author(0, 'author')
        self.coauthor = Author(1, 'coauthor')
        self.conference1 = Conference(0, 'conference1')
        self.conference2 = Conference(1, 'conference2')
        self.paper1 = Paper(0, 'paper1')
        self.paper2 = Paper(1, 'paper2')
        self.paper3 = Paper(2, 'paper3')

        # Construct graph. The coauthor is registered together with the other
        # nodes: it is an endpoint of an authorship edge below, so it should not
        # depend on the edge call to bring it into the graph.
        graph.addNodes([
            self.author, self.coauthor, self.conference1, self.conference2,
            self.paper1, self.paper2, self.paper3
        ])

        # The author wrote all three papers; paper3 is shared with the coauthor
        graph.addBothEdges(self.paper1, self.author, Authorship())
        graph.addBothEdges(self.paper2, self.author, Authorship())
        graph.addBothEdges(self.paper3, self.author, Authorship())
        graph.addBothEdges(self.paper3, self.coauthor, Authorship())

        # paper1 and paper2 appear in conference1, paper3 in conference2
        graph.addBothEdges(self.paper1, self.conference1, Publication())
        graph.addBothEdges(self.paper2, self.conference1, Publication())
        graph.addBothEdges(self.paper3, self.conference2, Publication())

        # Note the asymmetry: the paper1 -> paper2 citation is added with
        # addEdge, while paper2 / paper3 use addBothEdges
        graph.addEdge(self.paper1, self.paper2, Citation())
        graph.addBothEdges(self.paper2, self.paper3, Citation())

        self.templateGraph = graph

        # The concrete utility is supplied by _getImplementation()
        self.metaPathUtility = self._getImplementation()
    def setUp(self):
        """
          Build the template graph shared by the tests and create the meta path
          utility under test.

          Every node object is stored on the test instance so that individual
          tests can refer to it directly; the finished graph is exposed as
          self.templateGraph.
        """

        # Show complete diffs when an assertion fails
        self.maxDiff = None

        # Construct template graph for tests
        graph = GraphFactory.createInstance()

        # Put references to graph objects on test object
        self.author = Author(0, "author")
        self.coauthor = Author(1, "coauthor")
        self.conference1 = Conference(0, "conference1")
        self.conference2 = Conference(1, "conference2")
        self.paper1 = Paper(0, "paper1")
        self.paper2 = Paper(1, "paper2")
        self.paper3 = Paper(2, "paper3")

        # Construct graph. The coauthor is registered together with the other
        # nodes: it is an endpoint of an authorship edge below, so it should not
        # depend on the edge call to bring it into the graph.
        graph.addNodes([
            self.author,
            self.coauthor,
            self.conference1,
            self.conference2,
            self.paper1,
            self.paper2,
            self.paper3,
        ])

        # The author wrote all three papers; paper3 is shared with the coauthor
        graph.addBothEdges(self.paper1, self.author, Authorship())
        graph.addBothEdges(self.paper2, self.author, Authorship())
        graph.addBothEdges(self.paper3, self.author, Authorship())
        graph.addBothEdges(self.paper3, self.coauthor, Authorship())

        # paper1 and paper2 appear in conference1, paper3 in conference2
        graph.addBothEdges(self.paper1, self.conference1, Publication())
        graph.addBothEdges(self.paper2, self.conference1, Publication())
        graph.addBothEdges(self.paper3, self.conference2, Publication())

        # Note the asymmetry: the paper1 -> paper2 citation is added with
        # addEdge, while paper2 / paper3 use addBothEdges
        graph.addEdge(self.paper1, self.paper2, Citation())
        graph.addBothEdges(self.paper2, self.paper3, Citation())

        self.templateGraph = graph

        # The concrete utility is supplied by _getImplementation()
        self.metaPathUtility = self._getImplementation()
Esempio n. 3
0
    def testCoAuthorsGraph(self):
        """
          Two papers with the same title published at different conferences: the first is
          co-authored by 'Author One' and 'Author Three', the second is written by
          'Author Two' alone. The expected graph holds a single topic node that both
          papers mention.
        """

        # Input records, keyed by paper id
        coAuthoredRecord = {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One', 'Author Three'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        }
        soloRecord = {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [],
            'title': 'Databases',
            'year': 1999
        }
        parsedData = {0: coAuthoredRecord, 1: soloRecord}

        expectedGraph = GraphFactory.createInstance()

        # Ids that the input does not specify are expected to auto-increment
        authorOne = Author(0, 'Author One')
        authorTwo = Author(2, 'Author Two')
        authorThree = Author(1, 'Author Three')
        coAuthoredPaper = Paper(0, 'Databases')
        soloPaper = Paper(1, 'Databases')
        sharedTopic = Topic(0, ['databas'])
        conferenceOne = Conference(0, 'Conference One')
        conferenceTwo = Conference(1, 'Conference Two')

        expectedNodes = [
            authorOne, authorTwo, authorThree, coAuthoredPaper, soloPaper,
            sharedTopic, conferenceOne, conferenceTwo
        ]
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # (endpoint, endpoint, edge type) -- a fresh edge object is created per pair
        expectedEdges = [
            (authorOne, coAuthoredPaper, Authorship),
            (authorThree, coAuthoredPaper, Authorship),
            (authorTwo, soloPaper, Authorship),
            (coAuthoredPaper, sharedTopic, Mention),
            (soloPaper, sharedTopic, Mention),
            (coAuthoredPaper, conferenceOne, Publication),
            (soloPaper, conferenceTwo, Publication),
        ]
        for first, second, edgeType in expectedEdges:
            expectedGraph.addBothEdges(first, second, edgeType())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
    def testCoAuthorsGraph(self):
        """
          Importer scenario with three authors, two of whom co-author the first paper.
          The second paper has a single author, carries the same title and is published
          at a different conference.
        """

        # Sample data handed to the importer (paper id -> record)
        parsedData = {}
        parsedData[0] = {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One', 'Author Three'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        }
        parsedData[1] = {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [],
            'title': 'Databases',
            'year': 1999
        }

        expectedGraph = GraphFactory.createInstance()

        # Expected nodes; ids missing from the input are expected to auto-increment
        firstAuthor = Author(0, 'Author One')
        loneAuthor = Author(2, 'Author Two')
        secondAuthor = Author(1, 'Author Three')
        jointPaper = Paper(0, 'Databases')
        lonePaper = Paper(1, 'Databases')
        topic = Topic(0, ['databas'])
        jointVenue = Conference(0, 'Conference One')
        loneVenue = Conference(1, 'Conference Two')
        for node in (firstAuthor, loneAuthor, secondAuthor, jointPaper, lonePaper,
                     topic, jointVenue, loneVenue):
            expectedGraph.addNode(node)

        # Expected edges, grouped by edge type
        for writer, paper in ((firstAuthor, jointPaper), (secondAuthor, jointPaper), (loneAuthor, lonePaper)):
            expectedGraph.addBothEdges(writer, paper, Authorship())
        for paper in (jointPaper, lonePaper):
            expectedGraph.addBothEdges(paper, topic, Mention())
        for paper, venue in ((jointPaper, jointVenue), (lonePaper, loneVenue)):
            expectedGraph.addBothEdges(paper, venue, Publication())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
    def testMutualCitationGraph(self):
        """
          Two single-author papers that reference each other. Because each cites the
          other, the expected citation edges run in both directions.
        """

        # Input records, keyed by paper id; each record references the other paper
        firstRecord = {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [1],
            'title': 'Databases',
            'year': 1999
        }
        secondRecord = {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [0],
            'title': 'Databases',
            'year': 1999
        }
        parsedData = {0: firstRecord, 1: secondRecord}

        expectedGraph = GraphFactory.createInstance()

        # Ids that the input does not specify are expected to auto-increment
        authorOne = Author(0, 'Author One')
        authorTwo = Author(1, 'Author Two')
        firstPaper = Paper(0, 'Databases')
        secondPaper = Paper(1, 'Databases')
        sharedTopic = Topic(0, ['databas'])
        conferenceOne = Conference(0, 'Conference One')
        conferenceTwo = Conference(1, 'Conference Two')

        expectedNodes = [
            authorOne, authorTwo, firstPaper, secondPaper, sharedTopic,
            conferenceOne, conferenceTwo
        ]
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # (endpoint, endpoint, edge type) -- a fresh edge object is created per pair
        expectedEdges = [
            (authorOne, firstPaper, Authorship),
            (authorTwo, secondPaper, Authorship),
            (firstPaper, sharedTopic, Mention),
            (secondPaper, sharedTopic, Mention),
            (firstPaper, conferenceOne, Publication),
            (secondPaper, conferenceTwo, Publication),
        ]
        for first, second, edgeType in expectedEdges:
            expectedGraph.addBothEdges(first, second, edgeType())

        # Citations are symmetric only because the two papers cite each other
        expectedGraph.addBothEdges(firstPaper, secondPaper, Citation())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
Esempio n. 6
0
    def testMutualCitationGraph(self):
        """
          Importer scenario in which two papers, each with its own author and
          conference, list one another in their references.
        """

        # Sample data handed to the importer (paper id -> record)
        parsedData = {}
        parsedData[0] = {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [1],
            'title': 'Databases',
            'year': 1999
        }
        parsedData[1] = {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference Two',
            'references': [0],
            'title': 'Databases',
            'year': 1999
        }

        expectedGraph = GraphFactory.createInstance()

        # Expected nodes; ids missing from the input are expected to auto-increment
        writerA = Author(0, 'Author One')
        writerB = Author(1, 'Author Two')
        paperA = Paper(0, 'Databases')
        paperB = Paper(1, 'Databases')
        topic = Topic(0, ['databas'])
        venueA = Conference(0, 'Conference One')
        venueB = Conference(1, 'Conference Two')
        for node in (writerA, writerB, paperA, paperB, topic, venueA, venueB):
            expectedGraph.addNode(node)

        # Expected edges, grouped by edge type
        for writer, paper in ((writerA, paperA), (writerB, paperB)):
            expectedGraph.addBothEdges(writer, paper, Authorship())
        for paper in (paperA, paperB):
            expectedGraph.addBothEdges(paper, topic, Mention())
        for paper, venue in ((paperA, venueA), (paperB, venueB)):
            expectedGraph.addBothEdges(paper, venue, Publication())

        # Both directions are expected here only because the references are mutual
        expectedGraph.addBothEdges(paperA, paperB, Citation())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
    def parseNodeContent(self, nodeIndex):
        """
          Create a graph and feed it the author, conference, paper and term files.

          Each file is processed by __parseNodeType, which receives a line parser
          returning an (id, node) pair. nodeIndex is handed to every one of those
          calls and returned alongside the graph (presumably filled in there --
          confirm against __parseNodeType).

          @param  nodeIndex   Index object forwarded to __parseNodeType
          @return Tuple of (graph, nodeIndex)
        """

        graph = GraphFactory.createInstance()

        def makeIdNameParser(nodeClass):
            """ Build a parser for lines of the form '<id> <name words...>' """

            def parseLine(line):
                tokens = line.split()
                nodeId = int(self.__removeControlCharacters(tokens[0]))
                name = " ".join(tokens[1:])
                return nodeId, nodeClass(nodeId, name)

            return parseLine

        # Authors, conferences and papers all share the same line layout
        idNameSources = (
            (Author, "author", "author.txt"),
            (Conference, "conference", "conf.txt"),
            (Paper, "paper", "paper.txt"),
        )
        for nodeClass, typeName, fileName in idNameSources:
            self.__parseNodeType(makeIdNameParser(nodeClass), typeName, fileName, graph, nodeIndex)

        # Terms: distinct words can stem to the same string, so one topic object
        # is kept per stem and reused for every later line that collides with it
        topicsByStem = {}

        def termLineParser(line):
            rawId, rawTerm = line.split()
            topicId = int(self.__removeControlCharacters(rawId))
            stem = self.stemmer.stemWord(rawTerm)

            if stem in topicsByStem:
                return topicId, topicsByStem[stem]

            # Stop words produce no topic at all
            if stem in self.stopWords:
                return topicId, None

            topic = Topic(topicId, [stem])
            topicsByStem[stem] = topic
            return topicId, topic

        self.__parseNodeType(termLineParser, "topic", "term.txt", graph, nodeIndex)

        return graph, nodeIndex
    def buildGraph(self, coMoToData):
        """
          Build a graph from CoMoTo data in two passes.

          @param  coMoToData  Data forwarded unchanged to both population helpers
          @return The graph after both helpers have run
        """

        graph = GraphFactory.createInstance()

        # First pass: semesters and assignments. It hands back the two lookup maps
        # that the second pass consumes.
        assignmentsByAnalysisId, semestersByOfferingId = \
            self.__addSemestersAndAssignmentsToGraph(coMoToData, graph)

        # Second pass: submissions & students, linked to each other and to the
        # assignments / semesters found in the first pass
        self.addStudentsAndSemestersToGraph(
            assignmentsByAnalysisId, coMoToData, graph, semestersByOfferingId
        )

        return graph
    def parseNodeContent(self, nodeIndex):
        """
          Create a graph and feed it the author, conference, paper and term files.

          Each file is processed by __parseNodeType, which receives a line parser
          returning an (id, node) pair. nodeIndex is handed to every one of those
          calls and returned alongside the graph (presumably filled in there --
          confirm against __parseNodeType).

          @param  nodeIndex   Index object forwarded to __parseNodeType
          @return Tuple of (graph, nodeIndex)
        """

        graph = GraphFactory.createInstance()

        def splitIdAndName(line):
            """ Split a '<id> <name words...>' line into (int id, name) """
            fields = line.split()
            return int(self.__removeControlCharacters(fields[0])), ' '.join(fields[1:])

        # Parse authors from file
        def authorLineParser(line):
            authorId, authorName = splitIdAndName(line)
            return authorId, Author(authorId, authorName)
        self.__parseNodeType(authorLineParser, 'author', 'author.txt', graph, nodeIndex)

        # Parse conferences from file
        def conferenceLineParser(line):
            conferenceId, conferenceName = splitIdAndName(line)
            return conferenceId, Conference(conferenceId, conferenceName)
        self.__parseNodeType(conferenceLineParser, 'conference', 'conf.txt', graph, nodeIndex)

        # Parse papers
        def paperLineParser(line):
            paperId, paperTitle = splitIdAndName(line)
            return paperId, Paper(paperId, paperTitle)
        self.__parseNodeType(paperLineParser, 'paper', 'paper.txt', graph, nodeIndex)

        # Parse terms. Different words may share a stem; the first topic created
        # for a stem is reused for all later lines with that stem.
        knownTopics = {}
        def termLineParser(line):
            idField, word = line.split()
            topicId = int(self.__removeControlCharacters(idField))
            stemmed = self.stemmer.stemWord(word)

            topic = None
            if stemmed in knownTopics:
                topic = knownTopics[stemmed]
            elif stemmed not in self.stopWords:
                # Stop words never get a topic; anything else is remembered
                topic = Topic(topicId, [stemmed])
                knownTopics[stemmed] = topic
            return topicId, topic
        self.__parseNodeType(termLineParser, 'topic', 'term.txt', graph, nodeIndex)

        return graph, nodeIndex
Esempio n. 10
0
    def testSeparatePapersAuthorsTopicSharedConferenceGraph(self):
        """
          Two papers with different authors and different titles that were published
          at the same conference. The expected graph has one topic per paper and a
          single conference node.
        """

        # Input records, keyed by paper id
        firstRecord = {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        }
        secondRecord = {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference One',
            'references': [],
            'title': 'All The Knowledge',
            'year': 1999
        }
        parsedData = {0: firstRecord, 1: secondRecord}

        expectedGraph = GraphFactory.createInstance()

        # Ids that the input does not specify are expected to auto-increment
        authorOne = Author(0, 'Author One')
        authorTwo = Author(1, 'Author Two')
        databasesPaper = Paper(0, 'Databases')
        knowledgePaper = Paper(1, 'All The Knowledge')
        databasesTopic = Topic(0, ['databas'])
        knowledgeTopic = Topic(1, ['knowledg'])
        sharedConference = Conference(0, 'Conference One')

        expectedNodes = [
            authorOne, authorTwo, databasesPaper, knowledgePaper,
            databasesTopic, knowledgeTopic, sharedConference
        ]
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # (endpoint, endpoint, edge type) -- a fresh edge object is created per pair
        expectedEdges = [
            (authorOne, databasesPaper, Authorship),
            (authorTwo, knowledgePaper, Authorship),
            (databasesPaper, databasesTopic, Mention),
            (knowledgePaper, knowledgeTopic, Mention),
            (databasesPaper, sharedConference, Publication),
            (knowledgePaper, sharedConference, Publication),
        ]
        for first, second, edgeType in expectedEdges:
            expectedGraph.addBothEdges(first, second, edgeType())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
    def testSeparatePapersAuthorsTopicSharedConferenceGraph(self):
        """
          Importer scenario where authors, papers and topics are all distinct and only
          the conference is common to both papers.
        """

        # Sample data handed to the importer (paper id -> record)
        parsedData = {}
        parsedData[0] = {
            'id': 0,
            'arnetid': 1,
            'authors': ['Author One'],
            'conference': 'Conference One',
            'references': [],
            'title': 'Databases',
            'year': 1995
        }
        parsedData[1] = {
            'id': 1,
            'arnetid': 2,
            'authors': ['Author Two'],
            'conference': 'Conference One',
            'references': [],
            'title': 'All The Knowledge',
            'year': 1999
        }

        expectedGraph = GraphFactory.createInstance()

        # Expected nodes; ids missing from the input are expected to auto-increment
        writerA = Author(0, 'Author One')
        writerB = Author(1, 'Author Two')
        paperA = Paper(0, 'Databases')
        paperB = Paper(1, 'All The Knowledge')
        topicA = Topic(0, ['databas'])
        topicB = Topic(1, ['knowledg'])
        venue = Conference(0, 'Conference One')
        for node in (writerA, writerB, paperA, paperB, topicA, topicB, venue):
            expectedGraph.addNode(node)

        # Expected edges, grouped by edge type
        for writer, paper in ((writerA, paperA), (writerB, paperB)):
            expectedGraph.addBothEdges(writer, paper, Authorship())
        for paper, topic in ((paperA, topicA), (paperB, topicB)):
            expectedGraph.addBothEdges(paper, topic, Mention())
        for paper in (paperA, paperB):
            expectedGraph.addBothEdges(paper, venue, Publication())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
    def testSolutionMatchAnalysis(self):
        """
          Checks the graph built from the solution-match analysis fixture:

            * One assignment, one analysis, one semester
            * Three student submissions plus one solution submission
            * One match between a student submission and the solution
        """

        # Fixture under test
        analysisData = self.solutionMatchAnalysis

        # Expected nodes
        students = [
            Student(10001, 'Smith, John', 'johnsmith'),
            Student(10002, 'Doe, Jane', 'janedoe'),
            Student(10003, 'Smith, Joe', 'joesmith'),
        ]
        submissions = [Submission(5001), Submission(5002), Submission(5003)]
        solutionSubmission = Submission(5004, None, True)
        assignment = Assignment(1, 'MP1')
        semester = Semester(7, 'Spring', 2011)

        expectedGraph = GraphFactory.createInstance()
        for node in students + submissions + [solutionSubmission, assignment, semester]:
            expectedGraph.addNode(node)

        # Every submission, the solution included, belongs to the assignment
        for submission in submissions + [solutionSubmission]:
            expectedGraph.addBothEdges(submission, assignment, AssignmentSubmission())

        # Submission k was written by student k
        for submission, student in zip(submissions, students):
            expectedGraph.addBothEdges(submission, student, Authorship())

        # All students are enrolled in the single semester
        for student in students:
            expectedGraph.addBothEdges(student, semester, Enrollment())

        # The first submission matches the solution
        expectedGraph.addBothEdges(submissions[0], solutionSubmission, SolutionMatch(5000, 80))
        expectedGraph.addBothEdges(semester, assignment, SemesterAssignment())

        actualGraph = self.dataImporter.buildGraph(analysisData)

        self.assertGraphsEqual(expectedGraph, actualGraph)
Esempio n. 13
0
    def buildGraph(self, coMoToData):
        """
          Build a graph from CoMoTo data in two passes.

          @param  coMoToData  Data forwarded unchanged to both population helpers
          @return The graph after both helpers have run
        """

        graph = GraphFactory.createInstance()

        # Pass one adds semesters and assignments and yields two lookup maps
        lookupMaps = self.__addSemestersAndAssignmentsToGraph(coMoToData, graph)
        assignmentLookup, semesterLookup = lookupMaps

        # Pass two adds submissions & students and connects them, using the maps
        # produced by pass one
        self.addStudentsAndSemestersToGraph(assignmentLookup, coMoToData, graph, semesterLookup)

        return graph
    def testInvalidSameSemesterMatchAnalysis(self):
        """
          Checks the graph built from the invalid same-semester analysis fixture:

            * One assignment, one analysis, one semester
            * Three submissions
            * One match between two submissions of the same semester

          The fixture additionally carries extraneous data, none of which may show up
          in the resulting graph.
        """

        # Fixture under test
        analysisData = self.invalidSameSemesterMatchAnalysis

        # Expected nodes
        students = [
            Student(10001, 'Smith, John', 'johnsmith'),
            Student(10002, 'Doe, Jane', 'janedoe'),
            Student(10003, 'Smith, Joe', 'joesmith'),
        ]
        submissions = [Submission(5001), Submission(5002), Submission(5003)]
        assignment = Assignment(1, 'MP1')
        semester = Semester(7, 'Spring', 2011)

        expectedGraph = GraphFactory.createInstance()
        for node in students + submissions + [assignment, semester]:
            expectedGraph.addNode(node)

        # Every submission belongs to the assignment
        for submission in submissions:
            expectedGraph.addBothEdges(submission, assignment, AssignmentSubmission())

        # Submission k was written by student k
        for submission, student in zip(submissions, students):
            expectedGraph.addBothEdges(submission, student, Authorship())

        # All students are enrolled in the single semester
        for student in students:
            expectedGraph.addBothEdges(student, semester, Enrollment())

        # The only match expected to survive links the first and third submissions
        expectedGraph.addBothEdges(submissions[0], submissions[2], SameSemesterMatch(5000, 72.0))
        expectedGraph.addBothEdges(semester, assignment, SemesterAssignment())

        actualGraph = self.dataImporter.buildGraph(analysisData)

        self.assertGraphsEqual(expectedGraph, actualGraph)
Esempio n. 15
0
    def constructMultiDisciplinaryAuthorExample(indirectAuthor=False, uneven=False):
        """
            Construct example DBLP graph with two research areas, data mining (KDD) and
            databases (VLDB). Authors D, E and I publish in both areas; every other
            author publishes in exactly one.

            Each author gets one paper per conference of each area they belong to, and
            that paper is cited by freshly created papers of the other authors in the area.

            @param  indirectAuthor  When set, an extra author J is added to both areas;
                                    J's papers are cited by E and I only
            @param  uneven          When set, each author gets one additional paper per
                                    conference of their area, and every other author of
                                    that area cites it a random number (0 to 3) of times.
                                    The random module is re-seeded first, so the result
                                    differs between runs.
            @return Tuple of (graph, authorMap, conferenceMap, totalCitationCount), where
                    the maps are keyed by name and totalCitationCount maps an author name
                    to the number of citation edges created towards that author's papers
        """

        graph = GraphFactory.createInstance()
        authorMap = {}
        conferenceMap = {}

        # Add authors A..I (plus J on request)
        authors = [Author(SampleGraphUtility.__getNextId(), name) for name in 'ABCDEFGHI']
        if indirectAuthor:
            authors.append(Author(SampleGraphUtility.__getNextId(), 'J'))
        graph.addNodes(authors)

        # Add conferences
        vldb = Conference(SampleGraphUtility.__getNextId(), 'VLDB')  # Databases
        kdd = Conference(SampleGraphUtility.__getNextId(), 'KDD')  # Data mining
        conferences = [vldb, kdd]
        graph.addNodes(conferences)

        # Add author / conference index
        for author in authors:
            authorMap[author.name] = author
        for conference in conferences:
            conferenceMap[conference.name] = conference

        # Citation budget per author and research area. The budget is split evenly
        # (floor division) among the citing authors, and authors active in both areas
        # collect it once per area. With indirectAuthor unset this gives the totals
        # {'A':100, 'B':80, 'C':10, 'D':120, 'E':90, 'F':100, 'G':80, 'H':10, 'I':24}
        citationCounts = {'A': 100, 'B': 80, 'C': 10, 'D': 60, 'E': 45, 'F': 100, 'G': 80, 'H': 10, 'I': 12, 'J': 60}

        # Membership of the two research areas
        dmAuthorNames = ['D', 'E', 'F', 'G', 'H', 'I']
        dbAuthorNames = ['A', 'B', 'C', 'D', 'E', 'I']
        if indirectAuthor:
            dmAuthorNames += ['J']
            dbAuthorNames += ['J']
        duplicateNames = set(dmAuthorNames).intersection(set(dbAuthorNames))
        dmConferenceNames = ['KDD']
        dbConferenceNames = ['VLDB']
        researchAreas = [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]

        # Start every author at an explicit zero. This is a plain loop rather than a
        # side-effecting map() call, which would never run where map() is lazy.
        totalCitationCount = defaultdict(int)
        for name in set(dmAuthorNames).union(set(dbAuthorNames)):
            totalCitationCount[name] = 0

        def addAuthoredPaper(authorName, conferenceName):
            """ Add a paper by the given author, published at the given conference """
            paper = Paper(SampleGraphUtility.__getNextId(), '%sPaperIn%s' % (authorName, conferenceName))
            graph.addNode(paper)
            graph.addBothEdges(paper, conferenceMap[conferenceName], Publication())
            graph.addBothEdges(paper, authorMap[authorName], Authorship())
            return paper

        def addCitation(index, citingAuthorName, conferenceName, citedAuthorName, citedPaper):
            """ Add a fake paper by the citing author that cites citedPaper, and count it """
            citingPaper = Paper(
                SampleGraphUtility.__getNextId(),
                'Citation%d%sPaperIn%s' % (index, citingAuthorName, conferenceName)
            )
            graph.addNode(citingPaper)
            graph.addBothEdges(authorMap[citingAuthorName], citingPaper, Authorship())
            graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())

            # The citation itself is one-directional
            graph.addEdge(citingPaper, citedPaper, Citation())
            totalCitationCount[citedAuthorName] += 1

        # Create equal number of citations from each other paper in the research area for each author's papers
        for authorNames, conferenceNames in researchAreas:
            for authorName in authorNames:

                # Add paper to be cited for author, one per conference of the area
                citedPaperMap = {}
                for conferenceName in conferenceNames:
                    citedPaperMap[conferenceName] = addAuthoredPaper(authorName, conferenceName)

                # Figure out who cites this author: authors active in both areas are
                # cited by the single-area authors only, everyone else by all other
                # authors of the area
                if authorName in duplicateNames:
                    citingAuthors = set(authorNames).difference(duplicateNames)
                else:
                    citingAuthors = set(authorNames)
                    citingAuthors.remove(authorName)

                # Floor division keeps the loop bound below an integer
                citationsPerAuthor = citationCounts[authorName] // len(citingAuthors)

                # Make sure J is cited by the two non-D multi-disciplinary authors
                if authorName == 'J':
                    citationsPerAuthor = citationCounts[authorName] // 2
                    citingAuthors = ['E', 'I']

                # Loop through papers of all other authors
                for otherAuthorName in citingAuthors:
                    if authorName == otherAuthorName:
                        continue
                    for conferenceName in conferenceNames:
                        for citationIndex in range(0, citationsPerAuthor):
                            addCitation(
                                citationIndex, otherAuthorName, conferenceName,
                                authorName, citedPaperMap[conferenceName]
                            )

        if not uneven:
            return graph, authorMap, conferenceMap, totalCitationCount

        # If this flag is set, add one more paper per author and conference in each
        # research area, with randomized citations from the other authors of the area
        for authorNamesList, conferenceNamesList in researchAreas:

            # Add publications
            extraPapers = []
            for authorName in authorNamesList:
                for conferenceName in conferenceNamesList:
                    extraPapers.append((authorName, addAuthoredPaper(authorName, conferenceName)))

            # Re-seed without an argument, so the citation counts below are not reproducible
            random.seed()

            # Add randomized citations from authors to these papers
            for citingAuthorName in authorNamesList:
                for conferenceName in conferenceNamesList:
                    for citedAuthorName, citedPaper in extraPapers:

                        # Skip papers authored by this author
                        if citedAuthorName == citingAuthorName:
                            continue

                        # Randomly add a number of citations to this paper
                        for citationIndex in range(0, random.randint(0, 3)):
                            addCitation(
                                citationIndex, citingAuthorName, conferenceName,
                                citedAuthorName, citedPaper
                            )

        return graph, authorMap, conferenceMap, totalCitationCount
Esempio n. 16
0
    def constructPathSimExampleThree(extraAuthorsAndCitations=False, citationMap=None):
        """
          Constructs "Example 3" from PathSim publication, ignoring topic nodes

            @see    http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.220.2455

            @param  extraAuthorsAndCitations    If true, also add the authors Joe & Nancy (with their papers), and
                                                construct citations through the citation helper
            @param  citationMap                 Handed unchanged to the citation construction helper (only read
                                                when extraAuthorsAndCitations is true)

            @return Tuple of (graph, author nodes indexed by name, conference nodes indexed by name)
        """

        graph = GraphFactory.createInstance()

        # Add authors (ids are drawn in list order, with the optional extra authors last)
        authorNames = ["Mike", "Jim", "Mary", "Bob", "Ann"]
        if extraAuthorsAndCitations:
            authorNames += ["Joe", "Nancy"]
        authors = [Author(SampleGraphUtility.__getNextId(), authorName) for authorName in authorNames]
        mike, jim, mary, bob, ann = authors[:5]
        joe, nancy = authors[5:] if extraAuthorsAndCitations else (None, None)
        graph.addNodes(authors)

        # Add conferences
        conferences = [
            Conference(SampleGraphUtility.__getNextId(), conferenceName)
            for conferenceName in ("SIGMOD", "VLDB", "ICDE", "KDD")
        ]
        sigmod, vldb, icde, kdd = conferences
        graph.addNodes(conferences)

        # Add author / conference index
        authorMap = {author.name: author for author in authors}
        conferenceMap = {conference.name: conference for conference in conferences}

        # Add author / conference / papers index (author node => conference node => list of papers)
        authorConferencePaperMap = defaultdict(lambda: defaultdict(list))

        # Add jim's papers: 50 in SIGMOD followed by 20 in VLDB
        for paperNumber in xrange(1, 71):
            venue = sigmod if paperNumber <= 50 else vldb
            paper = Paper(SampleGraphUtility.__getNextId(), "%s Paper %d" % (venue.name, paperNumber))
            graph.addNode(paper)
            graph.addBothEdges(jim, paper, Authorship())
            graph.addBothEdges(paper, venue, Publication())
            authorConferencePaperMap[jim][venue].append(paper)

        # Add ann's papers (registered as nodes explicitly, like every other paper, before edges are attached)
        annsPaper1 = Paper(SampleGraphUtility.__getNextId(), "ICDE Paper")
        annsPaper2 = Paper(SampleGraphUtility.__getNextId(), "KDD Paper")
        graph.addNode(annsPaper1)
        graph.addNode(annsPaper2)
        graph.addBothEdges(ann, annsPaper1, Authorship())
        graph.addBothEdges(ann, annsPaper2, Authorship())
        graph.addBothEdges(annsPaper1, icde, Publication())
        graph.addBothEdges(annsPaper2, kdd, Publication())
        authorConferencePaperMap[ann][icde].append(annsPaper1)
        authorConferencePaperMap[ann][kdd].append(annsPaper2)

        # Auto-add remaining authors (2,1) paper numbers
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, mike, sigmod, vldb, authorConferencePaperMap)
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, mary, sigmod, icde, authorConferencePaperMap)
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, bob, sigmod, vldb, authorConferencePaperMap)

        # Add extra authors & citation data
        if extraAuthorsAndCitations:
            SampleGraphUtility.__addSimilarAuthorsPapers(graph, joe, sigmod, vldb, authorConferencePaperMap)
            SampleGraphUtility.__addSimilarAuthorsPapers(graph, nancy, sigmod, vldb, authorConferencePaperMap)
            SampleGraphUtility.__constructCitations(
                graph, authorMap, conferenceMap, authorConferencePaperMap, citationMap
            )

        return graph, authorMap, conferenceMap
Esempio n. 17
0
    def constructSkewedCitationPublicationExample(introduceRandomness=True, citationsPublicationsParameter=None):
        """
          Build the graph for an example with skewed citation / publication count ratios

            NOTE: Extraneous authors are omitted

            @param  introduceRandomness             If true, every configured count is replaced by a random integer
                                                    within (roughly) 10% of it
            @param  citationsPublicationsParameter  Optional map of author name => (citation count, publication
                                                    count) overriding the built-in configuration; the names must be
                                                    among Alice, Bob, Carol, Dave, Ed & Frank

            @return Tuple of (graph, author nodes indexed by name, conference node, map of author node => (citation
                    count, publication count) actually added to the graph)
        """

        graph = GraphFactory.createInstance()
        random.seed()

        # Create the authors & conference
        authors = [
            Author(SampleGraphUtility.__getNextId(), authorName)
            for authorName in ("Alice", "Bob", "Carol", "Dave", "Ed", "Frank")
        ]
        authorMap = {author.name: author for author in authors}
        conference = Conference(SampleGraphUtility.__getNextId(), "KDD")

        # Citation & publication count configuration
        if citationsPublicationsParameter is not None:
            citationsPublications = citationsPublicationsParameter
        else:
            citationsPublications = {
                "Alice": (100, 10),
                "Bob": (80, 10),
                "Carol": (100, 100),
                "Dave": (50, 10),
                "Ed": (10, 10),
                "Frank": (1000, 100),
            }

        actualCitationsPublications = defaultdict(lambda: (0, 0))

        def addPublicationPaper(author):
            """
              Helper method to add a 'publication' paper, connected to both an author and a conference
            """
            nextId = SampleGraphUtility.__getNextId()
            paper = Paper(nextId, "%s's Paper %d" % (author.name, nextId))
            graph.addNode(paper)
            graph.addBothEdges(author, paper)
            graph.addBothEdges(paper, conference)

            citationCount, publicationCount = actualCitationsPublications[author]
            actualCitationsPublications[author] = (citationCount, publicationCount + 1)

            return paper

        def addCitationPaper(citedPaper, citedAuthor):
            """
              Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
            """
            nextId = SampleGraphUtility.__getNextId()
            citingPaper = Paper(nextId, "Citing Paper %d" % nextId)
            graph.addNode(citingPaper)
            graph.addBothEdges(citingPaper, conference)
            graph.addEdge(citingPaper, citedPaper)

            citationCount, publicationCount = actualCitationsPublications[citedAuthor]
            actualCitationsPublications[citedAuthor] = (citationCount + 1, publicationCount)

        def randomizeCount(count):
            """
              Helper method to pick a random integer within (roughly) 10% of the given count
            """
            return random.randint(count + int(-0.1 * count), count + int(0.1 * count))

        # Construct the graph
        graph.addNodes(authors + [conference])
        for authorName in citationsPublications:
            citationCount, publicationCount = citationsPublications[authorName]

            # Optionally, introduce randomness
            if introduceRandomness:
                citationCount = randomizeCount(citationCount)
                publicationCount = randomizeCount(publicationCount)

            # Add publications to author
            author = authorMap[authorName]
            authorPapers = [addPublicationPaper(author) for _ in itertools.repeat(None, publicationCount)]

            # Without any publication there is nothing to cite (and nothing to divide the citations by)
            if not authorPapers:
                continue

            # Spread the citations evenly over the papers. The remainder is given to the first paper exactly
            # once, so that the number of citations added equals citationCount (re-adding it for later papers
            # would overshoot the requested total).
            citationsPerPaper, remainingCitations = divmod(citationCount, publicationCount)
            for paperIndex, paper in enumerate(authorPapers):
                citationsForPaper = citationsPerPaper
                if paperIndex == 0:
                    citationsForPaper += remainingCitations
                for _ in itertools.repeat(None, citationsForPaper):
                    addCitationPaper(paper, author)

        return graph, authorMap, conference, actualCitationsPublications
Esempio n. 18
0
    def constructMultiDisciplinaryAuthorExample(indirectAuthor=False, uneven=False):
        """
            Construct example DBLP graph where a few authors publish in both research areas (databases & data
            mining), and everyone else publishes in only one of them

            @param  indirectAuthor  If true, add an extra author "J" who publishes in both areas and is cited only
                                    by the authors "E" and "I"
            @param  uneven          If true, add one more paper per author & conference, plus a random number (0-3)
                                    of citations of each such paper from every other author of the same area

            @return Tuple of (graph, author nodes indexed by name, conference nodes indexed by name, map of author
                    name => number of citations added for that author)
        """

        graph = GraphFactory.createInstance()

        # Add authors
        authorLabels = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]
        if indirectAuthor:
            authorLabels.append("J")
        authors = [Author(SampleGraphUtility.__getNextId(), authorLabel) for authorLabel in authorLabels]
        graph.addNodes(authors)

        # Add conferences
        vldb = Conference(SampleGraphUtility.__getNextId(), "VLDB")  # Databases
        kdd = Conference(SampleGraphUtility.__getNextId(), "KDD")  # Data mining
        conferences = [vldb, kdd]
        graph.addNodes(conferences)

        # Add author / conference index
        authorMap = {author.name: author for author in authors}
        conferenceMap = {conference.name: conference for conference in conferences}

        # Helper dictionary of citation counts to fabricate for each author, per research area the author is in
        # With both flags unset, this results in the following total counts:
        #   {'A':100, 'B':80, 'C':10, 'D':120, 'E':90, 'F':100, 'G':80, 'H':10, 'I':24}
        citationCounts = {"A": 100, "B": 80, "C": 10, "D": 60, "E": 45, "F": 100, "G": 80, "H": 10, "I": 12, "J": 60}

        # Create two papers for each author, one paper in each conference in each area
        dmAuthorNames = ["D", "E", "F", "G", "H", "I"]
        dbAuthorNames = ["A", "B", "C", "D", "E", "I"]
        if indirectAuthor:
            dmAuthorNames += ["J"]
            dbAuthorNames += ["J"]
        duplicateNames = set(dmAuthorNames).intersection(set(dbAuthorNames))
        dmConferenceNames = ["KDD"]
        dbConferenceNames = ["VLDB"]
        researchAreas = [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]

        # Start every author at zero, so authors that end up without citations still appear in the result
        totalCitationCount = defaultdict(int)
        for name in set(dmAuthorNames).union(set(dbAuthorNames)):
            totalCitationCount[name] = 0

        def addCitedPaper(authorName, conferenceName):
            """
              Helper method to add a paper (to be cited) by the given author in the given conference
            """
            citedPaper = Paper(SampleGraphUtility.__getNextId(), "%sPaperIn%s" % (authorName, conferenceName))
            graph.addNode(citedPaper)
            graph.addBothEdges(citedPaper, conferenceMap[conferenceName], Publication())
            graph.addBothEdges(citedPaper, authorMap[authorName], Authorship())
            return citedPaper

        def addCitation(index, citingAuthorName, conferenceName, citedPaper, citedAuthorName):
            """
              Helper method to add a fake paper by the citing author that cites the given paper, and count it
            """
            citingPaper = Paper(
                SampleGraphUtility.__getNextId(),
                "Citation%d%sPaperIn%s" % (index, citingAuthorName, conferenceName),
            )
            graph.addNode(citingPaper)
            graph.addBothEdges(authorMap[citingAuthorName], citingPaper, Authorship())
            graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())

            # Add citation
            graph.addEdge(citingPaper, citedPaper, Citation())
            totalCitationCount[citedAuthorName] += 1

        # Create equal number of citations from each other paper in the research area for each author's papers
        for authorNames, conferenceNames in researchAreas:
            for authorName in authorNames:

                # Add paper to be cited for author, in every conference of the area
                citedPaperMap = {}
                for conferenceName in conferenceNames:
                    citedPaperMap[conferenceName] = addCitedPaper(authorName, conferenceName)

                # Figure out the number of incoming citation for this author from each other eligible authors
                if authorName in duplicateNames:
                    citingAuthors = set(authorNames).difference(duplicateNames)
                else:
                    citingAuthors = set(authorNames)
                    citingAuthors.remove(authorName)
                citationsPerAuthor = citationCounts[authorName] // len(citingAuthors)

                # Make sure J is cited by (only) the multi-disciplinary authors E and I
                if authorName == "J":
                    citationsPerAuthor = citationCounts[authorName] // 2
                    citingAuthors = ["E", "I"]

                # Loop through papers of all other authors
                for otherAuthorName in citingAuthors:
                    if authorName == otherAuthorName:
                        continue
                    for conferenceName in conferenceNames:
                        for citationIndex in xrange(0, citationsPerAuthor):
                            addCitation(
                                citationIndex, otherAuthorName, conferenceName, citedPaperMap[conferenceName], authorName
                            )

        if not uneven:
            return graph, authorMap, conferenceMap, totalCitationCount

        # If this flag is set, add one more paper per author & conference in each research area, and a random
        # number of citations of it from all other authors of that area
        for authorNamesList, conferenceNamesList in researchAreas:

            extraPapers = []

            # Add publications
            for authorName in authorNamesList:
                for conferenceName in conferenceNamesList:
                    extraPapers.append((authorName, addCitedPaper(authorName, conferenceName)))

            random.seed()

            # Add randomized citations from authors to these papers
            for citingAuthorName in authorNamesList:
                for conferenceName in conferenceNamesList:
                    for citedAuthorName, citedPaper in extraPapers:

                        # Skip papers authored by this author
                        if citedAuthorName == citingAuthorName:
                            continue

                        # Randomly add a number of citations to this paper
                        for citationIndex in xrange(0, random.randint(0, 3)):
                            addCitation(citationIndex, citingAuthorName, conferenceName, citedPaper, citedAuthorName)

        return graph, authorMap, conferenceMap, totalCitationCount
    def buildGraph(self, parsedData):
        """
          Form the DBLP graph structure from the parsed data

            @param  parsedData  Map of paper id => paper data, where every entry provides the keys 'title',
                                'conference', 'authors' (iterable of author names) and 'references' (iterable
                                of cited paper ids)

            @return Graph containing paper, author, conference & topic nodes, with edges in both directions for
                    authorship, publication & mention, and one-directional citation edges (citing => cited)

            @raise  KeyError    If a paper references a paper id that is not a key of parsedData
        """

        graph = GraphFactory.createInstance()

        # First, build the nodes for the graph
        authors = {}  # Indexed by name
        papers = {}  # Indexed by paper id
        topics = {}  # Indexed by keyword
        conferences = {}  # Indexed by name
        citationMap = {}  # Map of paper id to referenced paper ids

        # Construct everything except reference edges
        for paperId in parsedData:
            paperData = parsedData[paperId]

            paper = Paper(paperId, paperData['title'])
            citationMap[paperId] = paperData['references']

            # Create or get conference for this paper
            conferenceName = paperData['conference']
            if conferenceName not in conferences:
                conference = Conference(len(conferences), conferenceName)
                conferences[conferenceName] = conference
                graph.addNode(conference)
            else:
                conference = conferences[conferenceName]

            # Create or get authors for this paper
            paperAuthors = []
            for authorName in paperData['authors']:
                if authorName not in authors:
                    author = Author(len(authors), authorName)
                    authors[authorName] = author
                    graph.addNode(author)
                else:
                    author = authors[authorName]
                paperAuthors.append(author)

            # Add new paper to the graph, before any edge is attached to it
            papers[paperId] = paper
            graph.addNode(paper)

            # Extract keywords from title, and use as topics
            keywords = self.__extractKeywords(paperData['title'])
            for keyword in keywords:
                if keyword not in topics:
                    topic = Topic(len(topics), [keyword])
                    topics[keyword] = topic
                    graph.addNode(topic)
                else:
                    topic = topics[keyword]
                graph.addEdge(topic, paper, Mention())
                graph.addEdge(paper, topic, Mention())

            # Add corresponding edges in the graph
            for author in paperAuthors:
                graph.addEdge(paper, author, Authorship())
                graph.addEdge(author, paper, Authorship())
            graph.addEdge(paper, conference, Publication())
            graph.addEdge(conference, paper, Publication())

        # Add citations to the graph (only now, since a paper may cite a paper that is parsed after it)
        for paperId in citationMap:
            references = citationMap[paperId]
            paper = papers[paperId]
            for citedPaperId in references:
                citedPaper = papers[citedPaperId]
                graph.addEdge(paper, citedPaper, Citation())

        return graph
    def testRetakingStudentAnalyses(self):
        """
          Verifies graph construction for a more complex analysis, in which a student retakes the course:

            * Two assignments, two analyses, two semesters
            * Five submissions, two by a single student (in two semesters)
            * One (cross semester) match between two submissions from the same student

          The match between the student's own submissions must not show up in the graph
        """

        # CoMoTo data under test
        analysisData = self.retakingStudentAnalysis

        # Expected nodes (the fourth Student argument is presumably the retake flag -- confirm against Student)
        john = Student(10001, 'Smith, John', 'johnsmith', True)
        jane = Student(10002, 'Doe, Jane', 'janedoe')
        joe = Student(10003, 'Smith, Joe', 'joesmith')
        alex = Student(10004, 'Smith, Alex', 'alexsmith')
        secondSubmission = Submission(5002)
        thirdSubmission = Submission(5003)
        fourthSubmission = Submission(5004)
        fifthSubmission = Submission(5005)
        mp1 = Assignment(1, 'MP1')
        mp2 = Assignment(2, 'MP2')
        spring2011 = Semester(7, 'Spring', 2011)
        spring2012 = Semester(8, 'Spring', 2012)

        expectedGraph = GraphFactory.createInstance()
        expectedNodes = (
            john, jane, joe, alex,
            secondSubmission, thirdSubmission, fourthSubmission, fifthSubmission,
            mp1, mp2,
            spring2011, spring2012,
        )
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # Expected (symmetric) edges, grouped by edge type; each edge gets its own edge instance
        for submission, assignment in (
            (secondSubmission, mp1),
            (thirdSubmission, mp1),
            (fourthSubmission, mp2),
            (fifthSubmission, mp2),
        ):
            expectedGraph.addBothEdges(submission, assignment, AssignmentSubmission())

        for submission, student in (
            (secondSubmission, jane),
            (thirdSubmission, joe),
            (fourthSubmission, john),
            (fifthSubmission, alex),
        ):
            expectedGraph.addBothEdges(submission, student, Authorship())

        for student, semester in (
            (john, spring2012),
            (jane, spring2011),
            (joe, spring2011),
            (alex, spring2012),
        ):
            expectedGraph.addBothEdges(student, semester, Enrollment())

        for semester, assignment in ((spring2011, mp1), (spring2012, mp2)):
            expectedGraph.addBothEdges(semester, assignment, SemesterAssignment())

        # Build the graph from the analysis data, and compare it against the expectation
        actualGraph = self.dataImporter.buildGraph(analysisData)
        self.assertGraphsEqual(expectedGraph, actualGraph)
    def testCrossSemesterMatchAnalyses(self):
        """
          Verifies graph construction for a more complex analysis, containing a cross semester match:

            * Two assignments, two analyses, two semesters
            * Four submissions
            * One (cross semester) match
        """

        # CoMoTo data under test
        analysisData = self.crossSemesterMatchAnalysis

        # Expected nodes
        john = Student(10001, 'Smith, John', 'johnsmith')
        jane = Student(10002, 'Doe, Jane', 'janedoe')
        joe = Student(10003, 'Smith, Joe', 'joesmith')
        alex = Student(10004, 'Smith, Alex', 'alexsmith')
        firstSubmission = Submission(5001)
        secondSubmission = Submission(5002)
        thirdSubmission = Submission(5003)
        fourthSubmission = Submission(5004)
        mp1 = Assignment(1, 'MP1')
        mp2 = Assignment(2, 'MP2')
        spring2011 = Semester(7, 'Spring', 2011)
        spring2012 = Semester(8, 'Spring', 2012)

        expectedGraph = GraphFactory.createInstance()
        expectedNodes = (
            john, jane, joe, alex,
            firstSubmission, secondSubmission, thirdSubmission, fourthSubmission,
            mp1, mp2,
            spring2011, spring2012,
        )
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # Expected symmetric edges, grouped by edge type; each edge gets its own edge instance
        for submission, assignment in (
            (firstSubmission, mp1),
            (secondSubmission, mp1),
            (thirdSubmission, mp1),
            (fourthSubmission, mp2),
        ):
            expectedGraph.addBothEdges(submission, assignment, AssignmentSubmission())

        for submission, student in (
            (firstSubmission, john),
            (secondSubmission, jane),
            (thirdSubmission, joe),
            (fourthSubmission, alex),
        ):
            expectedGraph.addBothEdges(submission, student, Authorship())

        for student, semester in (
            (john, spring2011),
            (jane, spring2011),
            (joe, spring2011),
            (alex, spring2012),
        ):
            expectedGraph.addBothEdges(student, semester, Enrollment())

        for semester, assignment in ((spring2011, mp1), (spring2012, mp2)):
            expectedGraph.addBothEdges(semester, assignment, SemesterAssignment())

        # The cross-semester match is the only edge type that is not symmetric (since current submissions can
        # match past submissions, but not vice versa), hence a single directed edge
        expectedGraph.addEdge(firstSubmission, fourthSubmission, CrossSemesterMatch(5000, 72.0))

        # Build the graph from the analysis data, and compare it against the expectation
        actualGraph = self.dataImporter.buildGraph(analysisData)
        self.assertGraphsEqual(expectedGraph, actualGraph)
Esempio n. 22
0
    def constructSkewedCitationPublicationExample(introduceRandomness=True, citationsPublicationsParameter=None):
        """
          Build the graph for an example with skewed citation / publication count ratios

            NOTE: Extraneous authors are omitted

            @param  introduceRandomness             If true, every configured count is replaced by a random integer
                                                    within (roughly) 10% of it
            @param  citationsPublicationsParameter  Optional map of author name => (citation count, publication
                                                    count) overriding the built-in configuration; the names must be
                                                    among Alice, Bob, Carol, Dave, Ed & Frank

            @return Tuple of (graph, author nodes indexed by name, conference node, map of author node => (citation
                    count, publication count) actually added to the graph)
        """

        graph = GraphFactory.createInstance()
        random.seed()

        # Create the authors & conference
        authors = [
            Author(SampleGraphUtility.__getNextId(), authorName)
            for authorName in ('Alice', 'Bob', 'Carol', 'Dave', 'Ed', 'Frank')
        ]
        authorMap = {author.name: author for author in authors}
        conference = Conference(SampleGraphUtility.__getNextId(), 'KDD')

        # Citation & publication count configuration
        if citationsPublicationsParameter is not None:
            citationsPublications = citationsPublicationsParameter
        else:
            citationsPublications = {
                'Alice': (100, 10),
                'Bob': (80, 10),
                'Carol': (100, 100),
                'Dave': (50, 10),
                'Ed': (10, 10),
                'Frank': (1000, 100)
            }

        actualCitationsPublications = defaultdict(lambda: (0, 0))

        def addPublicationPaper(author):
            """
              Helper method to add a 'publication' paper, connected to both an author and a conference
            """
            nextId = SampleGraphUtility.__getNextId()
            paper = Paper(nextId, "%s's Paper %d" % (author.name, nextId))
            graph.addNode(paper)
            graph.addBothEdges(author, paper)
            graph.addBothEdges(paper, conference)

            citationCount, publicationCount = actualCitationsPublications[author]
            actualCitationsPublications[author] = (citationCount, publicationCount + 1)

            return paper

        def addCitationPaper(citedPaper, citedAuthor):
            """
              Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
            """
            nextId = SampleGraphUtility.__getNextId()
            citingPaper = Paper(nextId, "Citing Paper %d" % nextId)
            graph.addNode(citingPaper)
            graph.addBothEdges(citingPaper, conference)
            graph.addEdge(citingPaper, citedPaper)

            citationCount, publicationCount = actualCitationsPublications[citedAuthor]
            actualCitationsPublications[citedAuthor] = (citationCount + 1, publicationCount)

        def randomizeCount(count):
            """
              Helper method to pick a random integer within (roughly) 10% of the given count
            """
            return random.randint(count + int(-0.1 * count), count + int(0.1 * count))

        # Construct the graph
        graph.addNodes(authors + [conference])
        for authorName in citationsPublications:
            citationCount, publicationCount = citationsPublications[authorName]

            # Optionally, introduce randomness
            if introduceRandomness:
                citationCount = randomizeCount(citationCount)
                publicationCount = randomizeCount(publicationCount)

            # Add publications to author
            author = authorMap[authorName]
            authorPapers = [addPublicationPaper(author) for _ in itertools.repeat(None, publicationCount)]

            # Without any publication there is nothing to cite (and nothing to divide the citations by)
            if not authorPapers:
                continue

            # Spread the citations evenly over the papers. The remainder is given to the first paper exactly
            # once, so that the number of citations added equals citationCount (re-adding it for later papers
            # would overshoot the requested total).
            citationsPerPaper, remainingCitations = divmod(citationCount, publicationCount)
            for paperIndex, paper in enumerate(authorPapers):
                citationsForPaper = citationsPerPaper
                if paperIndex == 0:
                    citationsForPaper += remainingCitations
                for _ in itertools.repeat(None, citationsForPaper):
                    addCitationPaper(paper, author)

        return graph, authorMap, conference, actualCitationsPublications
    def buildGraph(self, parsedData):
        """
          Form the DBLP graph structure from the parsed data

            @param  parsedData  Map of paper id => paper data, where every entry provides the keys 'title',
                                'conference', 'authors' (iterable of author names) and 'references' (iterable
                                of cited paper ids)

            @return Graph containing paper, author, conference & topic nodes, with edges in both directions for
                    authorship, publication & mention, and one-directional citation edges (citing => cited)

            @raise  KeyError    If a paper references a paper id that is not a key of parsedData
        """

        graph = GraphFactory.createInstance()

        # First, build the nodes for the graph
        authors = {}  # Indexed by name
        papers = {}  # Indexed by paper id
        topics = {}  # Indexed by keyword
        conferences = {}  # Indexed by name
        citationMap = {}  # Map of paper id to referenced paper ids

        # Construct everything except reference edges
        for paperId in parsedData:
            paperData = parsedData[paperId]

            paper = Paper(paperId, paperData['title'])
            citationMap[paperId] = paperData['references']

            # Create or get conference for this paper
            conferenceName = paperData['conference']
            if conferenceName not in conferences:
                conference = Conference(len(conferences), conferenceName)
                conferences[conferenceName] = conference
                graph.addNode(conference)
            else:
                conference = conferences[conferenceName]

            # Create or get authors for this paper
            paperAuthors = []
            for authorName in paperData['authors']:
                if authorName not in authors:
                    author = Author(len(authors), authorName)
                    authors[authorName] = author
                    graph.addNode(author)
                else:
                    author = authors[authorName]
                paperAuthors.append(author)

            # Add new paper to the graph, before any edge is attached to it
            papers[paperId] = paper
            graph.addNode(paper)

            # Extract keywords from title, and use as topics
            keywords = self.__extractKeywords(paperData['title'])
            for keyword in keywords:
                if keyword not in topics:
                    topic = Topic(len(topics), [keyword])
                    topics[keyword] = topic
                    graph.addNode(topic)
                else:
                    topic = topics[keyword]
                graph.addEdge(topic, paper, Mention())
                graph.addEdge(paper, topic, Mention())

            # Add corresponding edges in the graph
            for author in paperAuthors:
                graph.addEdge(paper, author, Authorship())
                graph.addEdge(author, paper, Authorship())
            graph.addEdge(paper, conference, Publication())
            graph.addEdge(conference, paper, Publication())

        # Add citations to the graph (only now, since a paper may cite a paper that is parsed after it)
        for paperId in citationMap:
            references = citationMap[paperId]
            paper = papers[paperId]
            for citedPaperId in references:
                citedPaper = papers[citedPaperId]
                graph.addEdge(paper, citedPaper, Citation())

        return graph
# Example no. 24
# 0
    def constructPathSimExampleThree(extraAuthorsAndCitations=False, citationMap=None):
        """
          Constructs "Example 3" from PathSim publication, ignoring topic nodes

            @see    http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.220.2455

            @param  extraAuthorsAndCitations    When true, the authors 'Joe' and 'Nancy' are added as well,
                                                and citations are constructed using citationMap
            @param  citationMap                 Passed through unchanged to the citation construction helper;
                                                only used when extraAuthorsAndCitations is true
            @return A tuple (graph, authorMap, conferenceMap), where the maps index the created author and
                    conference nodes by their name
        """

        graph = GraphFactory.createInstance()

        # Add authors (ids are drawn in this exact order, so keep the name order stable)
        authors = [
            Author(SampleGraphUtility.__getNextId(), name)
            for name in ('Mike', 'Jim', 'Mary', 'Bob', 'Ann')
        ]
        mike, jim, mary, bob, ann = authors
        extraAuthors = []
        if extraAuthorsAndCitations:
            extraAuthors = [
                Author(SampleGraphUtility.__getNextId(), name)
                for name in ('Joe', 'Nancy')
            ]
            authors += extraAuthors
        graph.addNodes(authors)

        # Add conferences
        conferences = [
            Conference(SampleGraphUtility.__getNextId(), name)
            for name in ('SIGMOD', 'VLDB', 'ICDE', 'KDD')
        ]
        sigmod, vldb, icde, kdd = conferences
        graph.addNodes(conferences)

        # Add author / conference index (name -> node)
        authorMap = {author.name: author for author in authors}
        conferenceMap = {conference.name: conference for conference in conferences}

        # Add author / conference / papers index (author -> conference -> list of papers)
        authorConferencePaperMap = defaultdict(lambda: defaultdict(list))

        # Add jim's papers: the first 50 are published in SIGMOD, the remaining 20 in VLDB.
        # `range` is used instead of `xrange` so this also runs on Python 3.
        for i in range(70):
            conference = sigmod if i < 50 else vldb
            paper = Paper(SampleGraphUtility.__getNextId(), '%s Paper %d' % (conference.name, i + 1))
            graph.addNode(paper)
            graph.addBothEdges(jim, paper, Authorship())
            graph.addBothEdges(paper, conference, Publication())
            authorConferencePaperMap[jim][conference].append(paper)

        # Add ann's papers (one in ICDE, one in KDD)
        # NOTE(review): unlike jim's papers these are never passed to addNode; they only enter the
        # graph through the edges below -- presumably addBothEdges registers missing nodes, confirm.
        annsPaper1 = Paper(SampleGraphUtility.__getNextId(), 'ICDE Paper')
        annsPaper2 = Paper(SampleGraphUtility.__getNextId(), 'KDD Paper')
        graph.addBothEdges(ann, annsPaper1, Authorship())
        graph.addBothEdges(ann, annsPaper2, Authorship())
        graph.addBothEdges(annsPaper1, icde, Publication())
        graph.addBothEdges(annsPaper2, kdd, Publication())
        authorConferencePaperMap[ann][icde].append(annsPaper1)
        authorConferencePaperMap[ann][kdd].append(annsPaper2)

        # Auto-add remaining authors (2,1) paper numbers
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, mike, sigmod, vldb, authorConferencePaperMap)
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, mary, sigmod, icde, authorConferencePaperMap)
        SampleGraphUtility.__addSimilarAuthorsPapers(graph, bob, sigmod, vldb, authorConferencePaperMap)

        # Add extra authors & citation data
        if extraAuthorsAndCitations:
            for extraAuthor in extraAuthors:
                SampleGraphUtility.__addSimilarAuthorsPapers(
                    graph, extraAuthor, sigmod, vldb, authorConferencePaperMap
                )
            SampleGraphUtility.__constructCitations(
                graph, authorMap, conferenceMap, authorConferencePaperMap, citationMap
            )

        return graph, authorMap, conferenceMap