# NOTE(review): stray module-level copy of DBISDataImporterTest.setUp (defined inside the
# class below) — looks like paste/merge residue at the top of the file; confirm and remove.
def setUp(self):
     self.dataImporter = SerializedDBLPDataImporter(os.path.join("data", "dbis"), os.path.join("graphs", "dbis"))
 # NOTE(review): stray module-level copy of FourAreaDataImporterTest.setUp (defined inside the
 # class below) — likely paste/merge residue; confirm and remove.
 def setUp(self):
     self.dataImporter = SerializedDBLPDataImporter(
         os.path.join('data','four_area'),
         os.path.join('graphs','fourArea')
     )
class DBISDataImporterTest(ImporterTest):
    """
      Tests importing the DBIS dataset: node parsing, topic keyword normalization
      (stop-word removal & stemming), and authorship / publication / topic edges.
    """

    def setUp(self):

        # Importer reads raw input from data/dbis and caches serialized graphs under graphs/dbis
        self.dataImporter = SerializedDBLPDataImporter(os.path.join("data", "dbis"), os.path.join("graphs", "dbis"))

    def testBasicParsedIndexData(self):
        """
          Checks that the expected number of nodes are parsed from input files, and that parsed data is non-null
        """

        graph, actualNodeIndex = self.dataImporter.parseNodeContent({"author": {}})

        # For conferences, authors, and papers, count should be the same (don't check exact count for topics)
        expectedAuthorCount = 60694
        expectedConferenceCount = 464
        expectedPaperCount = 72902
        self.assertEqual(expectedAuthorCount, len(actualNodeIndex["author"]))
        self.assertEqual(expectedConferenceCount, len(actualNodeIndex["conference"]))
        self.assertEqual(expectedPaperCount, len(actualNodeIndex["paper"]))

        # Assert that non-null data was parsed into all objects (including topics)
        for nodeType in actualNodeIndex:
            for nodeId in actualNodeIndex[nodeType]:
                nodeData = actualNodeIndex[nodeType][nodeId].toDict()
                for key in nodeData:
                    self.assertIsNotNone(nodeData[key])
                    # Strings must be non-empty; all other (numeric) values must be positive
                    nodeValue = len(nodeData[key]) if isinstance(nodeData[key], str) else nodeData[key]
                    self.assertGreater(nodeValue, 0)

    def testParsedIndexDataTopicKeywords(self):
        """
          Checks that the topics are parsed correctly (i.e. stop words are removed, and words are stemmed)
        """

        graph, actualNodeIndex = self.dataImporter.parseNodeContent({"author": {}})

        actualKeywords = set()
        for nodeId in actualNodeIndex["topic"]:
            actualKeywords.add(actualNodeIndex["topic"][nodeId].keywords[0])

        # Check that a few stop words are removed
        sampleStopWords = {"of", "the", "for", "or", "to", "a"}
        self.assertEqual(0, len(sampleStopWords.intersection(actualKeywords)))

        # Check that a few samples (known to be in the actual input) are properly stemmed
        stemmedRemovedWords = {"individuals", "formalisms", "challenges", "challenging"}
        self.assertEqual(0, len(stemmedRemovedWords.intersection(actualKeywords)))

        # Check that the number of keywords is at least 15% smaller than the input keywords
        self.assertLess(len(actualKeywords), 16798 * 0.85)

    def testParsedGraphNodes(self):
        """
          Checks nodes are constructed corresponding to every entry in the index
        """

        graph, actualNodeIndex = self.dataImporter.parseNodeContent({"author": {}})

        # Expected per-type counts come straight from the node index
        expectedTypeCounts = {}
        for key in actualNodeIndex:
            expectedTypeCounts[key] = 0
            for nodeId in actualNodeIndex[key]:
                expectedTypeCounts[key] += 1

        # Tally graph node objects by concrete type; anything unrecognized counts separately
        actualTypeCounts = {"topic": 0, "paper": 0, "author": 0, "conference": 0}
        otherTypeCounts = 0
        for node in graph.getNodes():
            if isinstance(node, Topic):
                actualTypeCounts["topic"] += 1
            elif isinstance(node, Paper):
                actualTypeCounts["paper"] += 1
            elif isinstance(node, Conference):
                actualTypeCounts["conference"] += 1
            elif isinstance(node, Author):
                actualTypeCounts["author"] += 1
            else:
                otherTypeCounts += 1

        # Don't test topic count, since stop words list & stemming are involved
        actualTypeCounts["topic"] = expectedTypeCounts["topic"]

        self.assertEqual(expectedTypeCounts, actualTypeCounts)
        self.assertEqual(0, otherTypeCounts)

    def testParsedGraphAuthorshipEdges(self):
        """
          Checks that parsing the basic authorship edge content of the graph works as expected, by spot checking a few
          edges that do & don't exist
        """

        graph, nodeIndex = self.dataImporter.parseNodeContent({})
        graph = self.dataImporter.parseEdgeContent(graph, nodeIndex)

        # Test single paper / author (edges should exist in both directions)
        singleAuthorPaper = nodeIndex["paper"][21530]
        singlePaperAuthor = nodeIndex["author"][33483]
        self.assertTrue(graph.hasEdge(singleAuthorPaper, singlePaperAuthor))
        self.assertTrue(graph.hasEdge(singlePaperAuthor, singleAuthorPaper))

        # Test multiple authors for a paper
        multiAuthorPaper = nodeIndex["paper"][21536]
        multiAuthorPaperAuthor1 = nodeIndex["author"][2247]
        multiAuthorPaperAuthor2 = nodeIndex["author"][5763]
        self.assertTrue(graph.hasEdge(multiAuthorPaper, multiAuthorPaperAuthor1))
        self.assertTrue(graph.hasEdge(multiAuthorPaperAuthor1, multiAuthorPaper))
        self.assertTrue(graph.hasEdge(multiAuthorPaper, multiAuthorPaperAuthor2))
        self.assertTrue(graph.hasEdge(multiAuthorPaperAuthor2, multiAuthorPaper))

        # Test author for only one paper
        self.assertIn(singleAuthorPaper, graph.getSuccessors(singlePaperAuthor))
        self.assertIn(singleAuthorPaper, graph.getPredecessors(singlePaperAuthor))

    def testParsedGraphPublicationEdges(self):
        """
          Checks that parsing the basic publication edge content of the graph works as expected, by spot checking a few
          edges that do & don't exist
        """

        graph, nodeIndex = self.dataImporter.parseNodeContent({})
        graph = self.dataImporter.parseEdgeContent(graph, nodeIndex)

        conference = nodeIndex["conference"][512]
        conferencePublicationCount = 9
        paper = nodeIndex["paper"][67300]

        # Check basic publication case (edges in both directions)
        self.assertTrue(graph.hasEdge(paper, conference))
        self.assertTrue(graph.hasEdge(conference, paper))

        # Check that papers are only connected with one conference
        for node in graph.getSuccessors(paper):
            if isinstance(node, Conference):
                self.assertEqual(conference, node)

        # Check the number of publications for a conference
        self.assertEqual(conferencePublicationCount, len(graph.getPredecessors(conference)))

    def testParsedTopicEdges(self):
        """
          Checks that parsing topic edge content of the graph works as expected, by spot checking a few stemmer
          collisions that should occur
        """

        graph, nodeIndex = self.dataImporter.parseNodeContent({})
        graph = self.dataImporter.parseEdgeContent(graph, nodeIndex)

        # Check for 'procedure' and 'procedural' are stemmed to the same topic
        topic = nodeIndex["topic"][39]  # 'procedure'
        self.assertIs(nodeIndex["topic"][1875], topic)  # 'procedural'
class FourAreaDataImporterTest(ImporterTest):
    """
      Tests importing the four-area DBLP dataset: node parsing, topic keyword normalization
      (stop-word removal & stemming), and authorship / publication / topic edges.
    """

    def setUp(self):

        # Importer reads raw input from data/four_area and caches serialized graphs under graphs/fourArea
        self.dataImporter = SerializedDBLPDataImporter(
            os.path.join('data','four_area'),
            os.path.join('graphs','fourArea')
        )

    def testBasicParsedIndexData(self):
        """
          Checks that the expected number of nodes are parsed from input files, and that parsed data is non-null
        """

        graph, actualNodeIndex = self.dataImporter.parseNodeContent({'author':{}})

        # For conferences, authors, and papers, count should be the same (don't check exact count for topics)
        expectedAuthorCount = 28702
        expectedConferenceCount = 20
        expectedPaperCount = 28569
        self.assertEqual(expectedAuthorCount, len(actualNodeIndex['author']))
        self.assertEqual(expectedConferenceCount, len(actualNodeIndex['conference']))
        self.assertEqual(expectedPaperCount, len(actualNodeIndex['paper']))

        # Assert that non-null data was parsed into all objects (including topics)
        for nodeType in actualNodeIndex:
            for nodeId in actualNodeIndex[nodeType]:
                nodeData = actualNodeIndex[nodeType][nodeId].toDict()
                for key in nodeData:
                    self.assertIsNotNone(nodeData[key])
                    # Strings must be non-empty; all other (numeric) values must be positive
                    nodeValue = len(nodeData[key]) if isinstance(nodeData[key], str) else nodeData[key]
                    self.assertGreater(nodeValue, 0)

    def testParsedIndexDataTopicKeywords(self):
        """
          Checks that the topics are parsed correctly (i.e. stop words are removed, and words are stemmed)
        """

        graph, actualNodeIndex = self.dataImporter.parseNodeContent({'author':{}})

        actualKeywords = set()
        for nodeId in actualNodeIndex['topic']:
            actualKeywords.add(actualNodeIndex['topic'][nodeId].keywords[0])

        # Check that a few stop words are removed
        sampleStopWords = {'of', 'the', 'for', 'or', 'to', 'a'}
        self.assertEqual(0, len(sampleStopWords.intersection(actualKeywords)))

        # Check that a few samples (known to be in the actual input) are properly stemmed
        stemmedRemovedWords = {'individuals', 'formalisms', 'challenges', 'challenging'}
        self.assertEqual(0, len(stemmedRemovedWords.intersection(actualKeywords)))

        # Check that the number of keywords is at least 20% smaller than the input keywords
        self.assertLess(len(actualKeywords), 13575 * 0.8)

    def testParsedGraphNodes(self):
        """
          Checks nodes are constructed corresponding to every entry in the index
        """

        graph, actualNodeIndex = self.dataImporter.parseNodeContent({'author':{}})

        # Expected per-type counts come straight from the node index
        expectedTypeCounts = {}
        for key in actualNodeIndex:
            expectedTypeCounts[key] = 0
            for nodeId in actualNodeIndex[key]:
                expectedTypeCounts[key] += 1

        # Tally graph node objects by concrete type; anything unrecognized counts separately
        actualTypeCounts = {
            'topic': 0,
            'paper': 0,
            'author': 0,
            'conference': 0
        }
        otherTypeCounts = 0
        for node in graph.getNodes():
            if isinstance(node, Topic):
                actualTypeCounts['topic'] += 1
            elif isinstance(node, Paper):
                actualTypeCounts['paper'] += 1
            elif isinstance(node, Conference):
                actualTypeCounts['conference'] += 1
            elif isinstance(node, Author):
                actualTypeCounts['author'] += 1
            else:
                otherTypeCounts += 1

        # Don't test topic count, since stop words list & stemming are involved
        actualTypeCounts['topic'] = expectedTypeCounts['topic']

        self.assertEqual(expectedTypeCounts, actualTypeCounts)
        self.assertEqual(0, otherTypeCounts)

    def testParsedGraphAuthorshipEdges(self):
        """
          Checks that parsing the basic authorship edge content of the graph works as expected, by spot checking a few
          edges that do & don't exist
        """

        graph, nodeIndex = self.dataImporter.parseNodeContent({})
        graph = self.dataImporter.parseEdgeContent(graph, nodeIndex)

        # Test single paper / author (edges should exist in both directions)
        singleAuthorPaper = nodeIndex['paper'][7600]
        singlePaperAuthor = nodeIndex['author'][15134]
        self.assertTrue(graph.hasEdge(singleAuthorPaper, singlePaperAuthor))
        self.assertTrue(graph.hasEdge(singlePaperAuthor, singleAuthorPaper))

        # Test multiple authors for a paper
        multiAuthorPaper = nodeIndex['paper'][7605]
        multiAuthorPaperAuthor1 = nodeIndex['author'][15138]
        multiAuthorPaperAuthor2 = nodeIndex['author'][15139]
        self.assertTrue(graph.hasEdge(multiAuthorPaper, multiAuthorPaperAuthor1))
        self.assertTrue(graph.hasEdge(multiAuthorPaperAuthor1, multiAuthorPaper))
        self.assertTrue(graph.hasEdge(multiAuthorPaper, multiAuthorPaperAuthor2))
        self.assertTrue(graph.hasEdge(multiAuthorPaperAuthor2, multiAuthorPaper))

        # Test author for only one paper (this author's neighborhood is exactly that paper)
        self.assertEqual([singleAuthorPaper], graph.getSuccessors(singlePaperAuthor))
        self.assertEqual([singleAuthorPaper], graph.getPredecessors(singlePaperAuthor))

    def testParsedGraphPublicationEdges(self):
        """
          Checks that parsing the basic publication edge content of the graph works as expected, by spot checking a few
          edges that do & don't exist
        """

        graph, nodeIndex = self.dataImporter.parseNodeContent({})
        graph = self.dataImporter.parseEdgeContent(graph, nodeIndex)

        conference = nodeIndex['conference'][36]
        conferencePublicationCount = 3375
        paper = nodeIndex['paper'][7600]

        # Check basic publication case (edges in both directions)
        self.assertTrue(graph.hasEdge(paper, conference))
        self.assertTrue(graph.hasEdge(conference, paper))

        # Check that papers are only connected with one conference
        for node in graph.getSuccessors(paper):
            if isinstance(node, Conference):
                self.assertEqual(conference, node)

        # Check the number of publications for a conference
        self.assertEqual(conferencePublicationCount, len(graph.getPredecessors(conference)))

    def testParsedTopicEdges(self):
        """
          Checks that parsing topic edge content of the graph works as expected, by spot checking a few stemmer
          collisions that should occur
        """

        graph, nodeIndex = self.dataImporter.parseNodeContent({})
        graph = self.dataImporter.parseEdgeContent(graph, nodeIndex)

        # Check for 'challenging', 'challenges', and 'challenge' are all stemmed to the same topic
        topic = nodeIndex['topic'][25] # 'challenge'
        self.assertIs(nodeIndex['topic'][451], topic) # 'challenges'
        self.assertIs(nodeIndex['topic'][5821], topic) # 'challenging'