Ejemplo n.º 1
0
 def testFlushBatches(self):
     """
     Handling a page with L{WikipediaPageHandler} configured with
     C{batchSize=1} leaves one more numbered JSON file in the output
     directory after each call to C{handle}.
     """
     firstBatch = 'wikipedia-titles-00001.json'
     secondBatch = 'wikipedia-titles-00002.json'
     batchingHandler = WikipediaPageHandler(self.outputPath, batchSize=1)

     # One page handled: exactly one batch file is expected on disk.
     batchingHandler.handle(WikipediaPage('Sample page 1'))
     filesAfterFirst = os.listdir(self.outputPath)
     self.assertEqual([firstBatch], filesAfterFirst)

     # Second page handled: a second batch file joins the first.  The
     # listing is sorted because os.listdir returns names in arbitrary
     # order.
     batchingHandler.handle(WikipediaPage('Sample page 2'))
     filesAfterSecond = sorted(os.listdir(self.outputPath))
     self.assertEqual([firstBatch, secondBatch], filesAfterSecond)
Ejemplo n.º 2
0
class WikipediaPageHandlerTest(TestCase):
    """
    Tests checking which files L{WikipediaPageHandler} leaves in its output
    directory when pages are handled and when the handler is closed.
    """

    def setUp(self):
        """
        Create a fresh temporary directory and a L{WikipediaPageHandler}
        constructed with that directory as its output path.
        """
        super(WikipediaPageHandlerTest, self).setUp()
        temporaryDirectory = mkdtemp()
        self.outputPath = temporaryDirectory
        self.handler = WikipediaPageHandler(temporaryDirectory)

    def tearDown(self):
        """
        Remove the temporary output directory and everything inside it.
        """
        rmtree(self.outputPath)
        super(WikipediaPageHandlerTest, self).tearDown()

    def testCloseWithoutPages(self):
        """
        Calling L{WikipediaPageHandler.close} before any page has been
        handled leaves the output directory empty.
        """
        self.handler.close()
        remainingFiles = os.listdir(self.outputPath)
        self.assertEqual([], remainingFiles)

    def testCloseFlushesPages(self):
        """
        L{WikipediaPageHandler.close} writes a handled page that has not yet
        been written out to a JSON file in the output directory.
        """
        page = WikipediaPage('Sample page')
        self.handler.handle(page)
        self.handler.close()

        filename = 'wikipedia-titles-00001.json'
        with open(os.path.join(self.outputPath, filename), 'r') as stream:
            written = load(stream)

        expectedValues = {
            'en.wikipedia.org/url': 'http://en.wikipedia.org/wiki/Sample_page'}
        expected = {
            'objects': [{'about': 'sample page', 'values': expectedValues}]}
        self.assertEqual(expected, written)

    def testFlushBatches(self):
        """
        Handling a page with L{WikipediaPageHandler} configured with
        C{batchSize=1} leaves one more numbered JSON file in the output
        directory after each call to C{handle}.
        """
        firstBatch = 'wikipedia-titles-00001.json'
        secondBatch = 'wikipedia-titles-00002.json'
        batchingHandler = WikipediaPageHandler(self.outputPath, batchSize=1)

        # One page handled: exactly one batch file is expected on disk.
        batchingHandler.handle(WikipediaPage('Sample page 1'))
        filesAfterFirst = os.listdir(self.outputPath)
        self.assertEqual([firstBatch], filesAfterFirst)

        # Second page handled: a second batch file joins the first.  The
        # listing is sorted because os.listdir returns names in arbitrary
        # order.
        batchingHandler.handle(WikipediaPage('Sample page 2'))
        filesAfterSecond = sorted(os.listdir(self.outputPath))
        self.assertEqual([firstBatch, secondBatch], filesAfterSecond)