def test_getPageTitles(self):
        # This test download the title list using API and index.php
        # Compare both lists in length and title by title
        # Check the presence of some special titles, like odd chars
        # The tested wikis are from different wikifarms and some alone
        
        print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
        tests = [
            # Alone wikis
            ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
            ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

            # Test old allpages API behaviour
            ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],

            # Test BOM encoding
            ['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'],
        ]
        
        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {'api': api, 'index': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
            getPageTitles(config=config_api, session=session)
            titles_api = './%s-%s-titles.txt' % (domain2prefix(config=config_api), config_api['date'])
            result_api = open(titles_api, 'r').read().splitlines()
            os.remove(titles_api)
            self.assertTrue(pagetocheck in result_api)
            
            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {'index': index, 'api': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
            getPageTitles(config=config_index, session=session)
            titles_index = './%s-%s-titles.txt' % (domain2prefix(config=config_index), config_index['date'])
            result_index = open(titles_index, 'r').read().splitlines()
            os.remove(titles_index)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))
            
            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                self.assertEqual(pagename_api.decode('utf8'), result_index[c].decode('utf8'), u'{0} and {1} are different'.format(pagename_api.decode('utf8'), result_index[c].decode('utf8')))
                c += 1
# Example 2
    def test_getPageTitles(self):
        # This test download the title list using API and index.php
        # Compare both lists in length and title by title
        # Check the presence of some special titles, like odd chars
        # The tested wikis are from different wikifarms and some alone
        
        print '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
        tests = [
            # Alone wikis
            ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
            ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

            # Test old allpages API behaviour
            ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],

            # Gentoo wikifarm
            ['http://wiki.gentoo.org/index.php', 'http://wiki.gentoo.org/api.php', u'/usr move'],
        ]
        
        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {'api': api, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': []}
            result_api = getPageTitles(config=config_api, session=session)
            self.assertTrue(pagetocheck in result_api)
            
            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {'index': index, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': []}
            result_index = getPageTitles(config=config_index, session=session)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))
            
            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                self.assertEqual(pagename_api, result_index[c], u'{0} and {1} are different'.format(pagename_api, result_index[c]))
                c += 1
    def test_getPageTitles(self):
        # This test download the title list using API and index.php
        # Compare both lists in length and title by title
        # Check the presence of some special titles, like odd chars
        # The tested wikis are from different wikifarms and some alone

        print '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73
        tests = [
            # Alone wikis
            [
                'http://archiveteam.org/index.php',
                'http://archiveteam.org/api.php', u'April Fools\' Day'
            ],
            [
                'http://skilledtests.com/wiki/index.php',
                'http://skilledtests.com/wiki/api.php',
                u'Conway\'s Game of Life'
            ],

            # Test old allpages API behaviour
            [
                'http://wiki.damirsystems.com/index.php',
                'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'
            ],
        ]

        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {
                'api': api,
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': []
            }
            result_api = getPageTitles(config=config_api, session=session)
            self.assertTrue(pagetocheck in result_api)

            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {
                'index': index,
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': []
            }
            result_index = getPageTitles(config=config_index, session=session)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))

            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                self.assertEqual(
                    pagename_api, result_index[c],
                    u'{0} and {1} are different'.format(
                        pagename_api, result_index[c]))
                c += 1
    def test_getPageTitles(self):
        # This test download the title list using API and index.php
        # Compare both lists in length and title by title
        # Check the presence of some special titles, like odd chars
        # The tested wikis are from different wikifarms and some alone

        print '\n', '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73
        tests = [
            # Alone wikis
            [
                'http://archiveteam.org/index.php',
                'http://archiveteam.org/api.php', u'April Fools\' Day'
            ],
            [
                'http://skilledtests.com/wiki/index.php',
                'http://skilledtests.com/wiki/api.php',
                u'Conway\'s Game of Life'
            ],

            # Test old allpages API behaviour
            [
                'http://wiki.damirsystems.com/index.php',
                'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'
            ],

            # Test BOM encoding
            #['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'],
        ]

        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {
                'api': api,
                'index': '',
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': [],
                'date': datetime.datetime.now().strftime('%Y%m%d'),
                'path': '.',
                'retries': 5,
            }

            titles_api = getPageTitles(config=config_api, session=session)
            result_api = open(titles_api,
                              'r').read().decode('utf8').splitlines()
            os.remove(titles_api)
            self.assertTrue(pagetocheck in result_api)

            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {
                'index': index,
                'api': '',
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': [],
                'date': datetime.datetime.now().strftime('%Y%m%d'),
                'path': '.',
                'retries': 5
            }

            titles_index = getPageTitles(config=config_index, session=session)
            result_index = open(titles_index,
                                'r').read().decode('utf8').splitlines()
            os.remove(titles_index)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))

            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                chk = pagename_api in result_index
                self.assertEqual(chk, True,
                                 u'%s not in result_index' % (pagename_api))
                c += 1