def test_getPageTitles(self): # This test download the title list using API and index.php # Compare both lists in length and title by title # Check the presence of some special titles, like odd chars # The tested wikis are from different wikifarms and some alone print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73 tests = [ # Alone wikis ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'], ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'], # Test old allpages API behaviour ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'], # Test BOM encoding ['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'], ] session = requests.Session() session.headers = {'User-Agent': getUserAgent()} for index, api, pagetocheck in tests: # Testing with API print '\nTesting', api print 'Trying to parse', pagetocheck, 'with API' config_api = {'api': api, 'index': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'} getPageTitles(config=config_api, session=session) titles_api = './%s-%s-titles.txt' % (domain2prefix(config=config_api), config_api['date']) result_api = open(titles_api, 'r').read().splitlines() os.remove(titles_api) self.assertTrue(pagetocheck in result_api) # Testing with index print 'Testing', index print 'Trying to parse', pagetocheck, 'with index' config_index = {'index': index, 'api': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'} getPageTitles(config=config_index, session=session) titles_index = './%s-%s-titles.txt' % (domain2prefix(config=config_index), config_index['date']) result_index = open(titles_index, 'r').read().splitlines() os.remove(titles_index) self.assertTrue(pagetocheck in result_index) self.assertEqual(len(result_api), len(result_index)) # Compare every page in both lists, with/without API c = 0 for pagename_api in result_api: self.assertEqual(pagename_api.decode('utf8'), result_index[c].decode('utf8'), u'{0} and {1} are different'.format(pagename_api.decode('utf8'), result_index[c].decode('utf8'))) c += 1
def test_getPageTitles(self): # This test download the title list using API and index.php # Compare both lists in length and title by title # Check the presence of some special titles, like odd chars # The tested wikis are from different wikifarms and some alone print '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73 tests = [ # Alone wikis ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'], ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'], # Test old allpages API behaviour ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'], # Gentoo wikifarm ['http://wiki.gentoo.org/index.php', 'http://wiki.gentoo.org/api.php', u'/usr move'], ] session = requests.Session() session.headers = {'User-Agent': getUserAgent()} for index, api, pagetocheck in tests: # Testing with API print '\nTesting', api print 'Trying to parse', pagetocheck, 'with API' config_api = {'api': api, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': []} result_api = getPageTitles(config=config_api, session=session) self.assertTrue(pagetocheck in result_api) # Testing with index print 'Testing', index print 'Trying to parse', pagetocheck, 'with index' config_index = {'index': index, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': []} result_index = getPageTitles(config=config_index, session=session) self.assertTrue(pagetocheck in result_index) self.assertEqual(len(result_api), len(result_index)) # Compare every page in both lists, with/without API c = 0 for pagename_api in result_api: self.assertEqual(pagename_api, result_index[c], u'{0} and {1} are different'.format(pagename_api, result_index[c])) c += 1
def test_getPageTitles(self): # This test download the title list using API and index.php # Compare both lists in length and title by title # Check the presence of some special titles, like odd chars # The tested wikis are from different wikifarms and some alone print '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73 tests = [ # Alone wikis [ 'http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day' ], [ 'http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life' ], # Test old allpages API behaviour [ 'http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips' ], ] session = requests.Session() session.headers = {'User-Agent': getUserAgent()} for index, api, pagetocheck in tests: # Testing with API print '\nTesting', api print 'Trying to parse', pagetocheck, 'with API' config_api = { 'api': api, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [] } result_api = getPageTitles(config=config_api, session=session) self.assertTrue(pagetocheck in result_api) # Testing with index print 'Testing', index print 'Trying to parse', pagetocheck, 'with index' config_index = { 'index': index, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [] } result_index = getPageTitles(config=config_index, session=session) self.assertTrue(pagetocheck in result_index) self.assertEqual(len(result_api), len(result_index)) # Compare every page in both lists, with/without API c = 0 for pagename_api in result_api: self.assertEqual( pagename_api, result_index[c], u'{0} and {1} are different'.format( pagename_api, result_index[c])) c += 1
def test_getPageTitles(self): # This test download the title list using API and index.php # Compare both lists in length and title by title # Check the presence of some special titles, like odd chars # The tested wikis are from different wikifarms and some alone print '\n', '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73 tests = [ # Alone wikis [ 'http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day' ], [ 'http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life' ], # Test old allpages API behaviour [ 'http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips' ], # Test BOM encoding #['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'], ] session = requests.Session() session.headers = {'User-Agent': getUserAgent()} for index, api, pagetocheck in tests: # Testing with API print '\nTesting', api print 'Trying to parse', pagetocheck, 'with API' config_api = { 'api': api, 'index': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.', 'retries': 5, } titles_api = getPageTitles(config=config_api, session=session) result_api = open(titles_api, 'r').read().decode('utf8').splitlines() os.remove(titles_api) self.assertTrue(pagetocheck in result_api) # Testing with index print 'Testing', index print 'Trying to parse', pagetocheck, 'with index' config_index = { 'index': index, 'api': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.', 'retries': 5 } titles_index = getPageTitles(config=config_index, session=session) result_index = open(titles_index, 'r').read().decode('utf8').splitlines() os.remove(titles_index) self.assertTrue(pagetocheck in result_index) self.assertEqual(len(result_api), len(result_index)) # Compare every page in both lists, with/without API c = 0 for pagename_api in result_api: chk = pagename_api in result_index self.assertEqual(chk, True, u'%s not in result_index' % (pagename_api)) c += 1