def __init__(self, file, namespace=NAMESPACE, db=DATABASE, url=None, silent=True):
    """
    Initialize the WikiDigester with a file and a namespace.

    Args:
        | file (str)        -- path to XML file (or bzipped XML) to digest.
        | namespace (str)   -- namespace of the file. Defaults to MediaWiki namespace.
        | db (str)          -- the name of the database to save to.
        | url (str)         -- the url from where the dump can be fetched.
        | silent (bool)     -- whether or not to send an email upon digestion completion.
    """

    # Python 2.7 support: zero-argument super() raises TypeError there.
    # Only the super() lookup itself is guarded, so a TypeError raised
    # *inside* the parent initializer is neither swallowed nor followed
    # by a second run of that initializer.
    try:
        parent = super()
    except TypeError:
        Digester.__init__(self, file, namespace)
    else:
        parent.__init__(file, namespace)

    self.database = db
    self._db = None  # No database handle is set up at construction time.
    self.url = url
    self.silent = silent

    # Keep track of number of docs.
    # Necessary for performing TF-IDF processing.
    self.num_docs = 0
class DigesterTest(unittest.TestCase):
    """Checks Digester construction and page iteration on the article fixtures."""

    def setUp(self):
        # Build a fresh digester for every test, reading the plain XML fixture.
        fixture_path = 'tests/data/article.xml'
        export_namespace = 'http://www.mediawiki.org/xml/export-0.8'
        self.d = Digester(fixture_path, export_namespace)

    def tearDown(self):
        # Drop the reference so no digester is shared between tests.
        self.d = None

    def test_instance(self):
        digester = self.d
        self.assertIsInstance(digester, Digester)

    def test_iterate(self):
        # Every element yielded for the 'page' tag must be a real object.
        pages = self.d.iterate('page')
        for element in pages:
            self.assertIsNotNone(element)

    def test_iterate_bz2(self):
        # Same check as test_iterate, after pointing the digester at the
        # bzipped fixture.
        self.d.file = 'tests/data/article.xml.bz2'
        pages = self.d.iterate('page')
        for element in pages:
            self.assertIsNotNone(element)
def setUp(self):
    # Build the digester under test from the plain XML article fixture.
    fixture_path = 'tests/data/article.xml'
    export_namespace = 'http://www.mediawiki.org/xml/export-0.8'
    self.d = Digester(fixture_path, export_namespace)