Esempio n. 1
0
    def __init__(self, file, namespace=NAMESPACE, db=DATABASE,
                 url=None, silent=True):
        """
        Set up a digester for a single dump file.

        Args:
            | file (str)        -- path to the XML file (plain or bzipped) to digest.
            | namespace (str)   -- namespace of the file. Defaults to NAMESPACE.
            | db (str)          -- name of the database to save to. Defaults to DATABASE.
            | url (str)         -- url the dump can be fetched from, if any.
            | silent (bool)     -- flag tied to the email sent upon digestion completion
            |                      (presumably True suppresses it -- confirm).
        """

        # Zero-argument super() only exists on Python 3. On Python 2.7 it
        # raises TypeError, so the parent initializer is invoked explicitly.
        # NOTE(review): on Python 3 a TypeError raised inside the parent
        # initializer would also land in this fallback and re-run it.
        try:
            super().__init__(file, namespace)
        except TypeError:
            Digester.__init__(self, file, namespace)

        # Target database name; _db starts out as None.
        self.database, self._db = db, None
        self.url, self.silent = url, silent

        # Running count of documents, required for TF-IDF processing.
        self.num_docs = 0
Esempio n. 2
0
    def __init__(
            self,
            file,
            namespace=NAMESPACE,
            db=DATABASE,
            url=None,
            silent=True):
        """
        Create a WikiDigester bound to one dump file and namespace.

        Args:
            | file (str)        -- XML file (or bzipped XML) that will be digested.
            | namespace (str)   -- namespace of the file; NAMESPACE when omitted.
            | db (str)          -- database name to save to; DATABASE when omitted.
            | url (str)         -- where the dump can be fetched from (optional).
            | silent (bool)     -- relates to the email sent when digestion completes
            |                      (assumed: True means no email -- TODO confirm).
        """

        # Python 2.7 has no argument-less super(); it raises TypeError there,
        # and the base class initializer is then called directly instead.
        try:
            super().__init__(file, namespace)
        except TypeError:
            Digester.__init__(self, file, namespace)

        # Settings supplied by the caller.
        self.url = url
        self.silent = silent
        self.database = db

        # No database object yet; only the name above is recorded.
        self._db = None

        # Document counter; TF-IDF processing depends on it.
        self.num_docs = 0
Esempio n. 3
0
class DigesterTest(unittest.TestCase):
    """Tests for Digester run against the article fixture under tests/data."""

    def setUp(self):
        # Each test gets its own Digester pointed at the uncompressed fixture.
        self.d = Digester('tests/data/article.xml', 'http://www.mediawiki.org/xml/export-0.8')

    def tearDown(self):
        # Drop the reference so nothing carries over to the next test.
        self.d = None

    def _assert_yields_pages(self):
        """Iterate over 'page' items; require every one to be non-None and at least one to exist."""
        count = 0
        for page in self.d.iterate('page'):
            self.assertIsNotNone(page)
            count += 1
        # Without this check an empty iteration would let the test pass
        # without ever executing an assertion.
        self.assertGreater(count, 0, "iterate('page') yielded no pages")

    def test_instance(self):
        self.assertIsInstance(self.d, Digester)

    def test_iterate(self):
        self._assert_yields_pages()

    def test_iterate_bz2(self):
        # Re-point the digester created in setUp at the bzipped fixture.
        self.d.file = 'tests/data/article.xml.bz2'
        self._assert_yields_pages()
Esempio n. 4
0
class DigesterTest(unittest.TestCase):
    """Exercises Digester construction and page iteration on the test fixtures."""

    def setUp(self):
        # A new Digester per test, reading the plain XML fixture.
        self.d = Digester('tests/data/article.xml', 'http://www.mediawiki.org/xml/export-0.8')

    def tearDown(self):
        # Release the digester between tests.
        self.d = None

    def _check_pages(self):
        """Assert that iterating 'page' produces one or more non-None items."""
        seen = 0
        for seen, page in enumerate(self.d.iterate('page'), 1):
            self.assertIsNotNone(page)
        # Guard against a vacuous pass: a loop over zero pages runs no
        # assertions, so the count is verified explicitly.
        self.assertGreater(seen, 0, "iterate('page') yielded no pages")

    def test_instance(self):
        self.assertIsInstance(self.d, Digester)

    def test_iterate(self):
        self._check_pages()

    def test_iterate_bz2(self):
        # Same checks, but with the file attribute switched to the bz2 fixture.
        self.d.file = 'tests/data/article.xml.bz2'
        self._check_pages()
Esempio n. 5
0
 def setUp(self):
     """Build the Digester under test from the article fixture and export-0.8 namespace."""
     fixture_path = 'tests/data/article.xml'
     export_namespace = 'http://www.mediawiki.org/xml/export-0.8'
     self.d = Digester(fixture_path, export_namespace)
Esempio n. 6
0
 def setUp(self):
     # Store a Digester on the test case; arguments are the fixture file
     # path followed by the namespace string.
     self.d = Digester(
         'tests/data/article.xml',
         'http://www.mediawiki.org/xml/export-0.8',
     )