def test_invalid_not_existing_page_source(self):
     """Test ProofreadPage built from an invalid, not existing Page.

     The source Page is rebuilt from its namespace-less title with the
     namespace forced to ``self.site.proofread_page_ns`` before it is
     wrapped; the resulting title must equal the rebuilt Page's title.
     """
     # namespace is forced
     source = pywikibot.Page(self.site, self.not_existing_invalid["title"])
     # ``with_ns`` is the keyword used by the other tests in this file;
     # ``withNamespace`` is its deprecated spelling.
     fixed_source = pywikibot.Page(
         self.site,
         source.title(with_ns=False),
         ns=self.site.proofread_page_ns,
     )
     page = ProofreadPage(fixed_source)
     self.assertEqual(page.title(), fixed_source.title())
    def test_index(self):
        """Test the ``index`` property of ProofreadPage.

        Exercises the getter, deleter and setter on one fixture, then checks
        a fixture that lists two Index pages and one whose index is None.
        """
        # Fixture with a single Index page.
        proofread = ProofreadPage(self.site, self.valid['title'])
        expected_index = IndexPage(self.site, self.valid['index'])

        # Getter.
        self.assertEqual(proofread.index, expected_index)

        # Deleter: afterwards the ``_index`` attribute must be gone.
        del proofread.index
        self.assertFalse(hasattr(proofread, '_index'))

        # Setter.
        proofread.index = expected_index
        self.assertEqual(proofread.index, expected_index)

        # Fixture listing two Index pages.
        proofread = ProofreadPage(self.site,
                                  self.existing_multilinked['title'])
        first_index = IndexPage(self.site,
                                self.existing_multilinked['index_1'])
        second_index = IndexPage(self.site,
                                 self.existing_multilinked['index_2'])
        self.assertEqual(proofread.index, first_index)
        self.assertNotEqual(proofread.index, second_index)
        self.assertEqual(proofread._index, (first_index, [second_index]))

        # Fixture whose index is expected to be None.
        proofread = ProofreadPage(self.site, self.existing_unlinked['title'])
        self.assertIsNone(proofread.index)
        self.assertEqual(proofread._index, (None, []))
 def gen(self):
     """Yield a ProofreadPage for each number from ``page_number_gen()``.

     Titles are built as ``<self._prefix>/<number>`` on the site of
     ``self._index``; every yielded page carries the number it was built
     from in its ``page_number`` attribute.
     """
     for number in self.page_number_gen():
         page_title = '{}/{}'.format(self._prefix, number)
         proofread_page = ProofreadPage(self._index.site, page_title)
         # remember page number in djvu file
         proofread_page.page_number = number
         yield proofread_page
Esempio n. 4
0
 def gen(self):
     """Generate ProofreadPage objects for the specified page interval.

     One page is produced per value of ``self.page_number_gen()``; its
     title is ``self._prefix`` and the number joined by a slash.
     """
     for number in self.page_number_gen():
         page_title = '{}/{}'.format(self._prefix, number)
         proofread_page = ProofreadPage(self._index.site, page_title)
         # remember page number in djvu file
         proofread_page.page_number = number
         yield proofread_page
 def test_valid_link_source(self):
     """Test ProofreadPage built from a valid Link.

     The Link is created with the site's ``proofread_page_ns`` as default
     namespace; the page must agree with it on title and namespace.
     """
     link = pywikibot.Link(
         self.valid['title'],
         source=self.site,
         default_namespace=self.site.proofread_page_ns)
     proofread = ProofreadPage(link)
     self.assertEqual(proofread.title(with_ns=False), link.title)
     self.assertEqual(proofread.namespace(), link.namespace)
 def test_preload_from_empty_text(self):
     """Test the text ProofreadPage composes after ``text`` is set to ''."""
     proofread = ProofreadPage(self.site, 'dummy test page')
     proofread.text = ''
     actual = proofread.text
     expected = ('<noinclude><pagequality level="1" user="" />'
                 '<div class="pagetext">\n\n\n</noinclude>'
                 '<noinclude></div></noinclude>')
     self.assertEqual(actual, expected)
 def test_preload_from_empty_text(self):
     """Test the text ProofreadPage composes after ``text`` is set to ''.

     The expected header embeds the value of ``self.site.username()`` in
     the ``user`` attribute of the ``pagequality`` tag.
     """
     page = ProofreadPage(self.site, 'dummy test page')
     page.text = ''
     actual = page.text
     # The template must contain a ``%s`` placeholder: applying ``%`` to a
     # string without one raises TypeError ("not all arguments converted
     # during string formatting") before the assertion is reached.
     expected = ('<noinclude><pagequality level="1" user="%s" />'
                 '<div class="pagetext">\n\n\n</noinclude>'
                 '<noinclude></div></noinclude>') % self.site.username()
     self.assertEqual(actual, expected)
 def test_invalid_not_existing_page_source(self):
     """Test ProofreadPage built from an invalid, not existing Page.

     The Page is rebuilt from its namespace-less title with the namespace
     forced to ``self.site.proofread_page_ns`` before it is wrapped.
     """
     original = pywikibot.Page(self.site, self.not_existing_invalid['title'])
     bare_title = original.title(with_ns=False)
     forced = pywikibot.Page(self.site, bare_title,
                             ns=self.site.proofread_page_ns)
     proofread = ProofreadPage(forced)
     self.assertEqual(proofread.title(), forced.title())
 def test_valid_link_source(self):
     """Test ProofreadPage built from a valid Link.

     The Link is created with the site's ``proofread_page_ns`` as default
     namespace; the page must agree with it on title and namespace.
     """
     # ``default_namespace`` and ``with_ns`` are the keyword spellings used
     # by the other tests in this file; ``defaultNamespace`` and
     # ``withNamespace`` are their deprecated forms.
     source = pywikibot.Link(
         self.valid['title'],
         source=self.site,
         default_namespace=self.site.proofread_page_ns)
     page = ProofreadPage(source)
     self.assertEqual(page.title(with_ns=False), source.title)
     self.assertEqual(page.namespace(), source.namespace)
Esempio n. 10
0
 def test_preload_from_empty_text(self):
     """Test the text ProofreadPage composes after ``text`` is set to ''.

     The expected text is rendered from ``self.fmt``; the ``class_pagetext``
     and ``div_end`` fragments are looked up in ``self.class_pagetext_fmt``
     by the header's ``_has_div`` value.
     """
     proofread = ProofreadPage(self.site, 'Page:dummy test page')
     proofread.text = ''
     has_div = proofread._full_header._has_div
     class_pagetext, div_end = self.class_pagetext_fmt[has_div]
     actual = proofread.text
     expected = self.fmt.format(user=self.site.username(),
                                class_pagetext=class_pagetext,
                                references='',
                                div_end=div_end)
     self.assertEqual(actual, expected)
 def test_preload_from_empty_text(self):
     """Test ProofreadPage text composition starting from an empty text.

     ``self.fmt`` is filled with the site user name, an empty references
     part and the fragments selected by ``page._full_header._has_div``.
     """
     proofread = ProofreadPage(self.site, 'Page:dummy test page')
     proofread.text = ''
     has_div = proofread._full_header._has_div
     class_pagetext, div_end = self.class_pagetext_fmt[has_div]
     actual = proofread.text
     expected = self.fmt.format(user=self.site.username(),
                                class_pagetext=class_pagetext,
                                references='',
                                div_end=div_end)
     self.assertEqual(actual, expected)
Esempio n. 12
0
    def test_url_image(self):
        """Test fetching of url image of the scan of ProofreadPage."""
        # Fixtures that carry an expected ``url_image`` value.
        for fixture in (self.valid, self.valid_redlink):
            proofread = ProofreadPage(self.site, fixture['title'])
            self.assertEqual(proofread.url_image, fixture['url_image'])

        # For this fixture reading the property must raise ValueError.
        proofread = ProofreadPage(self.site, self.existing_unlinked['title'])
        with self.assertRaises(ValueError):
            proofread.url_image
Esempio n. 13
0
    def test_index(self):
        """Test the ``index`` property of ProofreadPage.

        Covers the getter, the deleter, the setter (rejecting a non-page
        value with TypeError), a fixture that lists two Index pages and a
        fixture whose index is None.
        """
        # Fixture with a single Index page.
        proofread = ProofreadPage(self.site, self.valid['title'])
        expected_index = IndexPage(self.site, self.valid['index'])

        # Getter.
        self.assertEqual(proofread.index, expected_index)

        # Deleter: afterwards the ``_index`` attribute must be gone.
        del proofread.index
        self.assertFalse(hasattr(proofread, '_index'))

        # Setter refuses a plain string...
        with self.assertRaises(TypeError):
            proofread.index = 'invalid index'
        # ...and accepts an IndexPage.
        proofread.index = expected_index
        self.assertEqual(proofread.index, expected_index)

        # Fixture listing two Index pages.
        multilinked = self.existing_multilinked
        proofread = ProofreadPage(self.site, multilinked['title'])
        first_index = IndexPage(self.site, multilinked['index_1'])
        second_index = IndexPage(self.site, multilinked['index_2'])
        self.assertEqual(proofread.index, first_index)
        self.assertNotEqual(proofread.index, second_index)
        self.assertEqual(proofread._index, (first_index, [second_index]))

        # Fixture whose index is expected to be None.
        proofread = ProofreadPage(self.site, self.existing_unlinked['title'])
        self.assertIsNone(proofread.index)
        self.assertEqual(proofread._index, (None, []))
Esempio n. 14
0
 def process_pages(self, temp_data):
     """Record quality level and refs for every lemma in ``self.lemmas``.

     Each lemma title is searched with ``self.regex_page``; group 1 is
     used as year and group 2 as page. The result is stored under
     ``self.data["pages"][year][page]`` as ``{"q": <int>}`` plus ``"r"``
     when ``search_for_refs`` returns something truthy, and the page is
     appended to ``temp_data[year]``. Any exception raised for a lemma is
     logged and processing continues with the next one.

     :param temp_data: mapping of year to list of pages, updated in place
     """
     for position, lemma in enumerate(self.lemmas, start=1):
         try:
             match = self.regex_page.search(lemma["title"])
             year, page_no = match.group(1), match.group(2)
             # The year bucket is created before the page is loaded, so it
             # stays in place even if a later step fails.
             self.data["pages"].setdefault(year, {})
             proofread_lemma = ProofreadPage(self.wiki,
                                             f"Seite:{lemma['title']}")
             if self.debug:
                 self.logger.debug(
                     f"{position}/{len(self.lemmas)} Page {page_no}({year}) "
                     f"has quality level {proofread_lemma.quality_level} "
                     f"_ Seite:{lemma['title']}")
             refs = search_for_refs(proofread_lemma.text)
             entry = {"q": int(proofread_lemma.quality_level)}
             if refs:
                 self.logger.debug(
                     f"There are refs ({refs}) @ {year}, {page_no}")
                 entry["r"] = refs
             self.data["pages"][year][page_no] = entry
             temp_data.setdefault(year, []).append(page_no)
         except Exception as error:  # pylint: disable=broad-except
             self.logger.error(
                 f"wasn't able to process {lemma['title']}, error: {error}")
 def test_preload_from_not_existing_page(self):
     """Test the text ProofreadPage composes for 'dummy test page'."""
     proofread = ProofreadPage(self.site, 'dummy test page')
     actual = proofread.text
     expected = ('<noinclude><pagequality level="1" user="" />'
                 '<div class="pagetext">\n\n\n</noinclude>'
                 '<noinclude><references/></div></noinclude>')
     self.assertEqual(actual, expected)
Esempio n. 16
0
 def test_valid_parsing(self):
     """Test the attributes ProofreadPage parses from the valid fixture.

     ``ql``, ``user``, ``header`` and ``footer`` of the page are compared,
     in that order, with the same-named entries of ``self.valid``.
     """
     proofread = ProofreadPage(self.site, self.valid['title'])
     for attribute in ('ql', 'user', 'header', 'footer'):
         self.assertEqual(getattr(proofread, attribute),
                          self.valid[attribute])
Esempio n. 17
0
 def test_applicable_quality_level(self):
     """Test Page.quality_level when applicable.

     The page must report the 'proofread-page' content model and a
     quality level of 0.
     """
     proofread = ProofreadPage(
         self.get_site(),
         'Page:Popular Science Monthly Volume 49.djvu/1')
     self.assertEqual(proofread.content_model, 'proofread-page')
     self.assertEqual(proofread.quality_level, 0)
Esempio n. 18
0
    def test_index(self):
        """Test the ``index`` property of ProofreadPage.

        Covers getter, deleter and setter, then a fixture that lists two
        Index pages and a fixture whose index is None.
        """
        # Fixture with a single Index page.
        proofread = ProofreadPage(self.site, self.valid['title'])
        expected_index = IndexPage(self.site, self.valid['index'])

        # Getter.
        self.assertEqual(proofread.index, expected_index)

        # Deleter: afterwards the ``_index`` attribute must be gone.
        del proofread.index
        self.assertFalse(hasattr(proofread, '_index'))

        # Setter.
        proofread.index = expected_index
        self.assertEqual(proofread.index, expected_index)

        # Fixture listing two Index pages.
        proofread = ProofreadPage(self.site,
                                  self.existing_multilinked['title'])
        first_index = IndexPage(self.site,
                                self.existing_multilinked['index_1'])
        second_index = IndexPage(self.site,
                                 self.existing_multilinked['index_2'])
        self.assertEqual(proofread.index, first_index)
        self.assertNotEqual(proofread.index, second_index)
        self.assertEqual(proofread._index, (first_index, [second_index]))

        # Fixture whose index is expected to be None.
        proofread = ProofreadPage(self.site, self.existing_unlinked['title'])
        self.assertIsNone(proofread.index)
        self.assertEqual(proofread._index, (None, []))
Esempio n. 19
0
 def test_parse_title(self, key):
     """Test the title parts stored on a ProofreadPage.

     ``_base``, ``_base_ext`` and ``_num`` of the page are compared with
     the three values in ``self.sites[key]['tuple']``.
     """
     site_data = self.sites[key]
     page_title = site_data['title']
     expected = site_data['tuple']
     expected_base, expected_ext, expected_num = expected
     proofread = ProofreadPage(self.site, page_title)
     self.assertEqual(proofread._base, expected_base)
     self.assertEqual(proofread._base_ext, expected_ext)
     self.assertEqual(proofread._num, expected_num)
Esempio n. 20
0
 def test_preload_from_not_existing_page(self):
     """Test the text ProofreadPage composes for 'Page:dummy test page'.

     The expected text is rendered from ``self.fmt`` with a
     ``<references/>`` part and the fragments selected by the header's
     ``_has_div`` value.
     """
     proofread = ProofreadPage(self.site, 'Page:dummy test page')
     # Read the text once so that proofread._full_header is instantiated;
     # it is needed to prepare the expected result.
     proofread.text
     has_div = proofread._full_header._has_div
     class_pagetext, div_end = self.class_pagetext_fmt[has_div]
     actual = proofread.text
     expected = self.fmt.format(user=self.site.username(),
                                class_pagetext=class_pagetext,
                                references='<references/>',
                                div_end=div_end)
     self.assertEqual(actual, expected)
    def test_json_format(self):
        """Test conversion to json format.

        The revision content requested from the API with content format
        ``application/json`` is compared, after JSON decoding, with the
        output of ``page._page_to_json()``.
        """
        page = ProofreadPage(self.site, self.valid["title"])

        rvargs = {
            "rvprop": "ids|flags|timestamp|user|comment|content",
            "rvcontentformat": "application/json",
            "titles": page,
        }

        rvgen = self.site._generator(api.PropertyGenerator, type_arg="info|revisions", total=1, **rvargs)
        rvgen.set_maximum_items(-1)  # suppress use of rvlimit parameter

        try:
            pagedict = next(iter(rvgen))
            loaded_text = pagedict.get("revisions")[0].get("*")
        except (StopIteration, TypeError, KeyError, ValueError, IndexError) as error:
            # Without the reference text there is nothing to compare with.
            # Fail with a clear message instead of leaving ``loaded_text``
            # unbound, which would surface as a NameError below.
            self.fail("unable to load JSON revision content: {!r}".format(error))

        page_text = page._page_to_json()
        self.assertEqual(json.loads(page_text), json.loads(loaded_text))
Esempio n. 22
0
    def setUpClass(cls):
        """Prepare get_page dataset for tests.

        For every entry of ``cls.sites`` a 'get_page' list is built with the
        same structure as 'get_number': each ``[label, page_numbers]`` item
        becomes ``[label, set of ProofreadPage]``, the pages being created
        from the 'page' title template formatted with each number.
        """
        super(TestIndexPageMappings, cls).setUpClass()
        for site_name, site_def in cls.sites.items():
            site = cls.get_site(name=site_name)
            title_template = site_def['page']

            pages_by_label = site_def['get_page'] = []
            for label, numbers in site_def['get_number']:
                pages = set()
                for number in numbers:
                    pages.add(
                        ProofreadPage(site, title_template.format(number)))
                pages_by_label.append([label, pages])
Esempio n. 23
0
    def test_json_format(self):
        """Test conversion to json format.

        The revision content requested from the API with content format
        ``application/json`` is compared, after JSON decoding, with the
        output of ``page._page_to_json()``.
        """
        page = ProofreadPage(self.site, self.valid['title'])

        rvargs = {'rvprop': 'ids|flags|timestamp|user|comment|content',
                  'rvcontentformat': 'application/json',
                  'titles': page,
                  }

        rvgen = self.site._generator(api.PropertyGenerator,
                                     type_arg='info|revisions',
                                     total=1, **rvargs)
        rvgen.set_maximum_items(-1)  # suppress use of rvlimit parameter

        try:
            pagedict = next(iter(rvgen))
            loaded_text = pagedict.get('revisions')[0].get('*')
        except (StopIteration, TypeError, KeyError, ValueError,
                IndexError) as error:
            # Without the reference text there is nothing to compare with.
            # Fail with a clear message instead of leaving ``loaded_text``
            # unbound, which would surface as a NameError below.
            self.fail('unable to load JSON revision content: {!r}'
                      .format(error))

        page_text = page._page_to_json()
        self.assertEqual(json.loads(page_text), json.loads(loaded_text))
Esempio n. 24
0
    def test_get_labels(self, key):
        """Test IndexPage page get_label_from_* functions.

        The expected label is looked up once by page number and once by
        ProofreadPage; ``-1`` and ``None`` must raise KeyError respectively.
        """
        site_data = self.sites[key]
        number, title_number, expected_label = site_data['get_label']

        index_page = IndexPage(self.site, site_data['index'])
        proofread_page = ProofreadPage(
            self.site, site_data['page'].format(title_number))

        # Lookup by page number.
        self.assertEqual(
            index_page.get_label_from_page_number(number), expected_label)
        with self.assertRaises(KeyError):
            index_page.get_label_from_page_number(-1)

        # Lookup by page.
        self.assertEqual(
            index_page.get_label_from_page(proofread_page), expected_label)
        with self.assertRaises(KeyError):
            index_page.get_label_from_page(None)
Esempio n. 25
0
    def test_page_gen(self, key):
        """Test Index page generator.

        Out-of-order or negative limits must raise ValueError; with
        ``filter_ql=range(5)`` the single requested page is returned, with
        ``filter_ql=[0]`` nothing is.
        """
        site_data = self.sites[key]
        number, title_number, _label = site_data['get_label']

        index_page = IndexPage(self.site, site_data['index'])
        proofread_page = ProofreadPage(
            self.site, site_data['page'].format(title_number))

        # Start/end limits.
        for start, end in ((-1, 2), (1, -1), (2, 1)):
            with self.assertRaises(ValueError):
                index_page.page_gen(start, end)

        # Quality filters.
        all_levels = index_page.page_gen(number, number, filter_ql=range(5))
        self.assertEqual(list(all_levels), [proofread_page])

        level_zero = index_page.page_gen(number, number, filter_ql=[0])
        self.assertEqual(list(level_zero), [])
Esempio n. 26
0
 def setUpClass(cls):
     """Prepare tests by creating the index, page and missing-page objects.

     ``cls.index`` is built from ``cls.index_name``, ``cls.pages`` from
     every name in ``cls.page_names`` and ``cls.missing`` from
     ``cls.missing_name``.
     """
     super(TestIndexPageMappingsRedlinks, cls).setUpClass()
     cls.index = IndexPage(cls.site, cls.index_name)
     pages = []
     for name in cls.page_names:
         pages.append(ProofreadPage(cls.site, name))
     cls.pages = pages
     cls.missing = ProofreadPage(cls.site, cls.missing_name)
Esempio n. 27
0
 def setUp(self):
     """Set up test case: create ``self._page`` before the parent setUp."""
     page_title = 'Page:Popular Science Monthly Volume 1.djvu/12'
     self._page = ProofreadPage(self.site, page_title)
     super(TestLoadRevisionsCachingProofreadPage, self).setUp()
Esempio n. 28
0
 def setUp(self):
     """Set up test case: create ``self._page`` before the parent setUp."""
     page_title = 'Page:Popular Science Monthly Volume 1.djvu/12'
     self._page = ProofreadPage(self.site, page_title)
     super(TestBasePageMethodsProofreadPage, self).setUp()
Esempio n. 29
0
 def test_valid_site_source(self):
     """Test ProofreadPage built from a Site and a title.

     The page namespace must equal ``self.site.proofread_page_ns``.
     """
     proofread = ProofreadPage(self.site, 'Page:dummy test page')
     actual_ns = proofread.namespace()
     self.assertEqual(actual_ns, self.site.proofread_page_ns)
Esempio n. 30
0
 def test_invalid_link_source(self):
     """Test that ProofreadPage raises ValueError for an invalid Link."""
     link = pywikibot.Link(self.not_existing_invalid['title'],
                           source=self.site)
     self.assertRaises(ValueError, ProofreadPage, link)
Esempio n. 31
0
 def test_div_in_footer(self):
     """Test that the parsed footer of the fixture ends with '</div>'."""
     proofread = ProofreadPage(self.site, self.div_in_footer['title'])
     footer = proofread.footer
     self.assertTrue(footer.endswith('</div>'))
Esempio n. 32
0
 def test_decompose_recompose_text(self):
     """Test ProofreadPage page decomposing/composing text.

     The text of the ProofreadPage must be non-empty and equal to the text
     of a plain ``pywikibot.Page`` with the same title.
     """
     page = ProofreadPage(self.site, self.valid['title'])
     plain_text = pywikibot.Page(self.site, self.valid['title']).text
     # Use a unittest assertion rather than a bare ``assert``: the latter
     # is removed when Python runs with -O and gives no failure report.
     self.assertTrue(page.text)
     self.assertEqual(plain_text, page.text)
Esempio n. 33
0
class TestPageOCR(BS4TestCase):
    """Test page ocr functions.

    OCR output is compared with the reference texts in ``data`` through a
    ``difflib.SequenceMatcher`` ratio that must exceed 0.9; tests are
    skipped when the OCR call itself reports an error.
    """

    family = 'wikisource'
    code = 'en'

    cached = True

    # Apart from 'title', every entry is an ``(error, text)`` reference pair.
    data = {
        'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
        'hocr': (
            False,
            'ENTERED, according to Act of Congress, in the '
            'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
            'of the Librarian of Congress, at '
            'Washington.\n\n',
        ),
        'ocr': (
            False,
            'EsTEnen, according to Act of Congress, in the '
            'year 1872,\nBy D. APPLETON & CO.,\nIn the '
            'Office of the Librarian of Congress, at '
            'Washington.\n\u000c',
        ),
        'googleOCR': (
            False,
            'ENTERED, according to Act of Congress, in '
            'the year 1572,\nBY D. APPLETON & CO.\n'
            'In the Office of the Librarian of '
            'Congress, at Washington.\n4 334\n',
        ),
    }

    def setUp(self):
        """Create the ProofreadPage under test, then run the parent setUp."""
        self.page = ProofreadPage(self.get_site(), self.data['title'])
        super().setUp()

    def _assert_similar(self, text, ref_text):
        """Assert that *text* and *ref_text* have a match ratio above 0.9."""
        matcher = difflib.SequenceMatcher(None, text, ref_text)
        self.assertGreater(matcher.ratio(), 0.9)

    def test_ocr_exceptions(self):
        """Test page.ocr() exceptions."""
        with self.assertRaises(TypeError):
            self.page.ocr(ocr_tool='dummy')

    def test_do_hocr(self):
        """Test page._do_hocr()."""
        error, text = self.page._do_hocr()
        if error:
            self.skipTest(text)
        expected_error, expected_text = self.data['hocr']
        self.assertEqual(error, expected_error)
        self._assert_similar(text, expected_text)

    def test_do_ocr_phetools(self):
        """Test page._do_ocr(ocr_tool='phetools')."""
        error, text = self.page._do_ocr(ocr_tool='phetools')
        expected_error, expected_text = self.data['ocr']
        if error:
            self.skipTest(text)
        self.assertEqual(error, expected_error)
        self._assert_similar(text, expected_text)

    def test_do_ocr_googleocr(self):
        """Test page._do_ocr(ocr_tool='googleOCR')."""
        error, text = self.page._do_ocr(ocr_tool='googleOCR')
        if error:
            self.skipTest(text)
        expected_error, expected_text = self.data['googleOCR']
        self.assertEqual(error, expected_error)
        self._assert_similar(text, expected_text)

    def test_ocr_googleocr(self):
        """Test page.ocr(ocr_tool='googleOCR').

        An exception is tolerated only when it is a ValueError.
        """
        try:
            text = self.page.ocr(ocr_tool='googleOCR')
        except Exception as exc:
            self.assertIsInstance(exc, ValueError)
        else:
            _, expected_text = self.data['googleOCR']
            self._assert_similar(text, expected_text)
 def test_valid_site_source(self):
     """Test ProofreadPage built from a Site and a title.

     The page namespace must equal ``self.site.proofread_page_ns``.
     """
     proofread = ProofreadPage(self.site, 'Page:dummy test page')
     actual_ns = proofread.namespace()
     self.assertEqual(actual_ns, self.site.proofread_page_ns)
Esempio n. 35
0
 def setUp(self):
     """Create the ProofreadPage under test, then run the parent setUp."""
     self.page = ProofreadPage(self.get_site(), self.data['title'])
     super(TestPageOCR, self).setUp()
Esempio n. 36
0
 def test_invalid_not_existing_page_source_wrong_ns(self):
     """Test ProofreadPage from Page not existing in non-Page ns.

     Wrapping the Page built from the 'title1' fixture must raise
     ValueError.
     """
     wrong_ns_page = pywikibot.Page(self.site,
                                    self.not_existing_invalid['title1'])
     self.assertRaises(ValueError, ProofreadPage, wrong_ns_page)
Esempio n. 37
0
class TestPageOCR(TestCase):

    """Test page ocr functions.

    OCR results are compared for exact equality with the ``(error, text)``
    reference pairs stored in ``data``.
    """

    family = 'wikisource'
    code = 'en'

    cached = True

    # Apart from 'title', every entry is an ``(error, text)`` reference pair.
    data = {
        'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
        'hocr': (
            False,
            'ENTERED, according to Act of Congress, in the '
            'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
            'of the Librarian of Congress, at '
            'Washington.\n\n',
        ),
        'ocr': (
            False,
            'lam-mam, according to Act of Congress, in the '
            'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
            'Of\ufb01ce or the Librarian of '
            'Congress, at Washington.\n\n',
        ),
        'googleOCR': (
            False,
            'ENTERED, according to Act of Congress, in '
            'the year 1572,\nBY D. APPLETON & CO.\n'
            'In the Office of the Librarian of '
            'Congress, at Washington.\n4 334\n',
        ),
    }

    def setUp(self):
        """Create the ProofreadPage under test, then run the parent setUp."""
        self.page = ProofreadPage(self.get_site(), self.data['title'])
        super(TestPageOCR, self).setUp()

    def _assert_do_ocr(self, ocr_tool, data_key):
        """Assert that ``_do_ocr(ocr_tool)`` equals ``data[data_key]``."""
        error, text = self.page._do_ocr(ocr_tool=ocr_tool)
        expected_error, expected_text = self.data[data_key]
        self.assertEqual(error, expected_error)
        self.assertEqual(text, expected_text)

    def test_ocr_exceptions(self):
        """Test page.ocr() exceptions."""
        with self.assertRaises(TypeError):
            self.page.ocr(ocr_tool='dummy')

    def test_do_hocr(self):
        """Test page._do_hocr()."""
        error, text = self.page._do_hocr()
        expected_error, expected_text = self.data['hocr']
        self.assertEqual(error, expected_error)
        self.assertEqual(text, expected_text)

    def test_do_ocr_phetools_raw_request(self):
        """Test page._do_ocr connection with wmflabs.

        The phetools OCR url is fetched directly and must answer with
        status 200.
        """
        uri = ('https://tools.wmflabs.org/phetools/ocr.php?cmd=ocr'
               '&url=https://upload.wikimedia.org/wikipedia/commons/'
               'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
               'page10-1024px-Popular_Science_Monthly_Volume_1.djvu.jpg'
               '&lang=en&user=None')
        status = http.fetch(uri).status
        self.assertEqual(status, 200)

    def test_do_ocr_phetools(self):
        """Test page._do_ocr(ocr_tool='phetools')."""
        self._assert_do_ocr('phetools', 'ocr')

    def test_do_ocr_googleocr(self):
        """Test page._do_ocr(ocr_tool='googleOCR')."""
        self._assert_do_ocr('googleOCR', 'googleOCR')

    def test_ocr_googleocr(self):
        """Test page.ocr(ocr_tool='googleOCR')."""
        text = self.page.ocr(ocr_tool='googleOCR')
        _, expected_text = self.data['googleOCR']
        self.assertEqual(text, expected_text)
Esempio n. 38
0
 def test_invalid_site_source(self):
     """Test ProofreadPage from invalid Site as source.

     Creating the page must raise ``pywikibot.UnknownExtension``.
     """
     self.assertRaises(pywikibot.UnknownExtension,
                       ProofreadPage, self.site, 'title')