def gen(self):
    """Generate pages from specified page interval."""
    for page_number in self.page_number_gen():
        title = '{prefix}/{number}'.format(prefix=self._prefix,
                                           number=page_number)
        page = ProofreadPage(self._index.site, title)
        page.page_number = page_number  # remember page number in djvu file
        yield page
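
# A minimal sketch of driving such a generator via IndexPage.page_gen(),
# which yields ProofreadPage objects for a page interval; the index title is
# only an example and network access to Wikisource is assumed.
import pywikibot
from pywikibot.proofreadpage import IndexPage

site = pywikibot.Site('en', 'wikisource')
index = IndexPage(site, 'Index:Popular Science Monthly Volume 1.djvu')
for page in index.page_gen(1, 3):
    # page.page_number is set by the generator, as shown above.
    print(page.title(), page.page_number)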
def test_valid_link_source(self):
    """Test ProofreadPage from valid Link as source."""
    source = pywikibot.Link(self.valid['title'],
                            source=self.site,
                            default_namespace=self.site.proofread_page_ns)
    page = ProofreadPage(source)
    self.assertEqual(page.title(with_ns=False), source.title)
    self.assertEqual(page.namespace(), source.namespace)
def test_preload_from_empty_text(self):
    """Test ProofreadPage page decomposing/composing text."""
    page = ProofreadPage(self.site, 'dummy test page')
    page.text = ''
    self.assertEqual(page.text,
                     '<noinclude><pagequality level="1" user="" />'
                     '<div class="pagetext">\n\n\n</noinclude>'
                     '<noinclude></div></noinclude>')
def test_preload_from_empty_text(self):
    """Test ProofreadPage page decomposing/composing text."""
    page = ProofreadPage(self.site, 'dummy test page')
    page.text = ''
    # The user attribute is filled in via %-formatting below.
    self.assertEqual(
        page.text,
        '<noinclude><pagequality level="1" user="%s" />'
        '<div class="pagetext">\n\n\n</noinclude>'
        '<noinclude></div></noinclude>' % self.site.username())
def test_invalid_not_existing_page_source(self):
    """Test ProofreadPage from invalid not existing Page as source."""
    # namespace is forced
    source = pywikibot.Page(self.site,
                            self.not_existing_invalid['title'])
    fixed_source = pywikibot.Page(self.site,
                                  source.title(with_ns=False),
                                  ns=self.site.proofread_page_ns)
    page = ProofreadPage(fixed_source)
    self.assertEqual(page.title(), fixed_source.title())
def test_preload_from_empty_text(self):
    """Test ProofreadPage page decomposing/composing text."""
    page = ProofreadPage(self.site, 'Page:dummy test page')
    page.text = ''
    class_pagetext, div = self.class_pagetext_fmt[
        page._full_header._has_div]
    self.assertEqual(page.text,
                     self.fmt.format(user=self.site.username(),
                                     class_pagetext=class_pagetext,
                                     references='',
                                     div_end=div))
def test_url_image(self):
    """Test fetching of url image of the scan of ProofreadPage."""
    page = ProofreadPage(self.site, self.valid['title'])
    self.assertEqual(page.url_image, self.valid['url_image'])

    page = ProofreadPage(self.site, self.valid_redlink['title'])
    self.assertEqual(page.url_image, self.valid_redlink['url_image'])

    page = ProofreadPage(self.site, self.existing_unlinked['title'])
    # test Exception in property.
    self.assertRaises(ValueError, getattr, page, 'url_image')
def test_index(self):
    """Test index property."""
    # Page with Index.
    page = ProofreadPage(self.site, self.valid['title'])
    index_page = IndexPage(self.site, self.valid['index'])

    # Test property.
    self.assertEqual(page.index, index_page)

    # Test deleter.
    del page.index
    self.assertFalse(hasattr(page, '_index'))

    # Test setter with wrong type.
    with self.assertRaises(TypeError):
        page.index = 'invalid index'

    # Test setter with correct type.
    page.index = index_page
    self.assertEqual(page.index, index_page)

    # Page linked to several Indexes: the first one wins.
    page = ProofreadPage(self.site, self.existing_multilinked['title'])
    index_page_1 = IndexPage(self.site,
                             self.existing_multilinked['index_1'])
    index_page_2 = IndexPage(self.site,
                             self.existing_multilinked['index_2'])
    self.assertEqual(page.index, index_page_1)
    self.assertNotEqual(page.index, index_page_2)
    self.assertEqual(page._index, (index_page_1, [index_page_2]))

    # Page without Index.
    page = ProofreadPage(self.site, self.existing_unlinked['title'])
    self.assertIsNone(page.index)
    self.assertEqual(page._index, (None, []))
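
# A compact sketch of the index property exercised above: reading it returns
# the IndexPage the Page belongs to (or None when unlinked), and it accepts
# an IndexPage instance on assignment; the concrete titles are examples only.
import pywikibot
from pywikibot.proofreadpage import IndexPage, ProofreadPage

site = pywikibot.Site('en', 'wikisource')
page = ProofreadPage(site, 'Page:Popular Science Monthly Volume 1.djvu/12')
print(page.index)  # IndexPage or None
page.index = IndexPage(site, 'Index:Popular Science Monthly Volume 1.djvu')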
def process_pages(self, temp_data):
    """Collect quality level and refs of each lemma page into self.data."""
    for idx, lemma in enumerate(self.lemmas):
        try:
            # The lemma title encodes year and page number.
            hit = self.regex_page.search(lemma["title"])
            year = hit.group(1)
            page = hit.group(2)
            if year not in self.data["pages"]:
                self.data["pages"][year] = {}
            proofread_lemma = ProofreadPage(self.wiki,
                                            f"Seite:{lemma['title']}")
            if self.debug:
                self.logger.debug(
                    f"{idx + 1}/{len(self.lemmas)} Page {page}({year}) "
                    f"has quality level {proofread_lemma.quality_level} "
                    f"_ Seite:{lemma['title']}")
            ref = search_for_refs(proofread_lemma.text)
            page_dict = {"q": int(proofread_lemma.quality_level)}
            if ref:
                self.logger.debug(
                    f"There are refs ({ref}) @ {year}, {page}")
                page_dict.update({"r": ref})
            self.data["pages"][year][page] = page_dict
            if year not in temp_data:
                temp_data[year] = []
            temp_data[year].append(page)
        except Exception as error:  # pylint: disable=broad-except
            self.logger.error(
                f"wasn't able to process {lemma['title']}, error: {error}")
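
# search_for_refs() is not defined in the snippet above. A minimal sketch of
# what such a helper might look like, assuming it scans the wikitext for
# footnote markers; the signature and return type are assumptions.
import re


def search_for_refs(text):
    """Return the list of <ref>-like tags found in the wikitext."""
    return re.findall(r'<ref[^/>]*>|<references\s*/?>', text)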
def test_preload_from_not_existing_page(self):
    """Test ProofreadPage page decomposing/composing text."""
    page = ProofreadPage(self.site, 'dummy test page')
    self.assertEqual(
        page.text,
        '<noinclude><pagequality level="1" user="" />'
        '<div class="pagetext">\n\n\n</noinclude>'
        '<noinclude><references/></div></noinclude>')
def test_valid_parsing(self):
    """Test ProofreadPage page parsing functions."""
    page = ProofreadPage(self.site, self.valid['title'])
    self.assertEqual(page.ql, self.valid['ql'])
    self.assertEqual(page.user, self.valid['user'])
    self.assertEqual(page.header, self.valid['header'])
    self.assertEqual(page.footer, self.valid['footer'])
def test_applicable_quality_level(self):
    """Test Page.quality_level when applicable."""
    site = self.get_site()
    title = 'Page:Popular Science Monthly Volume 49.djvu/1'
    page = ProofreadPage(site, title)
    self.assertEqual(page.content_model, 'proofread-page')
    self.assertEqual(page.quality_level, 0)
def test_parse_title(self, key):
    """Test ProofreadPage._parse_title() function."""
    data = self.sites[key]
    title = data['title']
    base, base_ext, num = data['tuple']

    page = ProofreadPage(self.site, title)
    self.assertEqual(page._base, base)
    self.assertEqual(page._base_ext, base_ext)
    self.assertEqual(page._num, num)
def test_preload_from_not_existing_page(self):
    """Test ProofreadPage page decomposing/composing text."""
    page = ProofreadPage(self.site, 'Page:dummy test page')
    # Fetch page text to instantiate page._full_header, in order to allow
    # for proper test result preparation.
    page.text
    class_pagetext, div = self.class_pagetext_fmt[
        page._full_header._has_div]
    self.assertEqual(page.text,
                     self.fmt.format(user=self.site.username(),
                                     class_pagetext=class_pagetext,
                                     references='<references/>',
                                     div_end=div))
@classmethod
def setUpClass(cls):
    """Prepare get_page dataset for tests."""
    super(TestIndexPageMappings, cls).setUpClass()
    for key, site_def in cls.sites.items():
        site = cls.get_site(name=key)
        base_title = site_def['page']

        # 'get_page' has same structure as 'get_number'.
        site_def['get_page'] = []
        for label, page_numbers in site_def['get_number']:
            page_set = {ProofreadPage(site, base_title.format(i))
                        for i in page_numbers}
            site_def['get_page'].append([label, page_set])
def test_json_format(self):
    """Test conversion to json format."""
    page = ProofreadPage(self.site, self.valid['title'])

    rvargs = {'rvprop': 'ids|flags|timestamp|user|comment|content',
              'rvcontentformat': 'application/json',
              'titles': page,
              }

    rvgen = self.site._generator(api.PropertyGenerator,
                                 type_arg='info|revisions',
                                 total=1, **rvargs)
    rvgen.set_maximum_items(-1)  # suppress use of rvlimit parameter

    try:
        pagedict = next(iter(rvgen))
        loaded_text = pagedict.get('revisions')[0].get('*')
    except (StopIteration, TypeError, KeyError, ValueError, IndexError):
        # Fall back to empty text if the revision could not be fetched.
        loaded_text = ''

    page_text = page._page_to_json()
    self.assertEqual(json.loads(page_text), json.loads(loaded_text))
def test_get_labels(self, key):
    """Test IndexPage page get_label_from_* functions."""
    data = self.sites[key]
    num, title_num, label = data['get_label']

    index_page = IndexPage(self.site, self.sites[key]['index'])
    page_title = self.sites[key]['page'].format(title_num)
    proofread_page = ProofreadPage(self.site, page_title)

    # Get label from number.
    self.assertEqual(index_page.get_label_from_page_number(num), label)
    # Error if number does not exist.
    self.assertRaises(KeyError, index_page.get_label_from_page_number, -1)

    # Get label from page.
    self.assertEqual(index_page.get_label_from_page(proofread_page), label)
    # Error if page does not exist.
    self.assertRaises(KeyError, index_page.get_label_from_page, None)
def test_page_gen(self, key):
    """Test Index page generator."""
    data = self.sites[key]
    num, title_num, label = data['get_label']

    index_page = IndexPage(self.site, self.sites[key]['index'])
    page_title = self.sites[key]['page'].format(title_num)
    proofread_page = ProofreadPage(self.site, page_title)

    # Check start/end limits.
    self.assertRaises(ValueError, index_page.page_gen, -1, 2)
    self.assertRaises(ValueError, index_page.page_gen, 1, -1)
    self.assertRaises(ValueError, index_page.page_gen, 2, 1)

    # Check quality filters.
    gen = index_page.page_gen(num, num, filter_ql=range(5))
    self.assertEqual(list(gen), [proofread_page])

    gen = index_page.page_gen(num, num, filter_ql=[0])
    self.assertEqual(list(gen), [])
@classmethod
def setUpClass(cls):
    """Prepare tests by creating page instances."""
    super(TestIndexPageMappingsRedlinks, cls).setUpClass()
    cls.index = IndexPage(cls.site, cls.index_name)
    cls.pages = [ProofreadPage(cls.site, page)
                 for page in cls.page_names]
    cls.missing = ProofreadPage(cls.site, cls.missing_name)
def setUp(self):
    """Set up test case."""
    self._page = ProofreadPage(
        self.site, 'Page:Popular Science Monthly Volume 1.djvu/12')
    super(TestLoadRevisionsCachingProofreadPage, self).setUp()
def setUp(self):
    """Set up test case."""
    self._page = ProofreadPage(
        self.site, 'Page:Popular Science Monthly Volume 1.djvu/12')
    super(TestBasePageMethodsProofreadPage, self).setUp()
def test_valid_site_source(self):
    """Test ProofreadPage from valid Site as source."""
    page = ProofreadPage(self.site, 'Page:dummy test page')
    self.assertEqual(page.namespace(), self.site.proofread_page_ns)
def test_invalid_link_source(self):
    """Test ProofreadPage from invalid Link as source."""
    source = pywikibot.Link(self.not_existing_invalid['title'],
                            source=self.site)
    with self.assertRaises(ValueError):
        ProofreadPage(source)
def test_div_in_footer(self):
    """Test ProofreadPage page parsing functions."""
    page = ProofreadPage(self.site, self.div_in_footer['title'])
    self.assertTrue(page.footer.endswith('</div>'))
def test_decompose_recompose_text(self):
    """Test ProofreadPage page decomposing/composing text."""
    page = ProofreadPage(self.site, self.valid['title'])
    plain_text = pywikibot.Page(self.site, self.valid['title']).text
    self.assertTrue(page.text)
    self.assertEqual(plain_text, page.text)
class TestPageOCR(BS4TestCase):

    """Test page ocr functions."""

    family = 'wikisource'
    code = 'en'
    cached = True

    data = {
        'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
        'hocr': (False, 'ENTERED, according to Act of Congress, in the '
                        'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
                        'of the Librarian of Congress, at '
                        'Washington.\n\n'),
        'ocr': (False, 'EsTEnen, according to Act of Congress, in the '
                       'year 1872,\nBy D. APPLETON & CO.,\nIn the '
                       'Office of the Librarian of Congress, at '
                       'Washington.\n\u000c'),
        'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
                             'the year 1572,\nBY D. APPLETON & CO.\n'
                             'In the Office of the Librarian of '
                             'Congress, at Washington.\n4 334\n'),
    }

    def setUp(self):
        """Test setUp."""
        site = self.get_site()
        title = self.data['title']
        self.page = ProofreadPage(site, title)
        super().setUp()

    def test_ocr_exceptions(self):
        """Test page.ocr() exceptions."""
        self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy')

    def test_do_hocr(self):
        """Test page._do_hocr()."""
        error, text = self.page._do_hocr()
        if error:
            self.skipTest(text)
        ref_error, ref_text = self.data['hocr']
        self.assertEqual(error, ref_error)
        s = difflib.SequenceMatcher(None, text, ref_text)
        self.assertGreater(s.ratio(), 0.9)

    def test_do_ocr_phetools(self):
        """Test page._do_ocr(ocr_tool='phetools')."""
        error, text = self.page._do_ocr(ocr_tool='phetools')
        ref_error, ref_text = self.data['ocr']
        if error:
            self.skipTest(text)
        self.assertEqual(error, ref_error)
        s = difflib.SequenceMatcher(None, text, ref_text)
        self.assertGreater(s.ratio(), 0.9)

    def test_do_ocr_googleocr(self):
        """Test page._do_ocr(ocr_tool='googleOCR')."""
        error, text = self.page._do_ocr(ocr_tool='googleOCR')
        if error:
            self.skipTest(text)
        ref_error, ref_text = self.data['googleOCR']
        self.assertEqual(error, ref_error)
        s = difflib.SequenceMatcher(None, text, ref_text)
        self.assertGreater(s.ratio(), 0.9)

    def test_ocr_googleocr(self):
        """Test page.ocr(ocr_tool='googleOCR')."""
        try:
            text = self.page.ocr(ocr_tool='googleOCR')
        except Exception as exc:
            self.assertIsInstance(exc, ValueError)
        else:
            ref_error, ref_text = self.data['googleOCR']
            s = difflib.SequenceMatcher(None, text, ref_text)
            self.assertGreater(s.ratio(), 0.9)
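
# A short usage sketch for the OCR entry point exercised above; the tool
# names ('phetools', 'googleOCR') are the ones the tests use, and network
# access to the OCR services is assumed.
import pywikibot
from pywikibot.proofreadpage import ProofreadPage

site = pywikibot.Site('en', 'wikisource')
page = ProofreadPage(site, 'Page:Popular Science Monthly Volume 1.djvu/10')
try:
    text = page.ocr(ocr_tool='googleOCR')
except ValueError:
    # ocr() raises ValueError when the OCR backend reports an error.
    text = ''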
def test_invalid_not_existing_page_source_wrong_ns(self):
    """Test ProofreadPage from Page not existing in non-Page ns."""
    source = pywikibot.Page(self.site,
                            self.not_existing_invalid['title1'])
    with self.assertRaises(ValueError):
        ProofreadPage(source)
class TestPageOCR(TestCase):

    """Test page ocr functions."""

    family = 'wikisource'
    code = 'en'
    cached = True

    data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
            'hocr': (False, 'ENTERED, according to Act of Congress, in the '
                            'year 1872,\nBY D. APPLETON & CO.,\nIn the '
                            'Office of the Librarian of Congress, at '
                            'Washington.\n\n'),
            'ocr': (False, 'lam-mam, according to Act of Congress, in the '
                           'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
                           'Of\ufb01ce or the Librarian of '
                           'Congress, at Washington.\n\n'),
            'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
                                 'the year 1572,\nBY D. APPLETON & CO.\n'
                                 'In the Office of the Librarian of '
                                 'Congress, at Washington.\n4 334\n'),
            }

    def setUp(self):
        """Test setUp."""
        site = self.get_site()
        title = self.data['title']
        self.page = ProofreadPage(site, title)
        super(TestPageOCR, self).setUp()

    def test_ocr_exceptions(self):
        """Test page.ocr() exceptions."""
        self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy')

    def test_do_hocr(self):
        """Test page._do_hocr()."""
        error, text = self.page._do_hocr()
        ref_error, ref_text = self.data['hocr']
        self.assertEqual(error, ref_error)
        self.assertEqual(text, ref_text)

    def test_do_ocr_phetools_raw_request(self):
        """Test page._do_ocr connection with wmflabs."""
        uri = ('https://tools.wmflabs.org/phetools/ocr.php?cmd=ocr'
               '&url=https://upload.wikimedia.org/wikipedia/commons/'
               'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
               'page10-1024px-Popular_Science_Monthly_Volume_1.djvu.jpg'
               '&lang=en&user=None')
        response = http.fetch(uri)
        self.assertEqual(response.status, 200)

    def test_do_ocr_phetools(self):
        """Test page._do_ocr(ocr_tool='phetools')."""
        error, text = self.page._do_ocr(ocr_tool='phetools')
        ref_error, ref_text = self.data['ocr']
        self.assertEqual(error, ref_error)
        self.assertEqual(text, ref_text)

    def test_do_ocr_googleocr(self):
        """Test page._do_ocr(ocr_tool='googleOCR')."""
        error, text = self.page._do_ocr(ocr_tool='googleOCR')
        ref_error, ref_text = self.data['googleOCR']
        self.assertEqual(error, ref_error)
        self.assertEqual(text, ref_text)

    def test_ocr_googleocr(self):
        """Test page.ocr(ocr_tool='googleOCR')."""
        text = self.page.ocr(ocr_tool='googleOCR')
        ref_error, ref_text = self.data['googleOCR']
        self.assertEqual(text, ref_text)
def test_invalid_site_source(self):
    """Test ProofreadPage from invalid Site as source."""
    with self.assertRaises(pywikibot.UnknownExtension):
        ProofreadPage(self.site, 'title')
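
# Sketch: as the test above shows, ProofreadPage raises UnknownExtension when
# the target site lacks the ProofreadPage extension, so a guard like this is
# one way to degrade gracefully (the site objects here are examples).
import pywikibot
from pywikibot.proofreadpage import ProofreadPage

site = pywikibot.Site('en', 'wikipedia')  # no ProofreadPage extension
try:
    page = ProofreadPage(site, 'Page:dummy test page')
except pywikibot.UnknownExtension:
    page = None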