def main():
    """Upload transcoded text for a fixed page range of one Index page.

    The wiki (English Wikisource), the Index title and the page range are
    hard-coded; the resulting page generator is handed to
    ``ubot.UploadTranscodedPageBot``.
    """
    wikisource = pywikibot.Site('en', 'wikisource')
    # Earlier experiments pointed at a Hebrew-titled PDF index instead of
    # the DjVu file used below.
    index_title = "Index:Catholic_Encyclopedia,_volume_17.djvu"
    index = IndexPage(wikisource, index_title)

    # A disabled variant took the page numbers from an SQLite query rather
    # than from the fixed range below.
    # TODO: clarify what ``content`` means and what ``filter_ql`` should be.
    # TODO: page_gen raised here on 'qualityN prp-pagequality-N' or
    # class="new" markup; a local patch (proffread.patch) was added for it.
    page_numbers = range(6, 7)
    per_page_gens = [
        index.page_gen(start=number, end=number, filter_ql=None,
                       content=False)
        for number in page_numbers
    ]
    all_pages = itertools.chain(*per_page_gens)

    pywikibot.output('\nUploading text to %s\n' % index.title(asLink=True))
    uploader = ubot.UploadTranscodedPageBot(all_pages, site=index.site)
    uploader.run()
def test_page_gen(self, key):
    """Check start/end validation and quality filtering of page_gen."""
    site_data = self.sites[key]
    num, title_num, _label = site_data['get_label']
    index_page = IndexPage(self.site, site_data['index'])
    expected_page = ProofreadPage(
        self.site, site_data['page'].format(title_num))

    # Negative or inverted (start, end) pairs must be rejected.
    invalid_limits = ((-1, 2), (1, -1), (2, 1))
    for start, end in invalid_limits:
        self.assertRaises(ValueError, index_page.page_gen, start, end)

    # With quality levels 0-4 allowed the single page is yielded ...
    self.assertEqual(
        list(index_page.page_gen(num, num, filter_ql=range(5))),
        [expected_page])
    # ... and with only level 0 allowed nothing is.
    self.assertEqual(
        list(index_page.page_gen(num, num, filter_ql=[0])),
        [])
def test_page_gen(self, key):
    """Exercise IndexPage.page_gen: limit checks, then quality filters."""
    entry = self.sites[key]
    num, title_num, _unused_label = entry['get_label']
    index_page = IndexPage(self.site, entry['index'])
    title = entry['page'].format(title_num)
    wanted = ProofreadPage(self.site, title)

    # Start/end limits: each of these calls has to raise ValueError.
    with self.assertRaises(ValueError):
        index_page.page_gen(-1, 2)
    with self.assertRaises(ValueError):
        index_page.page_gen(1, -1)
    with self.assertRaises(ValueError):
        index_page.page_gen(2, 1)

    # Quality filters: every level allowed, then only level 0.
    any_quality = index_page.page_gen(num, num, filter_ql=range(5))
    self.assertEqual(list(any_quality), [wanted])
    level_zero_only = index_page.page_gen(num, num, filter_ql=[0])
    self.assertEqual(list(level_zero_only), [])
def main():
    """Add the quality-level category to pages of the Basque indexes.

    Iterates over the members of the ``Indizeak euskaraz`` category on the
    multilingual Wikisource, skipping titles already present in ``done``.
    For every existing page of an index whose text does not yet contain the
    category returned by ``getCategoryNeeded(page.quality_level)``, each
    entry of ``categories`` is stripped from the text, the needed category
    is written into a ``<noinclude>`` section and the page is saved.

    Pages without any ``<noinclude>`` section are reported and skipped;
    previously they aborted the whole run with an IndexError.
    """
    site = pywikibot.Site("mul", "wikisource")
    arts = pywikibot.Category(site, u"Indizeak euskaraz").articles(
        recurse=1, reverse=True, content=False)

    # Disabled one-off snippet, kept for reference (it used to live here as
    # a bare string literal, i.e. a no-op):
    #   indexPage = IndexPage(site, u'Index:Chantspopulaires00sall.pdf')
    #   pages = indexPage.page_gen(only_existing=True, content=True)
    #   for page in pages:
    #       if not page.isRedirectPage() and page.exists():
    #           name = page.title()
    #           title = name.replace(
    #               u"Chantspopulaires00sall",
    #               u"Chants populaires du pays basque (1870)")
    #           print title
    #           #page.move(title,
    #           #          reason="File renamed in Wikimedia Commons",
    #           #          movetalkpage=True)
    #           raw_input('Are you sure? (y/n)')
    #   exit(0)

    # Captures each "<noinclude>..." opening tag plus content, without the
    # closing tag. Compiled once because it is applied to every page.
    noinclude_re = re.compile(
        r"(<noinclude>(?:[\S\s]+?))(?:<\/noinclude>)")

    for art in arts:
        art_title = art.title()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(art_title)
        if art_title in done:
            continue

        index_page = IndexPage(art)
        try:
            pages = index_page.page_gen(only_existing=True, content=True)
        except ValueError:
            # page_gen raised ValueError for this index; try the next one.
            continue

        for page in pages:
            if not page.exists():
                continue
            print(page)
            cat = getCategoryNeeded(page.quality_level)
            old_text = page.text
            print(cat)
            if cat in old_text:
                # Already categorised for its quality level.
                continue

            # Drop every known category before inserting the needed one.
            new_text = old_text
            for old_cat in categories:
                new_text = new_text.replace(old_cat, "")
            print(new_text)

            sections = noinclude_re.findall(new_text)
            if not sections:
                # Nothing to anchor the category to; do not crash the run.
                pywikibot.output(
                    u'Skipping {0}: no <noinclude> section found'.format(
                        page.title()))
                continue
            if len(sections) == 1:
                # Only one section matched: fill the empty noinclude block.
                new_text = new_text.replace(
                    u"<noinclude></noinclude>",
                    u"<noinclude>{0}</noinclude>".format(cat))
            else:
                # Append the category after the second section's content
                # (the original code calls this section the footer).
                footer = sections[1]
                new_text = new_text.replace(
                    footer, u"{0}\n{1}".format(footer, cat))

            pywikibot.showDiff(old_text, new_text)
            page.put(new_text,
                     comment=u'Added category {0}'.format(cat),
                     minorEdit=True)
def main(*args):
    """
    Parse the command line and run the text upload bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    index_title = None
    pages = '1-'
    options = {}

    # Parse command line arguments.
    for argument in pywikibot.handle_args(args):
        name, _, value = argument.partition(':')
        if name == '-index':
            index_title = value
        elif name == '-pages':
            pages = value
        elif name == '-showdiff':
            issue_deprecation_warning('The usage of -showdiff option', None, 0)
        elif name == '-summary':
            options['summary'] = value
        elif name == '-force':
            issue_deprecation_warning('The usage of -force option', None, 0)
        elif name == '-always':
            options['always'] = True
        else:
            pywikibot.output('Unknown argument %s' % name)

    # index is mandatory.
    if not index_title:
        pywikibot.bot.suggest_help(missing_parameters=['-index'])
        return False

    site = pywikibot.Site()
    if not site.has_extension('ProofreadPage'):
        pywikibot.error('Site %s must have ProofreadPage extension.' % site)
        return False

    index = IndexPage(site, index_title)
    if not index.exists():
        pywikibot.error("Page %s doesn't exist." % index)
        return False

    # Turn the comma-separated -pages value into (start, end) tuples.
    intervals = []
    for spec in pages.split(','):
        first, dash, last = spec.partition('-')
        start = int(first) if first else 1
        if dash:
            # An open-ended interval runs to the last page of the index.
            end = int(last) if last else index.num_pages
        else:
            end = start
        intervals.append((start, end))

    # One page generator per interval, consumed in sorted interval order.
    generators = [
        index.page_gen(start=start, end=end, filter_ql=[1], content=False)
        for start, end in sorted(intervals)
    ]
    gen = itertools.chain(*generators)

    pywikibot.output('\nUploading text to %s\n' % index.title(asLink=True))
    bot = UploadTextBot(gen, site=index.site, **options)
    bot.run()
def main(*args):
    """
    Parse the command line and run the text upload bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: str
    """
    # Switches that simply set a boolean option.
    flag_options = {
        '-showdiff': 'showdiff',
        '-force': 'force',
        '-always': 'always',
    }

    index_title = None
    pages = '1-'
    options = {}

    # Parse command line arguments.
    for argument in pywikibot.handle_args(args):
        name, _, value = argument.partition(':')
        if name in flag_options:
            options[flag_options[name]] = True
        elif name == '-index':
            index_title = value
        elif name == '-pages':
            pages = value
        elif name == '-summary':
            options['summary'] = value
        elif name == '-ocr':
            options['ocr'] = value or 'phetools'
        elif name == '-threads':
            options['threads'] = int(value)
        else:
            pywikibot.output('Unknown argument ' + name)

    # index is mandatory.
    if not index_title:
        pywikibot.bot.suggest_help(missing_parameters=['-index'])
        return

    # '-force' can be used with '-ocr' only.
    if 'force' in options and 'ocr' not in options:
        pywikibot.error("'-force' can be used with '-ocr' option only.")
        return

    site = pywikibot.Site()
    if not site.has_extension('ProofreadPage'):
        pywikibot.error(
            'Site {} must have ProofreadPage extension.'.format(site))
        return

    index = IndexPage(site, index_title)
    if not index.exists():
        pywikibot.error("Page {} doesn't exist.".format(index))
        return

    # Turn the comma-separated -pages value into (start, end) tuples.
    intervals = []
    for spec in pages.split(','):
        first, dash, last = spec.partition('-')
        start = int(first) if first else 1
        if not dash:
            end = start
        elif last:
            end = int(last)
        else:
            # Open-ended interval: run to the last page of the index.
            end = index.num_pages
        intervals.append((start, end))

    # gen yields ProofreadPage objects.
    generators = [
        index.page_gen(start=start, end=end, filter_ql=[1], content=True)
        for start, end in sorted(intervals)
    ]
    gen = itertools.chain(*generators)

    pywikibot.output('\nUploading text to {}\n'.format(
        index.title(as_link=True)))
    bot = UploadTextBot(gen, site=index.site, **options)
    bot.run()
def main(*args):
    """
    Parse the command line and run the text upload bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    # Switches that simply set a boolean option.
    flag_options = {
        '-showdiff': 'showdiff',
        '-ocr': 'ocr',
        '-force': 'force',
        '-always': 'always',
    }

    index_title = None
    pages = '1-'
    options = {}

    # Parse command line arguments.
    for argument in pywikibot.handle_args(args):
        name, _, value = argument.partition(':')
        if name in flag_options:
            options[flag_options[name]] = True
        elif name == '-index':
            index_title = value
        elif name == '-pages':
            pages = value
        elif name == '-summary':
            options['summary'] = value
        else:
            pywikibot.output('Unknown argument %s' % name)

    # index is mandatory.
    if not index_title:
        pywikibot.bot.suggest_help(missing_parameters=['-index'])
        return False

    # '-force' can be used with '-ocr' only.
    if 'force' in options and 'ocr' not in options:
        pywikibot.error("'-force' can be used with '-ocr' option only.")
        return False

    site = pywikibot.Site()
    if not site.has_extension('ProofreadPage'):
        pywikibot.error('Site %s must have ProofreadPage extension.' % site)
        return False

    index = IndexPage(site, index_title)
    if not index.exists():
        pywikibot.error("Page %s doesn't exist." % index)
        return False

    # Turn the comma-separated -pages value into (start, end) tuples.
    intervals = []
    for position, spec in enumerate(pages.split(',')):
        first, dash, last = spec.partition('-')
        start = int(first) if first else 1
        if dash:
            # An open-ended interval runs to the last page of the index.
            end = int(last) if last else index.num_pages
        else:
            end = start
        intervals.insert(position, (start, end))

    # gen yields ProofreadPage objects.
    generators = []
    for start, end in sorted(intervals):
        generators.append(
            index.page_gen(start=start, end=end, filter_ql=[1],
                           content=False))
    gen = itertools.chain(*generators)

    pywikibot.output('\nUploading text to %s\n' % index.title(asLink=True))
    bot = UploadTextBot(gen, site=index.site, **options)
    bot.run()