def test_lines(self):
    """Test TextfilePageGenerator with titles on separate lines."""
    filename = os.path.join(_data_dir, 'pagelist-lines.txt')
    site = self.get_site()
    titles = list(pagegenerators.TextfilePageGenerator(filename, site))
    self.assertEqual(len(titles), len(self.expected_titles))
    expected_titles = [
        expected_title[self.title_columns[site.namespaces[
            page.namespace()].case]]
        for expected_title, page in zip(self.expected_titles, titles)
    ]
    self.assertEqual([page.title() for page in titles], expected_titles)
def test_brackets(self):
    """Test TextfilePageGenerator with brackets."""
    filename = join_data_path('pagelist-brackets.txt')
    site = self.get_site()
    titles = list(pagegenerators.TextfilePageGenerator(filename, site))
    self.assertEqual(len(titles), len(self.expected_titles))
    expected_titles = [
        expected_title[self.title_columns[site.namespaces[
            page.namespace()].case]]
        for expected_title, page in zip(self.expected_titles, titles)
    ]
    self.assertEqual([page.title() for page in titles], expected_titles)
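The two tests above cover the two input formats TextfilePageGenerator accepts: one title per line, or titles written as [[bracketed]] wikilinks. A minimal standalone sketch (the file name and titles here are made-up examples, not the test fixtures):

import pywikibot
from pywikibot import pagegenerators

# Hypothetical title file: one title per line. Titles could equally be
# written as [[bracketed]] wikilinks anywhere in the file.
with open('titles.txt', 'w', encoding='utf-8') as f:
    f.write('Main Page\nTalk:Sandbox\nFile:Example.svg\n')

site = pywikibot.Site()
for page in pagegenerators.TextfilePageGenerator('titles.txt', site=site):
    print(page.title())  # yields pywikibot.Page objects, one per title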
def main():
    # if -file is not used, this temporary array is used to read the page title.
    pageTitle = []
    page = None
    gen = None
    interwiki = False
    keep_name = False
    targetLang = None
    targetFamily = None

    for arg in pywikibot.handleArgs():
        if arg == '-interwiki':
            interwiki = True
        elif arg.startswith('-keepname'):
            keep_name = True
        elif arg.startswith('-tolang:'):
            targetLang = arg[8:]
        elif arg.startswith('-tofamily:'):
            targetFamily = arg[10:]
        elif arg.startswith('-file'):
            if len(arg) == 5:
                filename = pywikibot.input(
                    u'Please enter the list\'s filename: ')
            else:
                filename = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(filename)
        else:
            pageTitle.append(arg)

    if not gen:
        # if the page title is given as a command line argument,
        # connect the title's parts with spaces
        if pageTitle != []:
            pageTitle = ' '.join(pageTitle)
            page = pywikibot.Page(pywikibot.Site(), pageTitle)
        # if no page title was given as an argument, and none was
        # read from a file, query the user
        if not page:
            pageTitle = pywikibot.input(u'Which page to check:')
            page = pywikibot.Page(pywikibot.Site(), pageTitle)
        # generator which will yield only a single Page
        gen = iter([page])

    if not targetLang and not targetFamily:
        targetSite = pywikibot.Site('commons', 'commons')
    else:
        if not targetLang:
            targetLang = pywikibot.Site().language()
        if not targetFamily:
            targetFamily = pywikibot.Site().family
        targetSite = pywikibot.Site(targetLang, targetFamily)

    bot = ImageTransferBot(gen, interwiki=interwiki, targetSite=targetSite,
                           keep_name=keep_name)
    bot.run()
def query(filename: str, params: QueryParams) -> None:
    site = pywikibot.Site()
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    limit = _validated_limit(params.limit, params.offset, len(pages))

    print('Downloading... offset={}, limit={}'.format(params.offset, limit))
    tc, uc = 0, 0
    for i in range(params.offset, params.offset + limit):
        p = pages[i]
        if p.pageid == 0:
            print("ERROR: Cannot fetch the page " + p.title())
            continue

        # onyshchak: create_if_not_exists - switch to enrich only existing data
        page_dir = _get_path(
            out_dir=params.out_dir + p.title(as_filename=True).rstrip('.'),
            create_if_not_exists=not params.only_update_cached_pages)
        if not page_dir.exists():
            continue
        if params.debug_info:
            print(i, page_dir)

        should_download_article = lambda path: (
            not path.exists()
            or stat(path).st_size == 0
            or params.invalidate_text_cache)
        text_path = page_dir / 'text.json'
        if should_download_article(text_path):
            if params.debug_info:
                print("Downloading text.json")
            page_json = json.dumps({
                "title": p.title(),
                "id": p.pageid,
                "url": p.full_url(),
                "text": p.text,
            })
            _dump(text_path, page_json)

        # downloading page images
        tc, uc = _img_download(p.imagelinks(), page_dir,
                               params.invalidate_img_cache, tc, uc)

    print('Downloaded {} images, where {} of them unavailable from commons'
          .format(tc, uc))
    _file_log(skipped_svg, 'logs/skipped_svg_{}.txt'.format(params.offset))
def update_meta_description(filename, out_dir, offset=0, limit=None):
    site = pywikibot.Site()
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    limit = _validated_limit(limit, offset, len(pages))

    for i in range(offset, offset + limit):
        p = pages[i]
        if p.pageid == 0:
            print("ERROR: Cannot fetch the page " + p.title())
            continue

        page_dir = _get_path(out_dir + p.title(as_filename=True).rstrip('.'),
                             create_if_not_exists=False)
        if not page_dir.exists():
            # onyshchak: temporary switch to enrich only existing data
            print('not page_dir.exists()', page_dir)
            continue
        print(i, p.title())

        img_dir = _get_path(page_dir / "img", create_if_not_exists=False)
        meta_path = img_dir / 'meta.json'
        meta = _getJSON(meta_path)
        updated = False
        for img in p.imagelinks():
            if not _valid_img_type(img.title(with_ns=False)):
                continue

            # index of this image's record in the cached metadata
            idx = next(idx for idx, x in enumerate(meta['img_meta'])
                       if x['title'] == img.title(with_ns=False))
            updated_description = _get_description(img)
            if updated_description != meta['img_meta'][idx]['description']:
                updated = True
                meta['img_meta'][idx]['description'] = updated_description
                print("DESCRIPTION", img_dir / meta['img_meta'][idx]['filename'])

        if updated:
            meta_json = json.dumps(meta)
            _dump(meta_path, meta_json)
def get_redirects(self):
    return [
        page.title()
        for page in pagegenerators.TextfilePageGenerator(site=self.site)
    ]
def main(*args):
    # the option that's always selected when the bot wonders what to do with
    # a link. If it's None, the user is prompted (default behaviour).
    always = None
    alternatives = []
    getAlternatives = True
    dnSkip = False
    generator = None
    pageTitle = None
    primary = False
    main_only = False

    # For sorting the linked pages, case can be ignored
    minimum = 0

    local_args = pywikibot.handleArgs(*args)
    for arg in local_args:
        if arg.startswith('-primary:'):
            primary = True
            getAlternatives = False
            alternatives.append(arg[9:])
        elif arg == '-primary':
            primary = True
        elif arg.startswith('-always:'):
            always = arg[8:]
        elif arg.startswith('-file'):
            if len(arg) == 5:
                generator = pagegenerators.TextfilePageGenerator(filename=None)
            else:
                generator = pagegenerators.TextfilePageGenerator(
                    filename=arg[6:])
        elif arg.startswith('-pos:'):
            if arg[5] != ':':
                mysite = pywikibot.Site()
                page = pywikibot.Page(pywikibot.Link(arg[5:], mysite))
                if page.exists():
                    alternatives.append(page.title())
                else:
                    answer = pywikibot.inputChoice(
                        u'Possibility %s does not actually exist. Use it '
                        u'anyway?' % page.title(),
                        ['yes', 'no'], ['y', 'N'], 'N')
                    if answer == 'y':
                        alternatives.append(page.title())
            else:
                alternatives.append(arg[5:])
        elif arg == '-just':
            getAlternatives = False
        elif arg == '-dnskip':
            dnSkip = True
        elif arg == '-main':
            main_only = True
        elif arg.startswith('-min:'):
            minimum = int(arg[5:])
        elif arg.startswith('-start'):
            try:
                if len(arg) <= len('-start:'):
                    generator = pagegenerators.CategorizedPageGenerator(
                        pywikibot.Site().disambcategory())
                else:
                    generator = pagegenerators.CategorizedPageGenerator(
                        pywikibot.Site().disambcategory(), start=arg[7:])
                generator = pagegenerators.NamespaceFilterPageGenerator(
                    generator, [0])
            except pywikibot.NoPage:
                pywikibot.output(
                    "Disambiguation category for your wiki is not known.")
                raise
        elif not pageTitle:
            pageTitle = arg

    site = pywikibot.Site()

    if pageTitle:
        page = pywikibot.Page(pywikibot.Link(pageTitle, site))
        generator = iter([page])

    if not generator:
        pywikibot.showHelp()
        return

    site.login()
    bot = DisambiguationRobot(always, alternatives, getAlternatives, dnSkip,
                              generator, primary, main_only, minimum=minimum)
    bot.run()
def test_lines(self):
    """Test TextfilePageGenerator with titles on separate lines."""
    filename = os.path.join(_data_dir, 'pagelist-lines.txt')
    site = self.get_site()
    titles = list(pagegenerators.TextfilePageGenerator(filename, site))
    self.assertPagelistTitles(titles, self.expected_titles[site.case()])
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    # the option that's always selected when the bot wonders what to do with
    # a link. If it's None, the user is prompted (default behaviour).
    always = None
    alternatives = []
    getAlternatives = True
    dnSkip = False
    generator = None
    pageTitle = None
    primary = False
    main_only = False

    # For sorting the linked pages, case can be ignored
    minimum = 0

    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if arg.startswith('-primary:'):
            primary = True
            getAlternatives = False
            alternatives.append(arg[9:])
        elif arg == '-primary':
            primary = True
        elif arg.startswith('-always:'):
            always = arg[8:]
        elif arg.startswith('-file'):
            if len(arg) == 5:
                generator = pagegenerators.TextfilePageGenerator(filename=None)
            else:
                generator = pagegenerators.TextfilePageGenerator(
                    filename=arg[6:])
        elif arg.startswith('-pos:'):
            if arg[5] != ':':
                mysite = pywikibot.Site()
                page = pywikibot.Page(pywikibot.Link(arg[5:], mysite))
                if page.exists():
                    alternatives.append(page.title())
                else:
                    if pywikibot.input_yn(
                            u'Possibility %s does not actually exist. Use it '
                            'anyway?' % page.title(),
                            default=False, automatic_quit=False):
                        alternatives.append(page.title())
            else:
                alternatives.append(arg[5:])
        elif arg == '-just':
            getAlternatives = False
        elif arg == '-dnskip':
            dnSkip = True
        elif arg == '-main':
            main_only = True
        elif arg.startswith('-min:'):
            minimum = int(arg[5:])
        elif arg.startswith('-start'):
            try:
                generator = pagegenerators.CategorizedPageGenerator(
                    pywikibot.Site().disambcategory(),
                    start=arg[7:], namespaces=[0])
            except pywikibot.NoPage:
                pywikibot.output(
                    "Disambiguation category for your wiki is not known.")
                raise
        elif not pageTitle:
            pageTitle = arg

    site = pywikibot.Site()

    if pageTitle:
        page = pywikibot.Page(pywikibot.Link(pageTitle, site))
        generator = iter([page])

    if not generator:
        pywikibot.showHelp()
        return

    site.login()
    bot = DisambiguationRobot(always, alternatives, getAlternatives, dnSkip,
                              generator, primary, main_only, minimum=minimum)
    bot.run()
def query_size(filename: str):
    site = pywikibot.Site()
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    return len(pages)
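query_size materializes the entire generator just to count titles. A sketch of a related helper that returns an offset/limit window of pages, mirroring the pattern query and update_meta_description follow via _validated_limit (the helper name and its clamping behaviour are assumptions, not part of the original code):

from typing import List, Optional

import pywikibot
from pywikibot import pagegenerators


def load_page_window(filename: str, offset: int = 0,
                     limit: Optional[int] = None) -> List[pywikibot.Page]:
    # Hypothetical helper: read the whole title file, then return only
    # the [offset, offset + limit) slice, clamped to the available pages.
    site = pywikibot.Site()
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    end = len(pages) if limit is None else min(offset + limit, len(pages))
    return pages[offset:end]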
def main(*args):
    # the option that's always selected when the bot wonders what to do with
    # a link. If it's None, the user is prompted (default behaviour).
    always = None
    alternatives = []
    getAlternatives = True
    dnSkip = False

    # if the -file argument is used, page titles are dumped in this array.
    # otherwise it will only contain one page.
    generator = None

    # This temporary array is used to read the page title if one single
    # page to work on is specified by the arguments.
    pageTitle = []
    primary = False
    main_only = False

    # For sorting the linked pages, case can be ignored
    minimum = 0

    for arg in pywikibot.handleArgs(*args):
        if arg.startswith('-primary:'):
            primary = True
            getAlternatives = False
            alternatives.append(arg[9:])
        elif arg == '-primary':
            primary = True
        elif arg.startswith('-always:'):
            always = arg[8:]
        elif arg.startswith('-file'):
            if len(arg) == 5:
                generator = pagegenerators.TextfilePageGenerator(filename=None)
            else:
                generator = pagegenerators.TextfilePageGenerator(
                    filename=arg[6:])
        elif arg.startswith('-pos:'):
            if arg[5] != ':':
                mysite = pywikibot.Site()
                page = pywikibot.Page(pywikibot.Link(arg[5:], mysite))
                if page.exists():
                    alternatives.append(page.title())
                else:
                    answer = pywikibot.inputChoice(
                        u'Possibility %s does not actually exist. Use it '
                        u'anyway?' % page.title(),
                        ['yes', 'no'], ['y', 'N'], 'N')
                    if answer == 'y':
                        alternatives.append(page.title())
            else:
                alternatives.append(arg[5:])
        elif arg == '-just':
            getAlternatives = False
        elif arg == '-dnskip':
            dnSkip = True
        elif arg == '-main':
            main_only = True
        elif arg.startswith('-min:'):
            minimum = int(arg[5:])
        elif arg.startswith('-start'):
            try:
                if len(arg) <= len('-start:'):
                    generator = pagegenerators.CategorizedPageGenerator(
                        pywikibot.Site().disambcategory())
                else:
                    generator = pagegenerators.CategorizedPageGenerator(
                        pywikibot.Site().disambcategory(), start=arg[7:])
                generator = pagegenerators.NamespaceFilterPageGenerator(
                    generator, [0])
            except pywikibot.NoPage:
                pywikibot.output(
                    "Disambiguation category for your wiki is not known.")
                raise
        elif arg.startswith("-"):
            pywikibot.output("Unrecognized command line argument: %s" % arg)
            # show help text and exit
            pywikibot.showHelp()
        else:
            pageTitle.append(arg)

    site = pywikibot.Site()
    site.login()

    # if the disambiguation page is given as a command line argument,
    # connect the title's parts with spaces
    if pageTitle != []:
        pageTitle = ' '.join(pageTitle)
        page = pywikibot.Page(pywikibot.Link(pageTitle, site))
        generator = iter([page])

    # if no disambiguation page was given as an argument, and none was
    # read from a file, query the user
    if not generator:
        pageTitle = pywikibot.input(
            u'On which disambiguation page do you want to work?')
        page = pywikibot.Page(pywikibot.Link(pageTitle, site))
        generator = iter([page])

    bot = DisambiguationRobot(always, alternatives, getAlternatives, dnSkip,
                              generator, primary, main_only, minimum=minimum)
    bot.run()
def query(filename: str, params: QueryParams) -> None:
    site = pywikibot.Site(code=params.language_code, fam='wikipedia',
                          user='******')
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    limit = _validated_limit(params.limit, params.offset, len(pages))
    icons: Set[str] = set()

    # TODO: don't execute driver when fill_captions=False
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    print('Downloading... offset={}, limit={}'.format(params.offset, limit))
    tc, uc = 0, 0
    for i in range(params.offset, params.offset + limit):
        p = pages[i]
        if p.pageid == 0:
            print("\nERROR: Cannot fetch the page " + p.title())
            continue

        # onyshchak: create_if_not_exists - switch to enrich only existing data
        page_dir = _get_path(
            out_dir=params.out_dir + p.title(as_filename=True).rstrip('.'),
            create_if_not_exists=not params.only_update_cached_pages)
        if not page_dir.exists():
            continue
        if params.debug_info:
            print('\n{}) {}'.format(i, page_dir))

        should_download_article = lambda path: (
            not path.exists()
            or stat(path).st_size == 0
            or params.invalidate_cache.text_cache)
        text_path = page_dir / 'text.json'
        if should_download_article(text_path):
            if params.debug_info:
                print("Downloading text.json")
            page_json = {
                "title": p.title(),
                "id": p.pageid,
                "url": p.full_url(),
            }
            if params.fill_property.text_wikitext:
                page_json["wikitext"] = p.text
            if params.fill_property.text_html:
                response = urllib.request.urlopen(p.full_url())
                page_json["html"] = response.read().decode("utf-8")
            _dump(text_path, page_json)

        # downloading page images
        tc, uc = _img_download(p.imagelinks(), page_dir, params, tc, uc)
        if params.fill_property.img_caption:
            _query_img_captions(
                page_dir=page_dir,
                driver=driver,
                icons=icons,
                language_code=params.language_code,
                invalidate_cache=params.invalidate_cache.caption_cache,
                debug_info=params.debug_info,
            )

    print('\nDownloaded {} images, where {} of them unavailable from commons'
          .format(tc, uc))
    driver.quit()

    icons_json = _getJSON(_KNOWN_ICONS_PATH)
    updated_icons = icons.union(icons_json['known_icons'])
    _dump(_KNOWN_ICONS_PATH, {"known_icons": list(updated_icons)})
def main():
    gen = None
    oldName = None
    options = {}
    fromToPairs = []

    # Process global args and prepare generator args parser
    local_args = pywikibot.handleArgs()
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-pairs'):
            if len(arg) == len('-pairs'):
                filename = pywikibot.input(
                    u'Enter the name of the file containing pairs:')
            else:
                filename = arg[len('-pairs:'):]
            oldName1 = None
            for page in pagegenerators.TextfilePageGenerator(filename):
                if oldName1:
                    fromToPairs.append([oldName1, page.title()])
                    oldName1 = None
                else:
                    oldName1 = page.title()
            if oldName1:
                pywikibot.warning(
                    u'file %s contains odd number of links' % filename)
        elif arg == '-noredirect':
            options['noredirect'] = True
        elif arg == '-notalkpage':
            options['movetalkpage'] = False
        elif arg == '-always':
            options['always'] = True
        elif arg == '-skipredirects':
            options['skipredirects'] = True
        elif arg.startswith('-from:'):
            if oldName:
                pywikibot.warning(u'-from:%s without -to:' % oldName)
            oldName = arg[len('-from:'):]
        elif arg.startswith('-to:'):
            if oldName:
                fromToPairs.append([oldName, arg[len('-to:'):]])
                oldName = None
            else:
                pywikibot.warning(u'%s without -from' % arg)
        elif arg.startswith('-prefix'):
            if len(arg) == len('-prefix'):
                options['prefix'] = pywikibot.input(u'Enter the prefix:')
            else:
                options['prefix'] = arg[8:]
        elif arg.startswith('-summary'):
            if len(arg) == len('-summary'):
                options['summary'] = pywikibot.input(u'Enter the summary:')
            else:
                options['summary'] = arg[9:]
        else:
            genFactory.handleArg(arg)

    if oldName:
        pywikibot.warning(u'-from:%s without -to:' % oldName)

    site = pywikibot.Site()
    for pair in fromToPairs:
        page = pywikibot.Page(site, pair[0])
        bot = MovePagesBot(None, **options)
        bot.moveOne(page, pair[1])

    if not gen:
        gen = genFactory.getCombinedGenerator()
    if gen:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = MovePagesBot(preloadingGen, **options)
        bot.run()
    elif not fromToPairs:
        pywikibot.showHelp()
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    """
    gen = None
    oldName = None
    options = {}
    fromToPairs = []

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    genFactory = pagegenerators.GeneratorFactory()

    for arg in local_args:
        if arg.startswith('-pairs'):
            if len(arg) == len('-pairs'):
                filename = pywikibot.input(
                    u'Enter the name of the file containing pairs:')
            else:
                filename = arg[len('-pairs:'):]
            oldName1 = None
            for page in pagegenerators.TextfilePageGenerator(filename):
                if oldName1:
                    fromToPairs.append([oldName1, page.title()])
                    oldName1 = None
                else:
                    oldName1 = page.title()
            if oldName1:
                pywikibot.warning(
                    u'file %s contains odd number of links' % filename)
        elif arg == '-noredirect':
            options['noredirect'] = True
        elif arg == '-notalkpage':
            options['movetalkpage'] = False
        elif arg == '-always':
            options['always'] = True
        elif arg == '-skipredirects':
            options['skipredirects'] = True
        elif arg.startswith('-from:'):
            if oldName:
                pywikibot.warning(u'-from:%s without -to:' % oldName)
            oldName = arg[len('-from:'):]
        elif arg.startswith('-to:'):
            if oldName:
                fromToPairs.append([oldName, arg[len('-to:'):]])
                oldName = None
            else:
                pywikibot.warning(u'%s without -from' % arg)
        elif arg.startswith('-prefix'):
            if len(arg) == len('-prefix'):
                options['prefix'] = pywikibot.input(u'Enter the prefix:')
            else:
                options['prefix'] = arg[8:]
        elif arg.startswith('-summary'):
            if len(arg) == len('-summary'):
                options['summary'] = pywikibot.input(u'Enter the summary:')
            else:
                options['summary'] = arg[9:]
        else:
            genFactory.handleArg(arg)

    if oldName:
        pywikibot.warning(u'-from:%s without -to:' % oldName)

    site = pywikibot.Site()
    for pair in fromToPairs:
        page = pywikibot.Page(site, pair[0])
        bot = MovePagesBot(None, **options)
        bot.moveOne(page, pair[1])

    if not gen:
        gen = genFactory.getCombinedGenerator()
    if gen:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = MovePagesBot(preloadingGen, **options)
        bot.run()
        return True
    else:
        # in theory pairs could be missing too
        pywikibot.bot.suggest_help(missing_generator=True)
        return False
def fetch_meta_captions(filename, out_dir, offset=0, limit=None):
    site = pywikibot.Site()
    pages = list(
        pagegenerators.TextfilePageGenerator(filename=filename, site=site))
    limit = _validated_limit(limit, offset, len(pages))

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    for j in range(offset, offset + limit):
        p = pages[j]
        if p.pageid == 0:
            print("ERROR: Cannot fetch the page " + p.title())
            continue

        page_dir = _get_path(out_dir + p.title(as_filename=True).rstrip('.'),
                             create_if_not_exists=False)
        if not page_dir.exists():
            # onyshchak: temporary switch to enrich only existing data
            print('not page_dir.exists()', page_dir)
            continue
        print(j, p.title())

        img_dir = _get_path(page_dir / "img", create_if_not_exists=False)
        meta_path = img_dir / 'meta.json'
        meta_arr = _getJSON(meta_path)['img_meta']

        page_id = p.title(as_filename=True).rstrip('.')
        for img in p.imagelinks():
            if not _valid_img_type(img.title(with_ns=False)):
                continue

            img_id = img.title(as_filename=True, with_ns=False)
            res = [
                i for i, x in enumerate(meta_arr)
                if unquote(x['url']).split('/wiki/File:')[-1] == img_id
            ]
            if len(res) != 1:
                print('WARNING: outdated page {}, missing image {}'.format(
                    page_id, img_id))
                continue

            i = res[0]
            url = 'https://en.wikipedia.org/wiki/{}#/media/File:{}'.format(
                page_id, img_id)
            driver.get(url)
            time.sleep(1)  # required for JS to load content
            caption = None
            for k in range(5):
                try:
                    caption = driver.find_element_by_class_name(
                        "mw-mmv-title").text
                    if caption == "":
                        caption = None
                        raise Exception
                except Exception:
                    time.sleep(1)  # required for JS to load content
                    print("RETRY", k, " ||| ", img_id)
                else:
                    break  # non-empty caption fetched; stop retrying

            meta_arr[i].pop('caption', None)
            if caption and caption != _remove_prefix(
                    meta_arr[i]['description'], "English: "):
                meta_arr[i]['caption'] = caption
                # print(j, img_id, ' ||| ', caption)

        _dump(meta_path, json.dumps({"img_meta": meta_arr}))

    driver.quit()
def main(*args) -> None:
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: str
    """
    oldName = None
    options = {}
    fromToPairs = []

    # Process global args and prepare generator args parser
    local_args = pywikibot.handle_args(args)
    genFactory = pagegenerators.GeneratorFactory()
    local_args = genFactory.handle_args(local_args)

    for arg in local_args:
        if arg.startswith('-pairsfile'):
            if len(arg) == len('-pairsfile'):
                filename = pywikibot.input(
                    'Enter the name of the file containing pairs:')
            else:
                filename = arg[len('-pairsfile:'):]
            oldName1 = None
            for page in pagegenerators.TextfilePageGenerator(filename):
                if oldName1:
                    fromToPairs.append([oldName1, page.title()])
                    oldName1 = None
                else:
                    oldName1 = page.title()
            if oldName1:
                pywikibot.warning(
                    'file {} contains odd number of links'.format(filename))
        elif arg == '-noredirect':
            options['noredirect'] = True
        elif arg == '-notalkpage':
            options['movetalkpage'] = False
        elif arg == '-always':
            options['always'] = True
        elif arg == '-skipredirects':
            options['skipredirects'] = True
        elif arg.startswith('-from:'):
            if oldName:
                pywikibot.warning('-from:{} without -to:'.format(oldName))
            oldName = arg[len('-from:'):]
        elif arg.startswith('-to:'):
            if oldName:
                fromToPairs.append([oldName, arg[len('-to:'):]])
                oldName = None
            else:
                pywikibot.warning('{} without -from'.format(arg))
        elif arg.startswith('-prefix'):
            if len(arg) == len('-prefix'):
                options['prefix'] = pywikibot.input('Enter the prefix:')
            else:
                options['prefix'] = arg[8:]
        elif arg.startswith('-summary'):
            if len(arg) == len('-summary'):
                options['summary'] = pywikibot.input('Enter the summary:')
            else:
                options['summary'] = arg[9:]

    if oldName:
        pywikibot.warning('-from:{} without -to:'.format(oldName))

    site = pywikibot.Site()
    for pair in fromToPairs:
        page = pywikibot.Page(site, pair[0])
        bot = MovePagesBot(**options)
        bot.moveOne(page, pair[1])

    gen = genFactory.getCombinedGenerator(preload=True)
    if gen:
        bot = MovePagesBot(generator=gen, **options)
        bot.run()
    elif not fromToPairs:
        pywikibot.bot.suggest_help(missing_generator=True)
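All three movepages variants above pair up consecutive titles read from the -pairs/-pairsfile file: odd entries are source titles, even entries are their targets. That pairing logic, pulled out as a standalone sketch (the function name is made-up):

import pywikibot
from pywikibot import pagegenerators


def read_move_pairs(filename):
    # Hypothetical helper mirroring the -pairsfile loop above: consecutive
    # titles from the file become (old, new) rename pairs.
    pairs = []
    old_name = None
    for page in pagegenerators.TextfilePageGenerator(filename):
        if old_name is None:
            old_name = page.title()
        else:
            pairs.append((old_name, page.title()))
            old_name = None
    if old_name is not None:
        pywikibot.warning(
            'file {} contains odd number of links'.format(filename))
    return pairs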
def get_redirects(self):
    return list(
        map(methodcaller('title'),
            pagegenerators.TextfilePageGenerator(site=self.site)))