def ListToParam(list):
    """Convert a list of unicode strings into a UTF8 string separated by
    the '|' symbols.

    Accepted item types: strings, ints, wikipedia.Page objects, and
    objects whose class is named 'User'.  Because '|' is the list
    separator, a string item that itself contains '|' raises
    wikipedia.Error, as does any item of an unknown class.
    """
    list = ConvToList(list)
    if not list:
        return ''
    # Collect encoded items and join once at the end instead of repeated
    # string concatenation in the loop (which is quadratic) followed by
    # stripping a trailing '|'.
    encoded = []
    for item in list:
        if isinstance(item, basestring):
            # items may not contain the separator symbol '|'
            if u'|' in item:
                raise wikipedia.Error(u"item '%s' contains '|' symbol"
                                      % item)
            encoded.append(ToUtf8(item))
        elif isinstance(item, int):
            encoded.append(ToUtf8(item))
        elif isinstance(item, wikipedia.Page):
            encoded.append(ToUtf8(item.title()))
        elif item.__class__.__name__ == 'User':
            # delay loading this until it is needed
            # NOTE(review): nothing from userlib is referenced below;
            # the import may only matter for its side effects -- confirm.
            import userlib
            encoded.append(ToUtf8(item.name()))
        else:
            raise wikipedia.Error(u'unknown item class %s'
                                  % item.__class__.__name__)
    return u'|'.join(encoded)
def categoryAllElementsAPI(CatName, cmlimit=5000, categories_parsed=None,
                           site=None):
    # action=query&list=categorymembers&cmlimit=500&cmtitle=Category:License_tags
    """Load all the elements in a category (recursing into subcategories)
    using the API.  Limit: 5000 elements.

    Returns a (members, categories_parsed) tuple, where members is the
    list of raw API member dicts and categories_parsed the list of
    category titles already visited (used to avoid loops).
    """
    # BUG FIX: the original used a mutable default argument
    # (categories_parsed=[]), which is shared between *separate* calls,
    # so a second top-level call saw the first call's parse state.
    if categories_parsed is None:
        categories_parsed = []
    pywikibot.output("Loading %s..." % CatName)
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmlimit': cmlimit,
        'cmtitle': CatName,
    }
    data = query.GetData(params, site)
    categories_parsed.append(CatName)
    try:
        members = data['query']['categorymembers']
    except KeyError:
        if int(cmlimit) != 500:
            pywikibot.output(
                u'An Error occured, trying to reload the category.')
            # BUG FIX: the retry originally dropped categories_parsed and
            # site, so the retried call forgot which categories were
            # already visited and queried the wrong site.
            return categoryAllElementsAPI(
                CatName, cmlimit=500,
                categories_parsed=categories_parsed, site=site)
        else:
            raise pywikibot.Error(data)
    if len(members) == int(cmlimit):
        raise pywikibot.Error(
            u'The category selected has >= %s elements, limit reached.'
            % cmlimit)
    # allmembers deliberately aliases members: subcategory members appended
    # below are themselves scanned by the loop (guarded by
    # categories_parsed against revisiting).
    allmembers = members
    results = list()
    for subcat in members:
        if subcat['ns'] == 14:
            title = subcat['title']
            if title not in categories_parsed:
                categories_parsed.append(title)
                # BUG FIX: the recursive call originally dropped `site`.
                (results_part, categories_parsed) = categoryAllElementsAPI(
                    title, 5000, categories_parsed, site)
                allmembers.extend(results_part)
    # The original copied allmembers into results one element at a time
    # (binding unused ns/pageid/title locals); a single extend is the same.
    results.extend(allmembers)
    return (results, categories_parsed)
def query_api(self, host, path, **kwargs):
    """Send an API request to query.php (GET) or api.php (POST) and
    return the decoded JSON response.

    Raises ValueError for an unrecognized `path`, and wikipedia.Error
    when the API reply carries an 'error' member (DB connection errors
    are retried transparently).
    """
    data = urlencode([(k, v.encode('utf-8'))
                      for k, v in kwargs.iteritems()])
    if path.endswith('query.php'):
        query_string = '%s?format=json&%s' % (path, data)
        method = 'GET'
        data = ''
    elif path.endswith('api.php'):
        query_string = '%s?format=json' % path
        method = 'POST'
    else:
        # BUG FIX: the original said `repr(api)` but no name `api` exists
        # in this scope, so the error report itself raised NameError.
        raise ValueError('Unknown api %s' % repr(path))
    try:
        res = self.request(method, query_string,
                           {'Host': host,
                            'Content-Type':
                                'application/x-www-form-urlencoded'},
                           data)
    except httplib.ImproperConnectionState:
        # The keep-alive connection went stale: reconnect and retry.
        # BUG FIX: the original fell through after reconnecting, leaving
        # `res` unbound and raising NameError at json.load(res).
        self._conn.close()
        self.__init__(self.host)
        return self.query_api(host, path, **kwargs)
    try:
        data = json.load(res)
    finally:
        res.close()
    if 'error' in data:
        # Transient DB connection errors on the server side: retry.
        if data['error']['code'] == u'internal_api_error_DBConnectionError':
            return self.query_api(host, path, **kwargs)
        raise wikipedia.Error(data['error']['code'], data['error']['info'])
    return data
def ListToParam(list):
    """Convert a list of unicode strings into a UTF8 string separated by
    the '|' symbols.

    Raises wikipedia.Error if an item contains '|' (the list separator).
    """
    list = ConvToList(list)
    if not list:
        return ''
    for l in list:
        # BUG FIX: the original only tested `type(l) == str`, so a
        # *unicode* item containing '|' slipped through and silently
        # corrupted the parameter list (the sibling ListToParam variant
        # checks basestring; be consistent with it).
        if isinstance(l, basestring) and u'|' in l:
            raise wikipedia.Error("item '%s' contains '|' symbol" % l)
    # Join once instead of quadratic `+=` concatenation plus stripping
    # the trailing separator.
    return u'|'.join(ToUtf8(l) for l in list)
def getCategoryMembers(self, page, min=0, step=50):
    """Yield the members of the category `page` read from the database
    replica.

    Members that live in the category namespace (14) are yielded as
    catlib.Category objects, all other members as wikipedia.Page
    objects.  Raises wikipedia.Error when `page` is not a category.
    """
    # Guard clause: only pages in the category namespace make sense here.
    if page.namespace() != 14:
        raise wikipedia.Error(
            "%s is not in category namespace '%s'"
            % (page.__repr__(), page.site().category_namespace()))
    site = page.site()
    dbname = site.dbName()
    # cl_to holds the category title without namespace prefix.
    membership_query = """
SELECT page_namespace, page_title
FROM %s.categorylinks
LEFT JOIN %s.page
ON page_id = cl_from
WHERE cl_to=%%s
""" % (dbname, dbname)
    category_key = page.titleWithoutNamespace(True).encode('utf-8')
    for row in self._generate(membership_query, min, step, category_key):
        member_title = row['page_title'].decode('utf-8')
        if row['page_namespace'] == 14:
            yield catlib.Category(
                site, site.category_namespace() + ':' + member_title, site)
        else:
            yield wikipedia.Page(site, member_title, site,
                                 row['page_namespace'])
def main(*args):
    """Parse command-line arguments, build the page generator and the
    replacement list, and run a ReplaceRobot over the selected pages.

    Replacements come either as positional old/new argument pairs, from
    a predefined entry of `fixes.fixes` (-fix:...), or interactively.
    """
    add_cat = None
    gen = None
    # summary message
    summary_commandline = False
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title': [],
        'text-contains': [],
        'inside': [],
        'inside-tags': [],
        'require-title': [],  # using a separate requirements dict needs some
    }                         # major refactoring of code.
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Will become True if the user inputs the commandline parameter -dotall
    dotall = False
    # Will become True if the user inputs the commandline parameter -multiline
    multiline = False
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This is the maximum number of pages to load per query
    maxquerysize = 60
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    editSummary = pywikibot.translate(pywikibot.getSite(), msg)
    # Between a regex and another (using -fix) sleep some time (not to waste
    # too much CPU
    sleep = None
    # Do not save the page titles, rather work on wiki
    titlefile = None
    filename = None
    # If we save, primary behaviour is append rather then new file
    append = True

    # Read commandline parameters.
    for arg in pywikibot.handleArgs(*args):
        if arg == '-regex':
            regex = True
        elif arg.startswith('-xmlstart'):
            # len == 9 means the bare flag was given without ':value'.
            if len(arg) == 9:
                xmlStart = pywikibot.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = pywikibot.input(
                    u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(
                    pywikibot.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-savenew'):
            # -savenew truncates the title file instead of appending.
            append = False
            if len(arg) == 8:
                filename = pywikibot.input(
                    u'Please enter the filename to save the titles \n(will be deleted if exists):'
                )
            else:
                filename = arg[9:]
        elif arg.startswith('-save'):
            if len(arg) == 5:
                filename = pywikibot.input(
                    u'Please enter the filename to save the titles:')
            else:
                filename = arg[6:]
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-requiretitle:'):
            exceptions['require-title'].append(arg[14:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg == '-dotall':
            dotall = True
        elif arg == '-multiline':
            multiline = True
        elif arg.startswith('-addcat:'):
            add_cat = arg[8:]
        elif arg.startswith('-summary:'):
            editSummary = arg[9:]
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        elif arg.startswith('-query:'):
            maxquerysize = int(arg[7:])
        else:
            # Anything the shared generator factory does not understand is
            # treated as an old/new replacement pair element.
            if not genFactory.handleArg(arg):
                commandline_replacements.append(arg)

    # Replacements come in old/new pairs; an odd count is a usage error.
    if (len(commandline_replacements) % 2):
        raise pywikibot.Error, 'require even number of replacements.'
    elif (len(commandline_replacements) == 2 and fix is None):
        replacements.append(
            (commandline_replacements[0], commandline_replacements[1]))
        if not summary_commandline:
            editSummary = pywikibot.translate(pywikibot.getSite(), msg) % (
                ' (-%s +%s)' % (commandline_replacements[0],
                                commandline_replacements[1]))
    elif (len(commandline_replacements) > 1):
        if (fix is None):
            # Several pairs on the command line: collect them all.
            for i in xrange(0, len(commandline_replacements), 2):
                replacements.append((commandline_replacements[i],
                                     commandline_replacements[i + 1]))
            if not summary_commandline:
                pairs = [(commandline_replacements[i],
                          commandline_replacements[i + 1])
                         for i in range(0, len(commandline_replacements), 2)]
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                editSummary = pywikibot.translate(pywikibot.getSite(), msg) \
                    % replacementsDescription
        else:
            raise pywikibot.Error(
                'Specifying -fix with replacements is undefined')
    elif fix is None:
        # No pairs and no -fix: ask the user interactively for the
        # replacements, building a '(-old +new & ...)' summary suffix.
        old = pywikibot.input(
            u'Please enter the text that should be replaced:')
        new = pywikibot.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = pywikibot.input(
                u'Please enter another text that should be replaced, or press Enter to start:'
            )
            if old == '':
                change = change + ')'
                break
            new = pywikibot.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        if not summary_commandline:
            default_summary_message = pywikibot.translate(
                pywikibot.getSite(), msg) % change
            pywikibot.output(u'The summary message will default to: %s'
                             % default_summary_message)
            summary_message = pywikibot.input(
                u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:'
            )
            if summary_message == '':
                summary_message = default_summary_message
            editSummary = summary_message
    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            pywikibot.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
            return
        if "regex" in fix:
            regex = fix['regex']
        if "msg" in fix:
            editSummary = pywikibot.translate(pywikibot.getSite(),
                                              fix['msg'])
        if "exceptions" in fix:
            exceptions = fix['exceptions']
        if "nocase" in fix:
            caseInsensitive = fix['nocase']
        replacements = fix['replacements']

    # Set the regular expression flags
    flags = re.UNICODE
    if caseInsensitive:
        flags = flags | re.IGNORECASE
    if dotall:
        flags = flags | re.DOTALL
    if multiline:
        flags = flags | re.MULTILINE

    # Pre-compile all regular expressions here to save time later
    # (plain-text replacements are escaped first so they match literally).
    for i in range(len(replacements)):
        old, new = replacements[i]
        if not regex:
            old = re.escape(old)
        oldR = re.compile(old, flags)
        replacements[i] = oldR, new

    # Compile the text-based exception patterns too ('inside-tags' entries
    # are tag names, not patterns, so they are deliberately skipped here).
    for exceptionCategory in [
            'title', 'require-title', 'text-contains', 'inside']:
        if exceptionCategory in exceptions:
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            patterns = [re.compile(pattern, flags) for pattern in patterns]
            exceptions[exceptionCategory] = patterns

    if xmlFilename:
        # xmlStart is only bound if -xmlstart was given; probe for it.
        try:
            xmlStart
        except NameError:
            xmlStart = None
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
                                          replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join([
            "old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
            for (old, new) in replacements
        ])
        if exceptions:
            # NOTE(review): this iterates the *keys* of the exceptions
            # dict (strings such as 'title'), so `exc.pattern` would raise
            # AttributeError; it presumably should iterate the compiled
            # patterns inside the dict values -- confirm before relying on
            # -sql together with exception options.
            exceptClause = 'AND NOT (%s)' % ' OR '.join([
                "old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                for exc in exceptions
            ])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [pywikibot.Page(pywikibot.getSite(), PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)

    gen = genFactory.getCombinedGenerator(gen)
    if not gen:
        # syntax error, show help text from the top of this file
        pywikibot.showHelp('replace')
        return
    if xmlFilename:
        # XML parsing can be quite slow, so use smaller batches and
        # longer lookahead.
        preloadingGen = pagegenerators.PreloadingGenerator(gen,
                                                           pageNumber=20,
                                                           lookahead=100)
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(
            gen, pageNumber=maxquerysize)

    # Finally we open the file for page titles or set article to None
    if filename:
        try:
            # This opens in strict error mode, that means bot will stop
            # on encoding errors with ValueError.
            # See http://docs.python.org/library/codecs.html#codecs.open
            # The lambda maps append=True -> 'a', append=False -> 'w'.
            titlefile = codecs.open(filename, encoding='utf-8',
                                    mode=(lambda x: x and 'a' or 'w')(append))
        except IOError:
            pywikibot.output("%s cannot be opened for writing." % filename)
            return
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall,
                       allowoverlap, recursive, add_cat, sleep, editSummary,
                       titlefile)
    try:
        bot.run()
    finally:
        if titlefile:
            # Just for the spirit of programming (it was flushed)
            titlefile.close()
def replaceCategoryLinks(oldtext, new, site=None, addOnly=False):
    """
    Replace the category links given in the wikitext given in oldtext by
    the new links given in new.

    'new' should be a list of Category objects or strings
    which can be either the raw name or [[Category:..]].

    If addOnly is True, the old category won't be deleted and the
    category(s) given will be added (and so they won't replace anything).
    """
    # Find a marker that is not already in the text.  The marker records
    # where the old category block sat so the new one can go in the same
    # place before the marker is stripped again.
    marker = findmarker(oldtext)
    if site is None:
        site = pywikibot.getSite()
    # Hard refusal demanded by the German Wikipedia community: pages with
    # the Personendaten template place it after the categories, which this
    # function cannot handle.
    if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
        raise pywikibot.Error("""\
The PyWikipediaBot is no longer allowed to touch categories on the German
Wikipedia on pages that contain the Personendaten template because of the
non-standard placement of that template.
See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#Position_der_Personendaten_am_.22Artikelende.22
""")
    # Family-configured whitespace that should surround the category and
    # interwiki blocks.
    separator = site.family.category_text_separator
    iseparator = site.family.interwiki_text_separator
    separatorstripped = separator.strip()
    iseparatorstripped = iseparator.strip()
    if addOnly:
        s2 = oldtext
    else:
        # Strip the existing category links, leaving `marker` where the
        # first one used to be.
        s2 = removeCategoryLinksAndSeparator(oldtext, site=site,
                                             marker=marker,
                                             separator=separatorstripped)
    # Render the new category links as wikitext.
    s = categoryFormat(new, insite=site)
    if s:
        if site.language() in site.family.category_attop:
            newtext = s + separator + s2
        else:
            # calculate what was after the categories links on the page
            firstafter = s2.find(marker)
            if firstafter < 0:
                firstafter = len(s2)
            else:
                firstafter += len(marker)
            # Is there text in the 'after' part that means we should keep
            # it after?
            if "</noinclude>" in s2[firstafter:]:
                if separatorstripped:
                    s = separator + s
                newtext = (s2[:firstafter].replace(marker, '') + s +
                           s2[firstafter:])
            elif site.language() in site.family.categories_last:
                newtext = s2.replace(marker, '').strip() + separator + s
            else:
                # Default placement: categories go before the interwiki
                # links, so pull the interwikis out, append the categories,
                # then put the interwikis back at the very end.
                interwiki = getLanguageLinks(s2)
                s2 = removeLanguageLinksAndSeparator(
                    s2.replace(marker, ''), site, '',
                    iseparatorstripped) + separator + s
                newtext = replaceLanguageLinks(s2, interwiki, site=site,
                                               addOnly=True)
    else:
        # Nothing to add: just drop the marker left by the removal step.
        newtext = s2.replace(marker, '')
    return newtext.strip()