def update_family(families):
    if not families:
        families = familiesDict.keys()
    for family in families:
        wikipedia.output('Checking family %s:' % family)

        fam = wikipedia.Family(family)
        original = fam.languages_by_size
        obsolete = fam.obsolete

        url = 'http://s23.org/wikistats/%s' % familiesDict[family]
        uo = wikipedia.MyURLopener()
        f = uo.open(url)
        text = f.read()
        f.close()

        if family == 'wikipedia':
            p = re.compile(
                r"\[\[:([a-z\-]{2,}):\|\1\]\].*?'''([0-9,]+)'''</span>\]",
                re.DOTALL)
        else:
            p = re.compile(
                r"\[http://([a-z\-]{2,})\.%s\.org/wiki/ \1].*?'''([0-9,]+)'''\]"
                % family, re.DOTALL)

        new = []
        for lang, cnt in p.findall(text):
            if lang in obsolete or lang in exceptions:
                # Ignore this language
                continue
            new.append(lang)
        if original == new:
            wikipedia.output(u'The lists match!')
        else:
            wikipedia.output(u"The lists don't match, the new list is:")
            text = u'        self.languages_by_size = [\r\n'
            line = ' ' * 11
            for lang in new:
                if len(line) + len(lang) <= 76:
                    line += u" '%s'," % lang
                else:
                    text += u'%s\r\n' % line
                    line = ' ' * 11
                    line += u" '%s'," % lang
            text += u'%s\r\n' % line
            text += u'        ]'
            wikipedia.output(text)
            family_file_name = '../families/%s_family.py' % family
            family_file = codecs.open(family_file_name, 'r', 'utf8')
            family_text = family_file.read()
            family_file.close()
            old = re.findall(ur'(?msu)^ {8}self.languages_by_size.+?\]',
                             family_text)[0]
            family_text = family_text.replace(old, text)
            family_file = codecs.open(family_file_name, 'w', 'utf8')
            family_file.write(family_text)
            family_file.close()
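A minimal driver sketch for the function above, assuming the pywikipedia compat layer. The familiesDict values (the s23.org wikistats page names) and the exceptions list are hypothetical placeholders; they must match the real page names on that site.

import re
import codecs
import wikipedia  # the pywikipedia compat module

# Hypothetical mapping of family name -> s23.org wikistats page name.
familiesDict = {
    'wikipedia': 'wikipedias_wiki.php',
    'wiktionary': 'wiktionaries_wiki.php',
}
exceptions = []  # language codes to skip in addition to the obsolete ones

if __name__ == '__main__':
    # handleArgs() strips the global options and returns the rest, which
    # update_family() treats as family names (an empty list means all).
    update_family(wikipedia.handleArgs())
    wikipedia.stopme()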
Example 2
def join_family_data(reString, namespace):
    """Append the localized names of the given namespace to a regex fragment."""
    for s in pywikibot.Family().namespaces[namespace].itervalues():
        if isinstance(s, list):
            for e in s:
                reString += '|' + e
        else:
            reString += '|' + s
    return r'\s*(' + reString + r')\s*'
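A short usage sketch: namespace 14 is MediaWiki's category namespace, so the returned fragment matches 'Category' plus every localized alias known to the default family. The surrounding pattern and the sample text are illustrative, not taken from the source.

import re
import pywikibot

# Build a pattern matching category links in any of the family's languages.
# join_family_data() itself contributes the capturing group around the names.
categoryR = re.compile(
    r'\[\[' + join_family_data('Category', 14) + r':([^\]\|]+)\]\]')

sample = u'Text [[Category:Physics]] more text [[Kategorie:Physik]]'
for match in categoryR.finditer(sample):
    print match.group(2)  # group 1 is the namespace alias, group 2 the title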
Example 3
def check_and_update(families, update_main=False):
    for family in families:
        family = wikipedia.Family(family)
        result = family_check.check_family(family)
        update_family(family, result, update_main)
        if update_main:
            # Also update the shared family.py file
            update_family(None, result, update_main)
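Typical invocation, assuming the family_check helper module from the same script collection is importable. Note that update_family here takes three arguments and is a different helper from the single-argument function in the first snippet above.

# Check two families; also rewrite the defaults in the shared family.py.
check_and_update(['wikipedia', 'wiktionary'], update_main=True)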
Example 4
def getLanguageLinks(text,
                     insite=None,
                     pageLink="[[]]",
                     template_subpage=False):
    """
    Return a dict of interlanguage links found in text.

    Dict uses language codes as keys and Page objects as values.
    Do not call this routine directly, use Page.interwiki() method
    instead.

    """
    if insite is None:
        insite = pywikibot.getSite()
    fam = insite.family
    # when interwiki links forward to another family, retrieve pages and
    # other info from there
    if fam.interwiki_forward:
        fam = pywikibot.Family(fam.interwiki_forward)
    result = {}
    # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
    # and HTML comments
    tags = ['comments', 'nowiki', 'pre', 'source']
    if not template_subpage:
        tags += ['includeonly']
    text = removeDisabledParts(text, tags)

    # This regular expression will find every link that is possibly an
    # interwiki link.
    # NOTE: language codes are case-insensitive and consist only of basic
    # Latin letters and hyphens.
    # TODO: none of our current codes use them, but BCP 47 also allows
    #       digits and underscores.
    # TODO: there is no semantic difference between hyphens and
    #       underscores -> fold them.
    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    for lang, pagetitle in interwikiR.findall(text):
        lang = lang.lower()
        # Check if it really is in fact an interwiki link to a known
        # language, or if it's e.g. a category tag or an internal link
        if lang in fam.obsolete:
            lang = fam.obsolete[lang]
        if lang in fam.langs:
            if '|' in pagetitle:
                # ignore text after the pipe
                pagetitle = pagetitle[:pagetitle.index('|')]
            # we want the actual page objects rather than the titles
            site = pywikibot.getSite(code=lang, fam=fam)
            try:
                result[site] = pywikibot.Page(site, pagetitle, insite=insite)
            except pywikibot.InvalidTitle:
                pywikibot.output(u'[getLanguageLinks] Text contains invalid '
                                 u'interwiki link [[%s:%s]].' %
                                 (lang, pagetitle))
                continue
    return result
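A minimal sketch of a direct call (the docstring recommends going through Page.interwiki() instead). It assumes a configured pywikibot installation; the wikitext is made up.

sample = (u'Article text.\n'
          u'[[de:Beispiel]]\n'
          u'[[fr:Exemple]]\n'
          u'[[Category:Demo]]')  # not a language code, so it is ignored
links = getLanguageLinks(sample)
for site, page in links.iteritems():
    # Each key is a Site object, each value the linked Page on that site.
    print site.lang, page.title()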
Example 5
    def get(self, site, type, key=None, default=None):
        # This can probably also provide something for
        # localised settings, but then it first needs to
        # check whether the page is sysop only.
        if not key:
            key = str(site)

        self.lock.acquire()
        try:
            if type not in self.summaries:
                self.summaries[type] = {}
            if key in self.summaries[type]:
                if (time.time() - self.summaries[type][key][1]) < \
                        self.CommonsDelinker.config['summary_cache']:
                    # Return cached result
                    return self.summaries[type][key][0]

            output(u'%s Fetching new summary for %s' % (self, site))

            # FIXME: evil
            if self.CommonsDelinker.config['global']:
                self.check_user_page(site)
            page = wikipedia.Page(site, '%s%s' % \
                (self.CommonsDelinker.config['local_settings'], type))
            try:
                # Fetch the summary template, follow redirects
                i18n = page.get(get_redirect=True)
                self.summaries[type][key] = (i18n, time.time())
                return i18n
            except wikipedia.NoPage:
                pass
        finally:
            self.lock.release()

        # No i18n available, but it may be available in the wikipedia
        # of that language. Only do so for wiktionary, wikibooks,
        # wikiquote, wikisource, wikinews, wikiversity
        # This will cause the bot to function even on special wikis
        # like mediawiki.org and meta and species.
        output(u'%s Using default summary for %s' % (self, site))

        if default:
            return default

        if site.family.name != 'wikipedia' and self.CommonsDelinker.config[
                'global']:
            if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
                                    'wikisource', 'wikinews', 'wikiversity'):
                if site.lang in config.usernames['wikipedia']:
                    newsite = self.CommonsDelinker.get_site(
                        site.lang, wikipedia.Family('wikipedia'))
                    return self.get(newsite, type, key=key)
        return self.CommonsDelinker.config['default_settings'].get(type, '')
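The enclosing class is not shown in this snippet; assuming it is CommonsDelinker's summary cache, a call would look roughly like this. The type string and fallback text are illustrative.

# 'summaries' is an instance of the cache class above, 'site' a wikipedia.Site.
summary = summaries.get(site, 'summary-image-removed',
                        default=u'Removing image that was deleted on Commons')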
Example 6
def main():
    all = False
    language = None
    fam = None
    wikimedia = False
    for arg in pywikibot.handleArgs():
        if arg == '-all':
            all = True
        elif arg.startswith('-langs:'):
            language = arg[len('-langs:'):]
        elif arg.startswith('-families:'):
            fam = arg[len('-families:'):]
        elif arg == '-wikimedia':
            wikimedia = True

    mySite = pywikibot.getSite()
    if wikimedia:
        families = [
            'commons', 'incubator', 'mediawiki', 'meta', 'species', 'test',
            'wikibooks', 'wikidata', 'wikinews', 'wikipedia', 'wikiquote',
            'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary'
        ]
    elif fam is not None:
        families = fam.split(',')
    else:
        families = [
            mySite.family.name,
        ]

    for family in families:
        try:
            fam = pywikibot.Family(family)
        except ValueError:
            pywikibot.output(u'No such family %s' % family)
            continue
        if all:
            for lang in fam.langs.iterkeys():
                testSite(pywikibot.getSite(lang, family))
        elif language is None:
            lang = mySite.lang
            if lang not in fam.langs:
                lang = fam.langs.keys()[-1]
            testSite(pywikibot.getSite(lang, family))
        else:
            languages = language.split(',')
            for lang in languages:
                try:
                    testSite(pywikibot.getSite(lang, family))
                except pywikibot.NoSuchSite:
                    pywikibot.output(u'No such language %s in family %s' %
                                     (lang, family))
Example 7
    def __init__(self, limit=100,
                 mysql_default_server=3, mysql_host_prefix='sql-s',
                 mysql_host_suffix='', mysql_kwargs={}, no_db=False,
                 use_autoconn=False,
                 http_retry_timeout=30, http_max_retries=-1,
                 http_callback=lambda *args: None,
                 mysql_retry_timeout=60, mysql_max_retries=-1,
                 mysql_callback=lambda *args: None):

        self.http = None
        self.http_retry_timeout = http_retry_timeout
        self.http_max_retries = http_max_retries
        self.http_callback = http_callback

        if no_db:
            return

        self.mysql_host_prefix = mysql_host_prefix
        self.mysql_kwargs = mysql_kwargs.copy()  # do not mutate the caller's dict
        if 'host' in self.mysql_kwargs:
            del self.mysql_kwargs['host']
        self.use_autoconn = use_autoconn
        self.mysql_retry_timeout = mysql_retry_timeout
        self.mysql_max_retries = mysql_max_retries
        self.mysql_callback = mysql_callback

        self.connections = []

        # Mapping database name -> mysql connection
        self.databases = {}
        # Mapping server id -> mysql connection
        self.servers = {}
        # Mapping database name -> (lang, family)
        self.sites = {}

        self.domains = {}

        self.unknown_families = []
        # Mapping family name -> family object
        self.known_families = {}

        database, cursor = self.connect_mysql(
            mysql_host_prefix + str(mysql_default_server) + mysql_host_suffix)
        self.servers[mysql_default_server] = (database, cursor)

        # Find where the databases are located
        cursor.execute('SELECT dbname, domain, server FROM toolserver.wiki ORDER BY size DESC LIMIT %s', (limit, ))
        for dbname, domain, server in cursor.fetchall():
            if server not in self.servers:
                self.servers[server] = self.connect_mysql(mysql_host_prefix + str(server) + mysql_host_suffix)

            # FIXME: wikimediafoundation!
            # TODO: This is one big mess
            try:
                lang, fam = family(domain)
                if fam not in self.known_families:
                    self.known_families[fam] = wikipedia.Family(fam, fatal = False)
            except (RuntimeError, ValueError, SyntaxError):
                self.unknown_families.append(domain)
            else:
                self.sites[dbname] = (lang, fam)
                self.databases[dbname] = self.servers[server]

            self.domains[dbname] = domain
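An instantiation sketch, assuming this is the CheckUsage connection pool from the CommonsDelinker code base; the class name and the MySQL options are assumptions, not confirmed by the snippet.

# Open connections to the toolserver databases of the 50 largest wikis,
# reconnecting automatically when a connection drops.
pool = CheckUsage(limit=50, use_autoconn=True,
                  mysql_kwargs={'read_default_file': '~/.my.cnf'})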
Example 8
def check_family(family):
    # The function's opening lines were cut off in this snippet; the header
    # below is a plausible reconstruction inferred from the body and from the
    # __main__ block, which calls check_family(family).
    result = {}
    for lang in family.langs:
        if lang not in family.obsolete:
            site = wikipedia.getSite(lang, family)
            wikipedia.output(u'Checking %s' % site)
            namespaces = check_namespaces(site)
            if namespaces:
                for id, name, defined_namespace in namespaces:
                    try:
                        msg = u'Namespace %s for %s is ' \
                              + (u'[%s]. ' if len(name) > 1 else u'%s. ') \
                              + (u'[%s]' if len(defined_namespace) > 1 else u'%s') \
                              + u' is defined in family file.'
                        wikipedia.output(msg % (id, site, ', '.join(name),
                                                ', '.join(defined_namespace)))
                    except Exception:
                        # Ignore formatting problems and keep checking.
                        pass
                result[lang] = namespaces
    return result


if __name__ == '__main__':
    try:
        wikipedia.handleArgs()
        family = wikipedia.Family(wikipedia.default_family)
        result = check_family(family)
        wikipedia.output(u'\nWriting raw Python dictionary to stdout.')
        wikipedia.output(
            u'Format is: (namespace_id, namespace_name, predefined_namespace)')
        print
        print result
    finally:
        wikipedia.stopme()