Example #1
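These snippets come from wikiteam's MediaWiki dump module and are shown without their file header. A sketch of the module-level imports they rely on (wikiteam here is the project's own helper module, assumed to provide getURL, getJSON and delay; mwCurateImageURL, used in Example #6, is defined elsewhere in the same module, and the exact import list in the real file may differ):

import json
import os
import re
import sys
import time
import urllib.parse

import wikiteam  # project helper module: getURL(), getJSON(), delay()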
def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['api']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, not overwriting\n')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json\n')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(url=config['api'], data={
                'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                'sinumberingroup': 1,
                'format': 'json'})
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                    'format': 'json'})
            # MediaWiki 1.8-1.10
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces',
                    'format': 'json'})
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))
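A minimal usage sketch, assuming a hand-built config (in the real tool this dict comes from wikiteam's command-line parser; 'api' and 'path' are the keys this function reads directly, and 'delay' is an assumed key consumed by wikiteam.delay):

config = {
    'api': 'https://example.org/w/api.php',  # hypothetical api.php endpoint
    'path': 'dump',                          # existing output directory
    'delay': 0,                              # assumed; read by wikiteam.delay
}
mwSaveSiteInfo(config=config)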
Example #2
def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'}
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)
        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # Skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
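The else branch above only keeps requested namespaces that actually exist on the wiki. A self-contained check of that filtering logic against a canned API response (hypothetical data; a real wiki returns many more namespaces and more fields per entry):

result = {'query': {'namespaces': {
    '-2': {'*': 'Media'}, '-1': {'*': 'Special'},
    '0': {'*': ''}, '1': {'*': 'Talk'}, '6': {'*': 'File'}}}}
requested = [0, 6, 99]  # 99 does not exist on this wiki, so it is dropped
kept = sorted(int(i) for i in result['query']['namespaces']
              if int(i) >= 0 and int(i) in requested)
assert kept == [0, 6]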
Example #5
def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(
        config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write('    Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write('    Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}
            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write("Connection error: %s\n" % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            else:
                # Every retry failed; bail out instead of using an unbound r below
                raise RuntimeError('Giving up after %d connection errors' % retryCount)
            #wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']
            
            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                title = page['title']
                pagetitles.append(title)  # track titles so the duplicate check below works
                yield title
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

            wikiteam.delay(config=config)
        sys.stderr.write('    %d titles retrieved in namespace %d\n' % (c, namespace))
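mwGetPageTitlesAPI is a generator, so titles stream out while the API is still being paged through. A minimal consumption sketch, assuming a hand-built config (the keys shown are the ones read by this function and by mwGetNamespacesAPI; 'delay' is assumed for wikiteam.delay):

config = {
    'mwapi': 'https://example.org/w/api.php',  # hypothetical endpoint
    'namespaces': [0],   # main namespace only
    'exnamespaces': [],  # exclude nothing
    'retries': 5,
    'delay': 0,          # assumed; read by wikiteam.delay
}
for pagetitle in mwGetPageTitlesAPI(config=config):
    print(pagetitle)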
Example #6
def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """
    
    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        #handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    #to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
                else:
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In that case, use allpages (in ns=6) as a generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            #handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write('    Found 1 image\n')
    else:
        sys.stderr.write('    Found %d images\n' % (len(imagenames)))

    return imagenames
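Both listing functions above have to cope with two continuation styles: MediaWiki up to 1.20 returns tokens under 'query-continue', while 1.21+ uses a top-level 'continue' block. A hypothetical helper (not part of wikiteam) expressing roughly the same lookup order as the inline code:

def getContinueToken(jsondata, module, newkey, oldkey):
    """Return the next continuation token, or '' when pagination is done."""
    legacy = jsondata.get('query-continue', {}).get(module, {})
    modern = jsondata.get('continue', {})
    for block in (legacy, modern):
        for key in (newkey, oldkey):
            if key in block:
                return block[key]
    return ''

# e.g.: aifrom = getContinueToken(jsonimages, 'allimages', 'aicontinue', 'aifrom')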