import json
import os
import re
import sys
import time
import urllib.parse

import wikiteam


def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['mwapi']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, not overwriting\n')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json\n')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(
                url=config['mwapi'],
                data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                    'sinumberingroup': 1,
                    'format': 'json'})
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(
                    url=config['mwapi'],
                    data={
                        'action': 'query',
                        'meta': 'siteinfo',
                        'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                        'format': 'json'})
            # MediaWiki 1.8-1.10
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(
                    url=config['mwapi'],
                    data={
                        'action': 'query',
                        'meta': 'siteinfo',
                        'siprop': 'general|namespaces',
                        'format': 'json'})
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))
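# A minimal usage sketch for mwSaveSiteInfo(), kept commented out. The
# endpoint URL and dump path are placeholders, and the config keys (including
# 'delay') are assumed from how the functions in this module read them:
#
#     config = {
#         'mwapi': 'https://example.org/w/api.php',
#         'path': '/tmp/examplewiki',
#         'delay': 0,
#     }
#     os.makedirs(config['path'], exist_ok=True)
#     mwSaveSiteInfo(config=config)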
def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespaces names and ids """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {
            'action': 'query',
            'meta': 'siteinfo',
            'siprop': 'namespaces',
            'format': 'json'}
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # Skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
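# Sketch: fetching the namespace map with the hypothetical config above, plus
# a 'namespaces' key; ['all'] requests every non-negative namespace id:
#
#     config['namespaces'] = ['all']
#     namespaces, namespacenames = mwGetNamespacesAPI(config=config)
#     for nsid in sorted(namespaces):
#         sys.stderr.write('%d\t%s\n' % (nsid, namespacenames[nsid]))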
def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """

    pagetitles = []  # accumulated titles, used to detect servers stuck in a loop
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write(' Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write(' Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}

            retryCount = 0
            while retryCount < config['retries']:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write('Connection error: %s\n' % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            # wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']
            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)

            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                pagetitles.append(page['title'])
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

            wikiteam.delay(config=config)
        sys.stderr.write(' %d titles retrieved in namespace %d\n' % (c, namespace))
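# mwGetPageTitlesAPI() is a generator, so titles can be streamed straight to
# disk without holding the whole list in memory. A sketch, assuming config
# also carries the 'exnamespaces' and 'retries' keys read above:
#
#     config.update({'namespaces': ['all'], 'exnamespaces': [], 'retries': 5})
#     with open('%s/titles.txt' % (config['path']), 'w') as titles:
#         for pagetitle in mwGetPageTitlesAPI(config=config):
#             titles.write('%s\n' % (pagetitle))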
def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """

    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        # handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                # the round trip through ascii works around this horrible bug
                # on Python 2 (http://bugs.python.org/issue8136) while keeping
                # the filename a str, not bytes, on Python 3
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    # to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(
                        re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore').decode('ascii')
                else:
                    filename = urllib.parse.unquote(
                        re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore').decode('ascii')
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query;
            # in that case, use allpages (in namespace 6) as a generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            # handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write(' Found 1 image\n')
    else:
        sys.stderr.write(' Found %d images\n' % (len(imagenames)))

    return imagenames
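# Sketch: consuming the [filename, url, uploader] triples returned by
# mwGetImageNamesAPI() (the output format below is illustrative only):
#
#     for filename, url, uploader in mwGetImageNamesAPI(config=config):
#         sys.stderr.write('%s <%s> uploaded by %s\n' % (filename, url, uploader))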