Example #1
def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'}
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)
        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # Skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
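A minimal usage sketch for the function above. The config keys shown ('mwapi', 'namespaces') are the ones the function actually reads; the endpoint URL and the values are placeholders, not taken from the project's real command-line wiring.

# Hypothetical invocation of mwGetNamespacesAPI (sketch only).
config = {
    'mwapi': 'https://wiki.example.org/w/api.php',  # assumed API endpoint
    'namespaces': ['all'],                          # or an explicit list such as [0, 4]
}
namespaces, namespacenames = mwGetNamespacesAPI(config=config)
for ns in sorted(namespaces):
    print(ns, namespacenames[ns])   # e.g. 0 '', 1 'Talk', ...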
Example #2
def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['api']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, not overwriting\n')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json\n')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(url=config['api'], data={
                'action': 'query',
                'meta': 'siteinfo',
                'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                'sinumberingroup': 1,
                'format': 'json'})
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
                    'format': 'json'})
            # MediaWiki 1.8-1.10
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'], data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'general|namespaces',
                    'format': 'json'})
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))
Example #3
def mwGetNamespacesScraper(config={}):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """
    
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Allpages'})
        wikiteam.delay(config=config)

        # [^>]*? to include selected="selected"
        m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
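To see what the scraper's option regexp extracts, here is a self-contained sketch run against a fabricated fragment of the Special:AllPages namespace dropdown (the HTML is invented for illustration; the pattern is the one used above).

import re

sample = ('<option value="0" selected="selected">(Main)</option>'
          '<option value="4">Project</option>')
pattern = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>'
                     r'(?P<namespacename>[^<]+)</option>')
for match in pattern.finditer(sample):
    print(match.group('namespaceid'), match.group('namespacename'))
# 0 (Main)
# 4 Project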
Example #4
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
    """ Generates a XML dump for page titles """

    sys.stderr.write('Retrieving XML for every page from "%s"' %
                     (start or 'start'))
    header = mwGetXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(
        config=config), config['date'], config['curonly'] and 'current'
                                    or 'history')
    xmlfile = ''
    lock = True
    if start:
        sys.stderr.write(
            "Removing the last chunk of past XML dump: it is probably incomplete.\n"
        )
        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename),
                                  truncate=True):
            pass
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for pagetitle in mwGetPageTitles(config=config, start=start):
        if not pagetitle.strip():
            continue
        if pagetitle == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        wikiteam.delay(config=config)
        if c % 10 == 0:
            sys.stderr.write('Downloaded %d pages\n' % (c))
        try:
            for xml in getXMLPage(config=config, title=pagetitle):
                xml = cleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)'
                % (pagetitle))
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))
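A hedged sketch of how a run might be resumed with the function above. The config keys are those read in the body ('path', 'date', 'curonly') plus assumed entries for whatever wikiteam.domain2prefix() and mwGetPageTitles() need; none of this is taken from the project's actual command-line wiring.

# Hypothetical resume of an interrupted dump (sketch only).
config = {
    'path': './dump',                                 # existing dump directory
    'date': '20240101',                               # assumed date stamp format
    'curonly': False,                                 # full history, not only current revisions
    'mwapi': 'https://wiki.example.org/w/api.php',    # assumed, for mwGetPageTitles()
    'index': 'https://wiki.example.org/w/index.php',  # assumed, for domain2prefix()
}
# Truncates the trailing (probably incomplete) chunk, then continues
# appending pages starting from the given title.
mwGeneratePageDump(config=config, start='Some page title')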
Example #5
def mwSaveIndexPHP(config={}):
    """ Save index.php as .html, to preserve license details available at the botom of the page """

    if os.path.exists('%s/index.html' % (config['path'])):
        sys.stderr.write('index.html exists, not overwriting\n')
    else:
        sys.stderr.write('Downloading index.php (Main Page) as index.html\n')
        raw = wikiteam.getURL(url=config['index'], data={})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/index.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)
Example #7
def mwSaveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extensions details """

    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        sys.stderr.write('Special:Version.html exists, not overwriting\n')
    else:
        sys.stderr.write('Downloading Special:Version with extensions and other related info\n')
        raw = wikiteam.getURL(url=config['index'], data={'title': 'Special:Version'})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
            outfile.write(raw)
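The snapshot helpers above and mwSaveSiteInfo (Example #2) share the same pattern: skip if the file already exists, otherwise fetch once, strip IPs where applicable, and write into config['path']. A sketch of calling them together, with an assumed config (the 'index', 'api' and 'path' keys are the ones the functions read; the URLs are placeholders):

# Hypothetical snapshot step of a dump run (sketch only).
config = {
    'index': 'https://wiki.example.org/w/index.php',  # assumed index.php endpoint
    'api': 'https://wiki.example.org/w/api.php',      # assumed api.php endpoint
    'path': './dump',                                 # must already exist
}
mwSaveIndexPHP(config=config)        # -> ./dump/index.html
mwSaveSpecialVersion(config=config)  # -> ./dump/Special:Version.html
mwSaveSiteInfo(config=config)        # -> ./dump/siteinfo.json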
Example #8
def mwGeneratePageDump(config={}, pagetitles=None, start=None):
    """ Generates a XML dump for page titles """
    
    sys.stderr.write('Retrieving XML for every page from "%s"' % (start or 'start'))
    header = mwGetXMLHeader(config=config)
    footer = '</mediawiki>\n'  # new line at the end
    xmlfilename = '%s-%s-%s.xml' % (wikiteam.domain2prefix(config=config),
                                    config['date'],
                                    config['curonly'] and 'current' or 'history')
    xmlfile = ''
    lock = True
    if start:
        sys.stderr.write("Removing the last chunk of past XML dump: it is probably incomplete.\n")
        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
            pass
    else:
        # requested complete xml dump
        lock = False
        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
        xmlfile.write(header)
        xmlfile.close()

    xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
    c = 1
    for pagetitle in mwGetPageTitles(config=config, start=start):
        if not pagetitle.strip():
            continue
        if pagetitle == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        wikiteam.delay(config=config)
        if c % 10 == 0:
            sys.stderr.write('Downloaded %d pages\n' % (c))
        try:
            for xml in getXMLPage(config=config, title=pagetitle):
                xml = cleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config,
                text='The page "%s" was missing in the wiki (probably deleted)' %
                (pagetitle))
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        c += 1
    xmlfile.write(footer)
    xmlfile.close()
    sys.stderr.write('XML dump saved at... %s\n' % (xmlfilename))
Example #9
def mwSaveSpecialVersion(config={}):
    """ Save Special:Version as .html, to preserve extensions details """

    if os.path.exists('%s/Special:Version.html' % (config['path'])):
        sys.stderr.write('Special:Version.html exists, not overwriting\n')
    else:
        sys.stderr.write(
            'Downloading Special:Version with extensions and other related info\n'
        )
        raw = wikiteam.getURL(url=config['index'],
                              data={'title': 'Special:Version'})
        wikiteam.delay(config=config)
        raw = mwRemoveIP(raw=raw)
        with open('%s/Special:Version.html' % (config['path']),
                  'w') as outfile:
            outfile.write(raw)
Example #10
def mwSaveSiteInfo(config={}):
    """ Save a file with site info """

    if config['api']:
        if os.path.exists('%s/siteinfo.json' % (config['path'])):
            sys.stderr.write('siteinfo.json exists, not overwriting\n')
        else:
            sys.stderr.write('Downloading site info as siteinfo.json\n')

            # MediaWiki 1.13+
            raw = wikiteam.getURL(
                url=config['api'],
                data={
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop':
                    'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
                    'sinumberingroup': 1,
                    'format': 'json'
                })
            wikiteam.delay(config=config)
            # MediaWiki 1.11-1.12
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(
                    url=config['api'],
                    data={
                        'action': 'query',
                        'meta': 'siteinfo',
                        'siprop':
                        'general|namespaces|statistics|dbrepllag|interwikimap',
                        'format': 'json'
                    })
            # MediaWiki 1.8-1.10
            if 'query' not in wikiteam.getJSON(raw):
                raw = wikiteam.getURL(url=config['api'],
                                      data={
                                          'action': 'query',
                                          'meta': 'siteinfo',
                                          'siprop': 'general|namespaces',
                                          'format': 'json'
                                      })
            result = wikiteam.getJSON(raw)
            wikiteam.delay(config=config)
            with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
                outfile.write(json.dumps(result, indent=4, sort_keys=True))
Example #11
def mwGetNamespacesAPI(config={}):
    """ Uses the API to get the list of namespaces names and ids """
    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        data = {
            'action': 'query',
            'meta': 'siteinfo',
            'siprop': 'namespaces',
            'format': 'json'
        }
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        result = wikiteam.getJSON(r)
        wikiteam.delay(config=config)
        if 'all' in namespaces:
            namespaces = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:  # Skipping -1: Special, -2: Media
                    continue
                namespaces.append(int(i))
                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in result['query']['namespaces'].keys():
                if int(i) < 0:
                    continue
                if int(i) in namespaces:
                    namespaces2.append(int(i))
                    namespacenames[int(
                        i)] = result['query']['namespaces'][i]['*']
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
Example #12
def mwGetNamespacesScraper(config={}):
    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
    """ Function called if no API is available """

    namespaces = config['namespaces']
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        raw = wikiteam.getURL(url=config['index'],
                              data={'title': 'Special:Allpages'})
        wikiteam.delay(config=config)

        # [^>]*? to include selected="selected"
        m = re.compile(
            r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>'
        ).finditer(raw)
        if 'all' in namespaces:
            namespaces = []
            for i in m:
                namespaces.append(int(i.group("namespaceid")))
                namespacenames[int(
                    i.group("namespaceid"))] = i.group("namespacename")
        else:
            # check if those namespaces really exist in this wiki
            namespaces2 = []
            for i in m:
                if int(i.group("namespaceid")) in namespaces:
                    namespaces2.append(int(i.group("namespaceid")))
                    namespacenames[int(
                        i.group("namespaceid"))] = i.group("namespacename")
            namespaces = namespaces2
    else:
        namespaces = [0]

    namespaces = list(set(namespaces))  # uniques
    sys.stderr.write('%d namespaces found\n' % (len(namespaces)))
    return namespaces, namespacenames
Example #13
def mwGetPageTitlesScraper(config={}):
    """ Scrape list of page titles from Special:Allpages """
    
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(
        config=config)
    for namespace in namespaces:
        sys.stderr.write('    Retrieving titles in namespace %s\n' % (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&amp;from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia's Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')

                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to)  # do not put urllib.quote in fr or to
                # FIXME: does this regexp miss some subpages, or does r_title
                # fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &amp;namespace=\d, sometimes happens
                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reload dupe subpages links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after removed junk
                    sys.stderr.write('    Reading %s, %s bytes, %d subpages, %d pages\n' % (name, len(raw2),
                        len(re.findall(r_suballpages, raw2)),
                        len(re.findall(r_title, raw2))))

                wikiteam.delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write('    %d titles retrieved in the namespace %d\n' % (c, namespace))
    return pagetitles
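The three r_suballpages patterns above correspond to the pagination link formats that different MediaWiki versions emit on Special:Allpages. A quick sketch against fabricated link fragments shows which pattern fires on which format (the HTML snippets are invented; the regexps are copied from the function):

import re

samples = {
    'r_suballpages1': '&amp;from=Apple&amp;to=Banana">',
    'r_suballpages2': 'Special:Allpages/Apple">',
    'r_suballpages3': '&amp;from=Apple" title="Special:AllPages">',
}
patterns = {
    'r_suballpages1': r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">',
    'r_suballpages2': r'Special:Allpages/(?P<from>[^>]+)">',
    'r_suballpages3': r'&amp;from=(?P<from>[^>]+)" title="[^>]+">',
}
for name, fragment in samples.items():
    match = re.search(patterns[name], fragment)
    print(name, match.group('from'))
# r_suballpages1 Apple
# r_suballpages2 Apple
# r_suballpages3 Apple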
Example #14
def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(
        config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write('    Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write('    Retrieving page titles in namespace %d\n' % (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500}
            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write("Connection error: %s\n" % (str(err),))
                    retryCount += 1
                    time.sleep(20)
            #wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles[
                    'query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles[
                        'query-continue']['allpages']['apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']
            
            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                pagetitles.append(page['title'])
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

            wikiteam.delay(config=config)
        sys.stderr.write('    %d titles retrieved in namespace %d\n' % (c, namespace))
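Because mwGetPageTitlesAPI is a generator, titles can be streamed to disk as each 500-title batch arrives instead of being buffered in memory. A hedged consumption sketch (the config keys match those read above; the endpoint and values are assumptions):

# Hypothetical streaming consumption of the title generator (sketch only).
config = {
    'mwapi': 'https://wiki.example.org/w/api.php',  # assumed API endpoint
    'namespaces': ['all'],
    'exnamespaces': [],                             # namespace ids to skip
    'retries': 5,
}
with open('titles.txt', 'w') as titlesfile:
    for pagetitle in mwGetPageTitlesAPI(config=config):
        titlesfile.write(pagetitle + '\n')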
Example #15
def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """

    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500
        }
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        #handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages[
                    'query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages'][
                        'aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages'][
                        'aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    #to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(
                        re.sub('_', ' ',
                               url.split('/')[-3])).encode('ascii', 'ignore')
                else:
                    filename = urllib.parse.unquote(
                        re.sub('_', ' ',
                               url.split('/')[-1])).encode('ascii', 'ignore')
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In this case use allpages (in ns=6) as generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'
            }
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            #handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages[
                        'query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages'][
                            'gapfrom']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write('    Found 1 image\n')
    else:
        sys.stderr.write('    Found %d images\n' % (len(imagenames)))

    return imagenames
Example #16
def mwGetImageNamesScraper(config={}):
    """ Retrieve file list: filename, url, uploader """

    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;'
    imagenames = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # 5000 overloads some servers, but it is needed for sites like this with
        # no next links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data={
            'title': 'Special:Imagelist',
            'limit': limit,
            'offset': offset}
        raw = wikiteam.getURL(url=config['index'], data=data)
        #handleStatusCode(r)
        wikiteam.delay(config=config)
        # delicate wiki
        if re.search(r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
            if limit > 10:
                sys.stderr.write('Error: listing %d images in a chunk is not possible, trying tiny chunks\n' % (limit))
                limit = limit // 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                sys.stderr.write('Retrying...\n')
                continue
            else:
                sys.stderr.write('No more retries, exit...\n')
                break

        raw = mwCleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
            r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')

        # Select the regexp that returns more results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iter the image results
        for i in m:
            url = i.group('url')
            url = mwCurateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = wikiteam.undoHTMLEntities(text=filename)
            filename = urllib.parse.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = wikiteam.undoHTMLEntities(text=uploader)
            uploader = urllib.parse.unquote(uploader)
            imagenames.append([filename, url, uploader])

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with offset
            else:
                offset = ''
        else:
            offset = ''

    if len(imagenames) == 1:
        sys.stderr.write('    Found 1 image\n')
    else:
        sys.stderr.write('    Found %d images\n' % (len(imagenames)))

    imagenames.sort()
    return imagenames
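The r_next lookbehind above is what keeps the scraper walking forward through Special:Imagelist: the "previous page" link also carries an offset, but it is preceded by &amp;dir=prev and is therefore excluded. A small sketch on a fabricated listing page:

import re

r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;'
raw = ('<a href="?title=Special:Imagelist&amp;dir=prev&amp;offset=20240101000000&amp;limit=50">prev</a> '
       '<a href="?title=Special:Imagelist&amp;offset=20230101000000&amp;limit=50">next</a>')
print(re.findall(r_next, raw))  # ['20230101000000'] -- only the forward link's offset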
Example #17
def mwGetImageNamesAPI(config={}):
    """ Retrieve file list: filename, url, uploader """
    
    oldAPI = False
    aifrom = '!'
    imagenames = []
    while aifrom:
        sys.stderr.write('.')  # progress
        data = {
            'action': 'query',
            'list': 'allimages',
            'aiprop': 'url|user',
            'aifrom': aifrom,
            'format': 'json',
            'ailimit': 500}
        # FIXME Handle HTTP Errors HERE
        r = wikiteam.getURL(url=config['mwapi'], data=data)
        #handleStatusCode(r)
        jsonimages = wikiteam.getJSON(r)
        wikiteam.delay(config=config)

        if 'query' in jsonimages:
            aifrom = ''
            if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
                if 'aicontinue' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
                elif 'aifrom' in jsonimages['query-continue']['allimages']:
                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
            elif 'continue' in jsonimages:
                if 'aicontinue' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aicontinue']
                elif 'aifrom' in jsonimages['continue']:
                    aifrom = jsonimages['continue']['aifrom']
            # sys.stderr.write(aifrom)

            for image in jsonimages['query']['allimages']:
                url = image['url']
                url = mwCurateImageURL(config=config, url=url)
                # encoding to ascii is needed to work around this horrible bug:
                # http://bugs.python.org/issue8136
                if 'mwapi' in config and '.wikia.com' in config['mwapi']:
                    #to avoid latest?cb=20120816112532 in filenames
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')
                else:
                    filename = urllib.parse.unquote(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')
                uploader = re.sub('_', ' ', image['user'])
                imagenames.append([filename, url, uploader])
        else:
            oldAPI = True
            break

    if oldAPI:
        gapfrom = '!'
        imagenames = []
        while gapfrom:
            sys.stderr.write('.')  # progress
            # Some old APIs don't have the allimages query
            # In this case use allpages (in ns=6) as generator for imageinfo
            # Example:
            # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
            # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
            data = {
                'action': 'query',
                'generator': 'allpages',
                'gapnamespace': 6,
                'gaplimit': 500,
                'gapfrom': gapfrom,
                'prop': 'imageinfo',
                'iiprop': 'user|url',
                'format': 'json'}
            # FIXME Handle HTTP Errors HERE
            r = wikiteam.getURL(url=config['mwapi'], data=data)
            #handleStatusCode(r)
            jsonimages = wikiteam.getJSON(r)
            wikiteam.delay(config=config)

            if 'query' in jsonimages:
                gapfrom = ''
                if 'query-continue' in jsonimages and 'allpages' in jsonimages['query-continue']:
                    if 'gapfrom' in jsonimages['query-continue']['allpages']:
                        gapfrom = jsonimages['query-continue']['allpages']['gapfrom']

                for image, props in jsonimages['query']['pages'].items():
                    url = props['imageinfo'][0]['url']
                    url = mwCurateImageURL(config=config, url=url)
                    tmp_filename = ':'.join(props['title'].split(':')[1:])
                    filename = re.sub('_', ' ', tmp_filename)
                    uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                    imagenames.append([filename, url, uploader])
            else:
                # if the API doesn't return query data, then we're done
                break

    if len(imagenames) == 1:
        sys.stderr.write('    Found 1 image\n')
    else:
        sys.stderr.write('    Found %d images\n' % (len(imagenames)))

    return imagenames
Example #18
def mwGetImageNamesScraper(config={}):
    """ Retrieve file list: filename, url, uploader """

    # (?<! http://docs.python.org/library/re.html
    r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;'
    imagenames = []
    offset = '29990101000000'  # january 1, 2999
    limit = 5000
    retries = config['retries']
    while offset:
        # 5000 overloads some servers, but it is needed for sites like this with
        # no next links
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        data = {'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}
        raw = wikiteam.getURL(url=config['index'], data=data)
        #handleStatusCode(r)
        wikiteam.delay(config=config)
        # delicate wiki
        if re.search(
                r'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)',
                raw):
            if limit > 10:
                sys.stderr.write(
                    'Error: listing %d images in a chunk is not possible, trying tiny chunks\n'
                    % (limit))
                limit = limit // 10
                continue
            elif retries > 0:  # waste retries, then exit
                retries -= 1
                sys.stderr.write('Retrying...\n')
                continue
            else:
                sys.stderr.write('No more retries, exit...\n')
                break

        raw = mwCleanHTML(raw)
        # archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
        # wikanda 1.15.5 <td class="TablePager_col_img_user_text"><a
        # href="/w/index.php?title=Usuario:Fernandocg&amp;action=edit&amp;redlink=1"
        # class="new" title="Usuario:Fernandocg (página no
        # existe)">Fernandocg</a></td>
        r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # wikijuegos 1.9.5
        # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
        # mediawiki version
        r_images2 = r'(?im)<td class="TablePager_col_links"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+/[^>/]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # gentoowiki 1.18
        r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
        # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
        # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
        r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
        r_images5 = (
            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
            r'<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
            r'<td class="TablePager_col_img_size">[^<]*?</td>\s*'
            r'<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>'
        )

        # Select the regexp that returns more results
        regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
        count = 0
        i = 0
        regexp_best = 0
        for regexp in regexps:
            if len(re.findall(regexp, raw)) > count:
                count = len(re.findall(regexp, raw))
                regexp_best = i
            i += 1
        m = re.compile(regexps[regexp_best]).finditer(raw)

        # Iter the image results
        for i in m:
            url = i.group('url')
            url = mwCurateImageURL(config=config, url=url)
            filename = re.sub('_', ' ', i.group('filename'))
            filename = wikiteam.undoHTMLEntities(text=filename)
            filename = urllib.parse.unquote(filename)
            uploader = re.sub('_', ' ', i.group('uploader'))
            uploader = wikiteam.undoHTMLEntities(text=uploader)
            uploader = urllib.parse.unquote(uploader)
            imagenames.append([filename, url, uploader])

        if re.search(r_next, raw):
            new_offset = re.findall(r_next, raw)[0]
            # Avoid infinite loop
            if new_offset != offset:
                offset = new_offset
                retries += 5  # add more retries if we got a page with offset
            else:
                offset = ''
        else:
            offset = ''

    if len(imagenames) == 1:
        sys.stderr.write('    Found 1 image\n')
    else:
        sys.stderr.write('    Found %d images\n' % (len(imagenames)))

    imagenames.sort()
    return imagenames
Example #19
def mwGetPageTitlesScraper(config={}):
    """ Scrape list of page titles from Special:Allpages """

    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesScraper(config=config)
    for namespace in namespaces:
        sys.stderr.write('    Retrieving titles in namespace %s\n' %
                         (namespace))
        url = '%s?title=Special:Allpages&namespace=%s' % (config['index'],
                                                          namespace)
        raw = wikiteam.getURL(url=url)
        raw = mwCleanHTML(raw)

        r_title = r'title="(?P<title>[^>]+)">'
        r_suballpages = ''
        r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
        r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
        r_suballpages3 = r'&amp;from=(?P<from>[^>]+)" title="[^>]+">'
        if re.search(r_suballpages1, raw):
            r_suballpages = r_suballpages1
        elif re.search(r_suballpages2, raw):
            r_suballpages = r_suballpages2
        elif re.search(r_suballpages3, raw):
            r_suballpages = r_suballpages3
        else:
            pass  # perhaps no subpages

        # 3 is the current depth of English Wikipedia's Special:Allpages
        deep = 3
        c = 0
        checked_suballpages = []
        rawacum = raw
        while r_suballpages and re.search(r_suballpages, raw) and c < deep:
            # load sub-Allpages
            m = re.compile(r_suballpages).finditer(raw)
            for i in m:
                fr = i.group('from')

                if r_suballpages == r_suballpages1:
                    to = i.group('to')
                    name = '%s-%s' % (fr, to)
                    url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (
                        config['index'], namespace, fr, to
                    )  # do not put urllib.quote in fr or to
                # FIXME: does this regexp miss some subpages, or does r_title
                # fail on this kind of subpage? (wikiindex)
                elif r_suballpages == r_suballpages2:
                    # clean &amp;namespace=\d, sometimes happens
                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages/%s&namespace=%s' % (
                        config['index'], name, namespace)
                elif r_suballpages == r_suballpages3:
                    fr = fr.split('&amp;namespace=')[0]
                    name = fr
                    url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                        config['index'], name, namespace)

                if name not in checked_suballpages:
                    # to avoid reload dupe subpages links
                    checked_suballpages.append(name)
                    wikiteam.delay(config=config)
                    raw2 = wikiteam.getURL(url=url)
                    raw2 = mwCleanHTML(raw2)
                    rawacum += raw2  # merge it after removed junk
                    sys.stderr.write('    Reading %s, %s bytes, %d subpages, %d pages\n' % (name, len(raw2),
                        len(re.findall(r_suballpages, raw2)),
                        len(re.findall(r_title, raw2))))

                wikiteam.delay(config=config)
            c += 1

        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
            t = wikiteam.undoHTMLEntities(text=i.group('title'))
            if not t.startswith('Special:'):
                if t not in pagetitles:
                    pagetitles.append(t)
                    c += 1
        sys.stderr.write('    %d titles retrieved in the namespace %d\n' %
                         (c, namespace))
    return pagetitles
Example #20
def mwGetPageTitlesAPI(config={}):
    """ Uses the API to get the list of page titles """
    pagetitles = []
    namespaces, namespacenames = mwGetNamespacesAPI(config=config)
    for namespace in namespaces:
        if namespace in config['exnamespaces']:
            sys.stderr.write('    Skipping namespace = %d\n' % (namespace))
            continue

        c = 0
        sys.stderr.write('    Retrieving page titles in namespace %d\n' %
                         (namespace))
        apfrom = '!'
        while apfrom:
            sys.stderr.write('.')  # progress
            data = {
                'action': 'query',
                'list': 'allpages',
                'apnamespace': namespace,
                'apfrom': apfrom.encode('utf-8'),
                'format': 'json',
                'aplimit': 500
            }
            retryCount = 0
            while retryCount < config["retries"]:
                try:
                    r = wikiteam.getURL(url=config['mwapi'], data=data)
                    break
                except ConnectionError as err:
                    sys.stderr.write("Connection error: %s\n" % (str(err), ))
                    retryCount += 1
                    time.sleep(20)
            #wikiteam.handleStatusCode(r)
            # FIXME Handle HTTP errors here!
            jsontitles = wikiteam.getJSON(r)
            apfrom = ''
            if 'query-continue' in jsontitles and 'allpages' in jsontitles[
                    'query-continue']:
                if 'apcontinue' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages'][
                        'apcontinue']
                elif 'apfrom' in jsontitles['query-continue']['allpages']:
                    apfrom = jsontitles['query-continue']['allpages']['apfrom']
            elif 'continue' in jsontitles:
                if 'apcontinue' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apcontinue']
                elif 'apfrom' in jsontitles['continue']:
                    apfrom = jsontitles['continue']['apfrom']

            # sys.stderr.write(apfrom)
            # sys.stderr.write(jsontitles)
            allpages = jsontitles['query']['allpages']
            # Hack for old versions of MediaWiki API where result is dict
            if isinstance(allpages, dict):
                allpages = allpages.values()
            for page in allpages:
                pagetitles.append(page['title'])
                yield page['title']
            c += len(allpages)

            if len(pagetitles) != len(set(pagetitles)):
                # Are we in a loop? Server returning dupes, stop it
                sys.stderr.write('Probably a loop, finishing\n')
                apfrom = ''

            wikiteam.delay(config=config)
        sys.stderr.write('    %d titles retrieved in namespace %d\n' %
                         (c, namespace))