Example #1
    def test_getPageTitles(self):
        # This test downloads the title list using both the API and index.php,
        # compares both lists by length and title by title, and checks for
        # some special titles (e.g. titles with odd characters).
        # The tested wikis come from different wikifarms; some are standalone.
        
        print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
        tests = [
            # Standalone wikis
            ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
            ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

            # Test old allpages API behaviour
            ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],

            # Test BOM encoding
            ['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'],
        ]
        
        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {'api': api, 'index': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
            getPageTitles(config=config_api, session=session)
            titles_api = './%s-%s-titles.txt' % (domain2prefix(config=config_api), config_api['date'])
            result_api = open(titles_api, 'r').read().splitlines()
            os.remove(titles_api)
            self.assertTrue(pagetocheck in result_api)
            
            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {'index': index, 'api': '', 'delay': 0, 'namespaces': ['all'], 'exnamespaces': [], 'date': datetime.datetime.now().strftime('%Y%m%d'), 'path': '.'}
            getPageTitles(config=config_index, session=session)
            titles_index = './%s-%s-titles.txt' % (domain2prefix(config=config_index), config_index['date'])
            result_index = open(titles_index, 'r').read().splitlines()
            os.remove(titles_index)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))
            
            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                self.assertEqual(pagename_api.decode('utf8'), result_index[c].decode('utf8'), u'{0} and {1} are different'.format(pagename_api.decode('utf8'), result_index[c].decode('utf8')))
                c += 1
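
The per-title loop at the end of Example #1 walks result_index with a manual counter. As a minimal alternative sketch (not part of the test), the same comparison can be written with zip; since zip stops at the shorter list, the separate length assertion is still needed:

# Sketch only: equivalent element-by-element comparison without the counter.
for title_api, title_index in zip(result_api, result_index):
    assert title_api.decode('utf8') == title_index.decode('utf8'), \
        u'{0} and {1} are different'.format(title_api.decode('utf8'),
                                            title_index.decode('utf8'))
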
Example #2
    def test_getPageTitles(self):
        # This test downloads the title list using both the API and index.php,
        # compares both lists by length and title by title, and checks for
        # some special titles (e.g. titles with odd characters).
        # The tested wikis come from different wikifarms; some are standalone.
        
        print '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
        tests = [
            # Standalone wikis
            ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'April Fools\' Day'],
            ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

            # Test old allpages API behaviour
            ['http://wiki.damirsystems.com/index.php', 'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'],

            # Gentoo wikifarm
            ['http://wiki.gentoo.org/index.php', 'http://wiki.gentoo.org/api.php', u'/usr move'],
        ]
        
        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {'api': api, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': []}
            result_api = getPageTitles(config=config_api, session=session)
            self.assertTrue(pagetocheck in result_api)
            
            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {'index': index, 'delay': 0, 'namespaces': ['all'], 'exnamespaces': []}
            result_index = getPageTitles(config=config_index, session=session)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))
            
            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                self.assertEqual(pagename_api, result_index[c], u'{0} and {1} are different'.format(pagename_api, result_index[c]))
                c += 1
    def test_getImages(self):
        # This test downloads the image list using both the API and index.php,
        # compares both lists by length and file by file, and checks for
        # some special files (e.g. filenames with odd characters).
        # The tested wikis come from different wikifarms; some are standalone.

        print '#' * 73, '\n', 'test_getImages', '\n', '#' * 73
        tests = [
            # Standalone wikis
            #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
            [
                'http://archiveteam.org/index.php',
                'http://archiveteam.org/api.php',
                u'Archive-is 2013-07-02 17-05-40.png'
            ],
            [
                'http://skilledtests.com/wiki/index.php',
                'http://skilledtests.com/wiki/api.php',
                u'Benham\'s disc (animated).gif'
            ],

            # Editthis wikifarm
            # It has a page view limit

            # Gamepedia wikifarm
            [
                'http://dawngate.gamepedia.com/index.php',
                'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'
            ],

            # Neoseeker wikifarm
            [
                'http://digimon.neoseeker.com/w/index.php',
                'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'
            ],

            # Orain wikifarm
            #['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'],

            # Referata wikifarm
            [
                'http://wikipapers.referata.com/w/index.php',
                'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'
            ],

            # ShoutWiki wikifarm
            [
                'http://commandos.shoutwiki.com/w/index.php',
                'http://commandos.shoutwiki.com/w/api.php',
                u'Night of the Wolves loading.png'
            ],

            # Wiki-site wikifarm
            [
                'http://minlingo.wiki-site.com/index.php',
                'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'
            ],

            # Wikkii wikifarm
            # It seems offline
        ]
        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, filetocheck in tests:
            # Testing with API
            print '\nTesting', api
            config_api = {'api': api, 'delay': 0}
            req = urllib2.Request(url=api,
                                  data=urllib.urlencode({
                                      'action': 'query',
                                      'meta': 'siteinfo',
                                      'siprop': 'statistics',
                                      'format': 'json'
                                  }),
                                  headers={'User-Agent': getUserAgent()})
            f = urllib2.urlopen(req)
            imagecount = int(
                json.loads(f.read())['query']['statistics']['images'])
            f.close()

            print 'Trying to parse', filetocheck, 'with API'
            result_api = getImageNames(config=config_api, session=session)
            self.assertEqual(len(result_api), imagecount)
            self.assertTrue(
                filetocheck in
                [filename for filename, url, uploader in result_api])

            # Testing with index
            print '\nTesting', index
            config_index = {'index': index, 'delay': 0}
            req = urllib2.Request(url=api,
                                  data=urllib.urlencode({
                                      'action': 'query',
                                      'meta': 'siteinfo',
                                      'siprop': 'statistics',
                                      'format': 'json'
                                  }),
                                  headers={'User-Agent': getUserAgent()})
            f = urllib2.urlopen(req)
            imagecount = int(
                json.loads(f.read())['query']['statistics']['images'])
            f.close()

            print 'Trying to parse', filetocheck, 'with index'
            result_index = getImageNames(config=config_index, session=session)
            #print 111, set([filename for filename, url, uploader in result_api]) - set([filename for filename, url, uploader in result_index])
            self.assertEqual(len(result_index), imagecount)
            self.assertTrue(
                filetocheck in
                [filename for filename, url, uploader in result_index])

            # Compare every image in both lists, with/without API
            c = 0
            for filename_api, url_api, uploader_api in result_api:
                self.assertEqual(
                    filename_api, result_index[c][0],
                    u'{0} and {1} are different'.format(
                        filename_api, result_index[c][0]))
                self.assertEqual(
                    url_api, result_index[c][1],
                    u'{0} and {1} are different'.format(
                        url_api, result_index[c][1]))
                self.assertEqual(
                    uploader_api, result_index[c][2],
                    u'{0} and {1} are different'.format(
                        uploader_api, result_index[c][2]))
                c += 1
    def test_getPageTitles(self):
        # This test downloads the title list using both the API and index.php,
        # compares both lists by length and title by title, and checks for
        # some special titles (e.g. titles with odd characters).
        # The tested wikis come from different wikifarms; some are standalone.

        print '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73
        tests = [
            # Standalone wikis
            [
                'http://archiveteam.org/index.php',
                'http://archiveteam.org/api.php', u'April Fools\' Day'
            ],
            [
                'http://skilledtests.com/wiki/index.php',
                'http://skilledtests.com/wiki/api.php',
                u'Conway\'s Game of Life'
            ],

            # Test old allpages API behaviour
            [
                'http://wiki.damirsystems.com/index.php',
                'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'
            ],
        ]

        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {
                'api': api,
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': []
            }
            result_api = getPageTitles(config=config_api, session=session)
            self.assertTrue(pagetocheck in result_api)

            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {
                'index': index,
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': []
            }
            result_index = getPageTitles(config=config_index, session=session)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))

            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                self.assertEqual(
                    pagename_api, result_index[c],
                    u'{0} and {1} are different'.format(
                        pagename_api, result_index[c]))
                c += 1
Example #5
def upload(wikis, config={}):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}

    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
        print "#"*73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                        # Simplistic quick&dirty check for the presence of this file in the item
                        stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                        dumphash = re.sub(' +.+\n?', '', stdout)

                        if dumphash in map(lambda x: x['md5'], item.files):
                            log(wiki, dump, 'verified')
                            rmline='rm -rf %s' % dump
                            if not os.system(rmline):
                                print 'DELETED ' + dump
                            print '%s was uploaded before, skipping...' % (dump)
                            continue
                        else:
                            print 'ERROR: The online item misses ' + dump
                            log(wiki, dump, 'missing')
                            # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists

            # Logo path
            logourl = ''

            if ismissingitem or config['update']:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                raw = ''
                try:
                    f = urllib.urlopen(baseurl)
                    raw = f.read()
                    f.close()
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'
                try:
                    logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
                except:
                    pass
                print logourl

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')

                wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype': 'web',
                    'collection': config['collection'],
                    'title': wikititle,
                    'description': wikidesc,
                    'language': lang,
                    'last-updated-date': wikidate_text,
                    'subject': '; '.join(wikikeys), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl': wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl),
                    'rights': wikirights,
                    'originalurl': wikiurl,
                }

            #Upload files and update metadata
            try:
                item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
                item.modify_metadata(md) # update
                print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
                if logourl:
                    logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
                    logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
                    logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
                    item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
            except:
                print wiki, dump, 'error when uploading?'

            c += 1
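
Example #5 verifies a previously uploaded dump by piping the file through the external md5sum binary and matching the digest against the 'md5' fields in item.files. A hedged sketch of the same check using only the standard library; the helper name local_md5 is ours, not from the script:

import hashlib

def local_md5(path, blocksize=1024 * 1024):
    # Sketch only: stream the dump through hashlib.md5 instead of shelling
    # out to md5sum; the hexdigest can then be compared against item.files
    # exactly as the subprocess-based check above does.
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            h.update(chunk)
    return h.hexdigest()
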
Example #6
def upload(wikis, config={}, uploadeddumps=[]):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}
    dumpdir = config.wikidump_dir

    filelist = os.listdir(dumpdir)
    for wiki in wikis:
        print "#" * 73
        print "# Uploading", wiki
        print "#" * 73
        wiki = wiki.lower()
        configtemp = config
        try:
            prefix = dumpgenerator.domain2prefix(config={'api': wiki})
        except KeyError:
            print "ERROR: could not produce the prefix for %s" % wiki
            continue
        config = configtemp

        wikiname = prefix.split('-')[0]
        dumps = []
        for f in filelist:
            if f.startswith('%s-' %
                            (wikiname)) and (f.endswith('-wikidump.7z')
                                             or f.endswith('-history.xml.7z')):
                print "%s found" % f
                dumps.append(f)
                # Re-introduce the break here if you only need to upload one file
                # and the I/O is too slow
                # break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)
            if dump in uploadeddumps:
                if config.prune_directories:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config.prune_wikidump and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    print "Checking content in previously uploaded files"
                    stdout, stderr = subprocess.Popen(
                        ["md5sum", dumpdir + '/' + dump],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    if dumphash in map(lambda x: x['md5'], item.files):
                        log(wiki, dump, 'verified', config)
                        rmline = 'rm -rf %s' % dumpdir + '/' + dump
                        if not os.system(rmline):
                            print 'DELETED ' + dumpdir + '/' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing', config)
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue
            else:
                print '%s was not uploaded before' % dump

            time.sleep(0.1)
            wikidate_text = wikidate[0:4] + '-' + wikidate[
                4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists

            # Logo path
            logourl = ''

            if ismissingitem or config.update:
                #get metadata from api.php
                #first sitename and base url
                params = {
                    'action': 'query',
                    'meta': 'siteinfo',
                    'format': 'xml'
                }
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req, timeout=10)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[
                        lang.lower()] or lang.lower()

                #now copyright info from API
                params = {
                    'action': 'query',
                    'siprop': 'general|rightsinfo',
                    'format': 'xml'
                }
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req, timeout=10)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"",
                                               xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                raw = ''
                try:
                    f = urllib2.urlopen(baseurl, timeout=10)
                    raw = f.read()
                    f.close()
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(
                            ur"<link rel=\"copyright\" href=\"([^\"]+)\" />",
                            raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(
                            ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'
                try:
                    logourl = re.findall(
                        ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)',
                        raw)[0]
                except:
                    pass

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename)  # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (
                    baseurl, sitename
                )  # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = [
                    'wiki', 'wikiteam', 'MediaWiki', sitename, wikiname
                ]  # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')

                wikilicenseurl = rightsinfourl  # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext  # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki  # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype':
                    'web',
                    'collection':
                    config.collection,
                    'title':
                    wikititle,
                    'description':
                    wikidesc,
                    'language':
                    lang,
                    'last-updated-date':
                    wikidate_text,
                    'subject':
                    '; '.join(
                        wikikeys
                    ),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl':
                    wikilicenseurl and urlparse.urljoin(wiki, wikilicenseurl),
                    'rights':
                    wikirights,
                    'originalurl':
                    wikiurl,
                }

            #Upload files and update metadata
            try:
                item.upload(dumpdir + '/' + dump,
                            metadata=md,
                            access_key=accesskey,
                            secret_key=secretkey,
                            verbose=True,
                            queue_derive=False)
                item.modify_metadata(md)  # update
                print 'You can find it in https://archive.org/details/wiki-%s' % (
                    wikiname)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok', config)
                if logourl:
                    logo = StringIO.StringIO(
                        urllib2.urlopen(urlparse.urljoin(wiki, logourl),
                                        timeout=10).read())
                    logoextension = logourl.split('.')[-1] if logourl.split(
                        '.') else 'unknown'
                    logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
                    item.upload(logo,
                                access_key=accesskey,
                                secret_key=secretkey,
                                verbose=True)
            except Exception as e:
                print wiki, dump, 'Error when uploading?'
                print e.message

            c += 1
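
Example #6 pulls the sitename, base URL, language and rights information out of the siteinfo XML with regular expressions. A hedged sketch of reading the same fields from the JSON siteinfo output instead (assuming the MediaWiki is recent enough to serve format=json; the helper name fetch_siteinfo is ours, not from the script):

import json
import urllib
import urllib2

def fetch_siteinfo(api, headers):
    # Sketch only: one request for general + rightsinfo, parsed as JSON
    # instead of running regexes over the XML body.
    data = urllib.urlencode({
        'action': 'query',
        'meta': 'siteinfo',
        'siprop': 'general|rightsinfo',
        'format': 'json',
    })
    req = urllib2.Request(url=api, data=data, headers=headers)
    result = json.loads(urllib2.urlopen(req, timeout=10).read())
    general = result.get('query', {}).get('general', {})
    rights = result.get('query', {}).get('rightsinfo', {})
    return (general.get('sitename', ''), general.get('base', ''),
            general.get('lang', ''), rights.get('url', ''),
            rights.get('text', ''))
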
Example #7
 def test_getImages(self):
     # This test downloads the image list using both the API and index.php,
     # compares both lists by length and file by file, and checks for
     # some special files (e.g. filenames with odd characters).
     # The tested wikis come from different wikifarms; some are standalone.
     
     print '#'*73, '\n', 'test_getImages', '\n', '#'*73
     tests = [
         # Standalone wikis
         #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
         ['http://archiveteam.org/index.php', 'http://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
         ['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
         
         # Editthis wikifarm
         # It has a page view limit
         
         # Gamepedia wikifarm
         ['http://dawngate.gamepedia.com/index.php', 'http://dawngate.gamepedia.com/api.php', u'Spell Vanquish.png'],
         
         # Gentoo wikifarm
         ['http://wiki.gentoo.org/index.php', 'http://wiki.gentoo.org/api.php', u'Openclonk screenshot1.png'],
         
         # Neoseeker wikifarm
         ['http://digimon.neoseeker.com/w/index.php', 'http://digimon.neoseeker.com/w/api.php', u'Ogremon card.png'],
         
         # Orain wikifarm
         #['http://mc.orain.org/w/index.php', 'http://mc.orain.org/w/api.php', u'Mojang logo.svg'],
         
         # Referata wikifarm
         ['http://wikipapers.referata.com/w/index.php', 'http://wikipapers.referata.com/w/api.php', u'Avbot logo.png'],
         
         # ShoutWiki wikifarm
         ['http://commandos.shoutwiki.com/w/index.php', 'http://commandos.shoutwiki.com/w/api.php', u'Night of the Wolves loading.png'],
         
         # Wiki-site wikifarm
         ['http://minlingo.wiki-site.com/index.php', 'http://minlingo.wiki-site.com/api.php', u'一 (書方灋ᅗᅩ).png'],
         
         # Wikkii wikifarm
         # It seems offline
     ]
     session = requests.Session()
     session.headers = {'User-Agent': getUserAgent()}
     for index, api, filetocheck in tests:
         # Testing with API
         print '\nTesting', api
         config_api = {'api': api, 'delay': 0}
         req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'statistics', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
         f = urllib2.urlopen(req)
         imagecount = int(json.loads(f.read())['query']['statistics']['images'])
         f.close()
         
         print 'Trying to parse', filetocheck, 'with API'
         result_api = getImageNames(config=config_api, session=session)
         self.assertEqual(len(result_api), imagecount)
         self.assertTrue(filetocheck in [filename for filename, url, uploader in result_api])
         
         # Testing with index
         print '\nTesting', index
         config_index = {'index': index, 'delay': 0}
         req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'statistics', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
         f = urllib2.urlopen(req)
         imagecount = int(json.loads(f.read())['query']['statistics']['images'])
         f.close()
 
         print 'Trying to parse', filetocheck, 'with index'
         result_index = getImageNames(config=config_index, session=session)
         #print 111, set([filename for filename, url, uploader in result_api]) - set([filename for filename, url, uploader in result_index])
         self.assertEqual(len(result_index), imagecount)
         self.assertTrue(filetocheck in [filename for filename, url, uploader in result_index])
         
         # Compare every image in both lists, with/without API
         c = 0
         for filename_api, url_api, uploader_api in result_api:
             self.assertEqual(filename_api, result_index[c][0], u'{0} and {1} are different'.format(filename_api, result_index[c][0]))
             self.assertEqual(url_api, result_index[c][1], u'{0} and {1} are different'.format(url_api, result_index[c][1]))
             self.assertEqual(uploader_api, result_index[c][2], u'{0} and {1} are different'.format(uploader_api, result_index[c][2]))
             c += 1
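
Both test_getImages variants fetch the expected image count with a hand-built urllib2 request even though a requests.Session is already open. A hedged sketch of the same siteinfo statistics call done through requests (the helper name get_image_count is ours, not from the tests):

import requests

def get_image_count(api, session=None):
    # Sketch only: ask MediaWiki for its image statistics and return the
    # count the tests compare against len(result_api) / len(result_index).
    session = session or requests.Session()
    r = session.get(api, params={
        'action': 'query',
        'meta': 'siteinfo',
        'siprop': 'statistics',
        'format': 'json',
    })
    return int(r.json()['query']['statistics']['images'])
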
Example #8
def upload(wikis, config={}):
    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
        print "#"*73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                        # Simplistic quick&dirty check for the presence of this file in the item
                        stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                        dumphash = re.sub(' +.+\n?', '', stdout)

                        headers = {'User-Agent': dumpgenerator.getUserAgent()}
                        itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
                        if re.search(dumphash, urllib2.urlopen(itemdata).read()):
                            log(wiki, dump, 'verified')
                            rmline='rm -rf %s' % dump
                            if not os.system(rmline):
                                print 'DELETED ' + dump
                            print '%s was uploaded before, skipping...' % (dump)
                            continue
                        else:
                            print 'ERROR: The online item misses ' + dump
                            log(wiki, dump, 'missing')
                            # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            headers = {'User-Agent': dumpgenerator.getUserAgent()}
            itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
            if urllib2.urlopen(itemdata).read() == '{}':
                ismissingitem = True
            else:
                ismissingitem = False

            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')

                wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            #creates curl command
            curl = ['curl', '--location',
                '--header', "'x-amz-auto-make-bucket:1'", # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
                '--header', "'x-archive-queue-derive:0'",
                '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
                '--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey),
            ]
            if c == 0:
                curl += ['--header', "'x-archive-meta-mediatype:web'",
                    '--header', "'x-archive-meta-collection:%s'" % (config['collection']),
                    '--header', quoteattr('x-archive-meta-title:' + wikititle),
                    '--header', "'x-archive-meta-description:%s'" % wikidesc.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-language:' + lang),
                    '--header', "'x-archive-meta-last-updated-date:%s'" % (wikidate_text),
                    '--header', "'x-archive-meta-subject:%s'" % ('; '.join(wikikeys)), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    '--header', quoteattr('x-archive-meta-licenseurl:' + wikilicenseurl),
                    '--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"),
                    '--header', quoteattr('x-archive-meta-originalurl:' + wikiurl),
                ]

            curl += ['--upload-file', "%s" % (dump),
                    "http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump), # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
                    '> /dev/null',
                    #FIXME: Must be NUL instead on Windows, how to make compatible?
            ]
            #now also to update the metadata
            #TODO: not needed for the second file in an item
            curlmeta = ['curl --silent',
                '--data-urlencode -target=metadata',
                """--data-urlencode -patch='{"replace":"/last-updated-date", "value":"%s"}'""" % (wikidate_text),
                '--data-urlencode access=' + accesskey,
                '--data-urlencode secret=' + secretkey,
                'http://archive.org/metadata/wiki-' + wikiname,
                '> /dev/null'
            ]
            curlline = ' '.join(curl)
            curlmetaline = ' '.join(curlmeta)
            if not os.system(curlline):
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
                if not ismissingitem:
                    os.system(curlmetaline)
            c += 1
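
Example #8 joins the curl arguments into one shell string for os.system, which is why the metadata headers need quoteattr and the escaped single quotes. A hedged alternative sketch (not from the script) that hands the argument list to subprocess directly, so no shell quoting is needed and the '> /dev/null' redirection becomes a discarded stdout:

import os
import subprocess

def run_curl(args):
    # Sketch only: run curl without a shell; header values are passed as-is,
    # so the surrounding "'...'" quoting used above would be dropped.
    # Returns curl's exit code, like os.system does for the joined string.
    with open(os.devnull, 'w') as devnull:
        return subprocess.call(args, stdout=devnull)
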
Example #9
    def test_getPageTitles(self):
        # This test downloads the title list using both the API and index.php,
        # compares both lists by length and title by title, and checks for
        # some special titles (e.g. titles with odd characters).
        # The tested wikis come from different wikifarms; some are standalone.

        print '\n', '#' * 73, '\n', 'test_getPageTitles', '\n', '#' * 73
        tests = [
            # Standalone wikis
            [
                'http://archiveteam.org/index.php',
                'http://archiveteam.org/api.php', u'April Fools\' Day'
            ],
            [
                'http://skilledtests.com/wiki/index.php',
                'http://skilledtests.com/wiki/api.php',
                u'Conway\'s Game of Life'
            ],

            # Test old allpages API behaviour
            [
                'http://wiki.damirsystems.com/index.php',
                'http://wiki.damirsystems.com/api.php', 'SQL Server Tips'
            ],

            # Test BOM encoding
            #['http://www.libreidea.org/w/index.php', 'http://www.libreidea.org/w/api.php', 'Main Page'],
        ]

        session = requests.Session()
        session.headers = {'User-Agent': getUserAgent()}
        for index, api, pagetocheck in tests:
            # Testing with API
            print '\nTesting', api
            print 'Trying to parse', pagetocheck, 'with API'
            config_api = {
                'api': api,
                'index': '',
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': [],
                'date': datetime.datetime.now().strftime('%Y%m%d'),
                'path': '.',
                'retries': 5,
            }

            titles_api = getPageTitles(config=config_api, session=session)
            result_api = open(titles_api,
                              'r').read().decode('utf8').splitlines()
            os.remove(titles_api)
            self.assertTrue(pagetocheck in result_api)

            # Testing with index
            print 'Testing', index
            print 'Trying to parse', pagetocheck, 'with index'
            config_index = {
                'index': index,
                'api': '',
                'delay': 0,
                'namespaces': ['all'],
                'exnamespaces': [],
                'date': datetime.datetime.now().strftime('%Y%m%d'),
                'path': '.',
                'retries': 5
            }

            titles_index = getPageTitles(config=config_index, session=session)
            result_index = open(titles_index,
                                'r').read().decode('utf8').splitlines()
            os.remove(titles_index)
            self.assertTrue(pagetocheck in result_index)
            self.assertEqual(len(result_api), len(result_index))

            # Compare every page in both lists, with/without API
            c = 0
            for pagename_api in result_api:
                chk = pagename_api in result_index
                self.assertEqual(chk, True,
                                 u'%s not in result_index' % (pagename_api))
                c += 1
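
The final loop of Example #9 checks pagename_api in result_index for every title, which is quadratic for large wikis. A minimal sketch (not part of the test) that builds a set of the index titles once and keeps each membership check constant-time:

# Sketch only: linear-time membership check over the decoded title lists.
index_titles = set(result_index)
for pagename_api in result_api:
    assert pagename_api in index_titles, u'%s not in result_index' % (pagename_api)
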
Example #10
def upload(wikis, config={}):
    for wiki in wikis:
        print "#" * 73
        print "# Uploading", wiki
        print "#" * 73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith(
                            '%s-' %
                        (wikiname)) and (f.endswith('-wikidump.7z')
                                         or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline = 'rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick&dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(
                        ["md5sum", dump],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    headers = {'User-Agent': dumpgenerator.getUserAgent()}
                    itemdata = urllib2.Request(
                        url='http://archive.org/metadata/wiki-' + wikiname,
                        headers=headers)
                    if re.search(dumphash, urllib2.urlopen(itemdata).read()):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: The online item misses ' + dump
                        log(wiki, dump, 'missing')
                        # We'll exit this if and go upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4] + '-' + wikidate[
                4:6] + '-' + wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            headers = {'User-Agent': dumpgenerator.getUserAgent()}
            itemdata = urllib2.Request(
                url='http://archive.org/metadata/wiki-' + wikiname,
                headers=headers)
            if urllib2.urlopen(itemdata).read() == '{}':
                ismissingitem = True
            else:
                ismissingitem = False

            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {
                    'action': 'query',
                    'meta': 'siteinfo',
                    'format': 'xml'
                }
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.has_key(lang.lower()) and convertlang[
                        lang.lower()] or lang.lower()

                #now copyright info from API
                params = {
                    'action': 'query',
                    'siprop': 'general|rightsinfo',
                    'format': 'xml'
                }
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"",
                                               xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(
                            ur"<link rel=\"copyright\" href=\"([^\"]+)\" />",
                            raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(
                            ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'
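                # The two patterns above match the <link rel="copyright"> tag and the
                # <li id="copyright"> footer entry that stock MediaWiki skins emit,
                # so this is only a best-effort fallback when the API has no rights info.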

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename)  # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (
                    baseurl, sitename
                )  # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = [
                    'wiki', 'wikiteam', 'MediaWiki', sitename, wikiname
                ]  # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')

                wikilicenseurl = rightsinfourl  # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext  # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki  # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            # Build the curl command
            curl = [
                'curl',
                '--location',
                '--header',
                "'x-amz-auto-make-bucket:1'",  # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
                '--header',
                "'x-archive-queue-derive:0'",
                '--header',
                "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
                '--header',
                "'authorization: LOW %s:%s'" % (accesskey, secretkey),
            ]
            if c == 0:
                curl += [
                    '--header',
                    "'x-archive-meta-mediatype:web'",
                    '--header',
                    "'x-archive-meta-collection:%s'" % (config['collection']),
                    '--header',
                    quoteattr('x-archive-meta-title:' + wikititle),
                    '--header',
                    "'x-archive-meta-description:%s'" %
                    wikidesc.replace("'", r"\'"),
                    '--header',
                    quoteattr('x-archive-meta-language:' + lang),
                    '--header',
                    "'x-archive-meta-last-updated-date:%s'" % (wikidate_text),
                    '--header',
                    "'x-archive-meta-subject:%s'" % (
                        '; '.join(wikikeys)
                    ),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    '--header',
                    quoteattr('x-archive-meta-licenseurl:' + wikilicenseurl),
                    '--header',
                    "'x-archive-meta-rights:%s'" %
                    wikirights.replace("'", r"\'"),
                    '--header',
                    quoteattr('x-archive-meta-originalurl:' + wikiurl),
                ]

            curl += [
                '--upload-file',
                "%s" % (dump),
                "http://s3.us.archive.org/wiki-%s/%s" % (
                    wikiname, dump
                ),  # It can happen that the identifier is already taken by another user; only wikiteam collection admins can upload more files to it, so curl fails immediately with a permissions error from S3.
                '> /dev/null',
                #FIXME: Must be NUL instead on Windows, how to make compatible?
            ]
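            # For illustration only, the joined command ends up looking roughly like
            # (identifier and filename here are placeholders, not from this script):
            #   curl --location --header '...' --upload-file examplewiki-20140101-history.xml.7z \
            #        http://s3.us.archive.org/wiki-examplewiki/examplewiki-20140101-history.xml.7z > /dev/null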
            # Now also update the item metadata
            #TODO: not needed for the second file in an item
            curlmeta = [
                'curl --silent', '--data-urlencode -target=metadata',
                """--data-urlencode -patch='{"replace":"/last-updated-date", "value":"%s"}'"""
                % (wikidate_text), '--data-urlencode access=' + accesskey,
                '--data-urlencode secret=' + secretkey,
                'http://archive.org/metadata/wiki-' + wikiname, '> /dev/null'
            ]
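            # curlmeta POSTs to the archive.org metadata write API, which accepts
            # -target/-patch form fields (a JSON-Patch style "replace") plus the same
            # S3 access/secret pair, to refresh last-updated-date on later uploads.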
            curlline = ' '.join(curl)
            curlmetaline = ' '.join(curlmeta)
            if not os.system(curlline):
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
                if not ismissingitem:
                    os.system(curlmetaline)
            c += 1
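
The curl invocation assembled above boils down to a single HTTP PUT against archive.org's S3-like (IAS3) endpoint, with the item metadata passed as x-archive-meta-* headers. As a rough sketch only (identifier, filename and keys are placeholders, not values from the script above), the same upload could be made with the requests library:

# Minimal sketch of the raw IAS3 PUT behind the curl command; all names are placeholders.
import requests

headers = {
    'x-amz-auto-make-bucket': '1',   # create the item on first upload
    'x-archive-queue-derive': '0',   # skip archive.org's derive step
    'authorization': 'LOW MYACCESSKEY:MYSECRETKEY',
    'x-archive-meta-mediatype': 'web',
    'x-archive-meta-title': 'Wiki - Example',
}
with open('examplewiki-20140101-history.xml.7z', 'rb') as f:
    r = requests.put('http://s3.us.archive.org/wiki-examplewiki/examplewiki-20140101-history.xml.7z',
                     data=f, headers=headers)
print r.status_code  # 200 means the file was accepted
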
Beispiel #11
0
def upload(wikis, config={}):
    headers = {'User-Agent': dumpgenerator.getUserAgent()}

    for wiki in wikis:
        print "#"*73
        print "# Uploading", wiki
        print "#"*73
        wiki = wiki.lower()
        prefix = dumpgenerator.domain2prefix(config={'api': wiki})

        wikiname = prefix.split('-')[0]
        dumps = []
        for dirname, dirnames, filenames in os.walk('.'):
            if dirname == '.':
                for f in filenames:
                    if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
                        dumps.append(f)
                break

        c = 0
        for dump in dumps:
            wikidate = dump.split('-')[1]
            item = get_item('wiki-' + wikiname)

            if dump in uploadeddumps:
                if config['prune-directories']:
                    rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
                    # With -f the deletion might have happened before and we won't know
                    if not os.system(rmline):
                        print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
                if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
                    # Simplistic quick & dirty check for the presence of this file in the item
                    stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
                    dumphash = re.sub(' +.+\n?', '', stdout)

                    if dumphash in map(lambda x: x['md5'], item.files):
                        log(wiki, dump, 'verified')
                        rmline = 'rm -rf %s' % dump
                        if not os.system(rmline):
                            print 'DELETED ' + dump
                        print '%s was uploaded before, skipping...' % (dump)
                        continue
                    else:
                        print 'ERROR: the online item is missing ' + dump
                        log(wiki, dump, 'missing')
                        # Fall through and go on to upload the dump
                else:
                    print '%s was uploaded before, skipping...' % (dump)
                    continue

            time.sleep(0.1)
            wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
            print wiki, wikiname, wikidate, dump

            # Does the item exist already?
            ismissingitem = not item.exists
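            # item.exists comes from the internetarchive library and replaces the
            # manual '{}' check against the metadata endpoint used in the curl-based
            # example above.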

            # We don't know a way to fix/overwrite metadata if item exists already:
            # just pass bogus data and save some time
            if ismissingitem:
                #get metadata from api.php
                #first sitename and base url
                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                sitename = ''
                baseurl = ''
                lang = ''
                try:
                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
                except:
                    pass
                try:
                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                if not sitename:
                    sitename = wikiname
                if not baseurl:
                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
                if lang:
                    lang = convertlang.get(lang.lower(), lang.lower())

                #now copyright info from API
                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
                data = urllib.urlencode(params)
                req = urllib2.Request(url=wiki, data=data, headers=headers)
                xml = ''
                try:
                    f = urllib2.urlopen(req)
                    xml = f.read()
                    f.close()
                except:
                    pass

                rightsinfourl = ''
                rightsinfotext = ''
                try:
                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                except:
                    pass

                #or copyright info from #footer in mainpage
                if baseurl and not rightsinfourl and not rightsinfotext:
                    raw = ''
                    try:
                        f = urllib.urlopen(baseurl)
                        raw = f.read()
                        f.close()
                    except:
                        pass
                    rightsinfotext = ''
                    rightsinfourl = ''
                    try:
                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
                    except:
                        pass
                    try:
                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
                    except:
                        pass
                    if rightsinfotext and not rightsinfourl:
                        rightsinfourl = baseurl + '#footer'

                #retrieve some info from the wiki
                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
                wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools."
                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
                if not rightsinfourl and not rightsinfotext:
                    wikikeys.append('unknowncopyright')

                wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
            else:
                print 'Item already exists.'
                lang = 'foo'
                wikititle = 'foo'
                wikidesc = 'foo'
                wikikeys = 'foo'
                wikilicenseurl = 'foo'
                wikirights = 'foo'
                wikiurl = 'foo'

            if c == 0:
                # Item metadata
                md = {
                    'mediatype': 'web',
                    'collection': config['collection'],
                    'title': wikititle,
                    'description': wikidesc,
                    'language': lang,
                    'last-updated-date': wikidate_text,
                    'subject': '; '.join(wikikeys), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
                    'licenseurl': wikilicenseurl,
                    'rights': wikirights,
                    'originalurl': wikiurl,
                }
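                # These keys correspond one-to-one to the x-archive-meta-* headers
                # built by hand in the curl-based example; the internetarchive
                # library handles the header encoding itself.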

            # Now also update the item metadata
            #TODO: not needed for the second file in an item
            try:
                item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
                uploadeddumps.append(dump)
                log(wiki, dump, 'ok')
            except:
                log(wiki, dump, 'error?')
            c += 1
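
Beispiel #11 replaces the hand-built curl command with the internetarchive Python library, which wraps the same IAS3 API. A stripped-down sketch of that flow (identifier, filename, metadata and keys are placeholders, not values from the script above):

# Minimal sketch of an upload through the internetarchive library; all names are placeholders.
from internetarchive import get_item

item = get_item('wiki-examplewiki')      # the item is created by the first upload
md = {
    'mediatype': 'web',
    'title': 'Wiki - Example',
    'originalurl': 'http://example.org/api.php',
}
item.upload('examplewiki-20140101-history.xml.7z',
            metadata=md,
            access_key='MYACCESSKEY',
            secret_key='MYSECRETKEY',
            verbose=True)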