Example #1
def getIndex(config={}):
    """ Returns Index.php for a wiki, if available """

    index = ''
    if config['wikiengine'] == 'mediawiki':
        import mediawiki
        index = mediawiki.mwGetIndex(config=config)

    return index
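
A minimal usage sketch, not part of the original example: it assumes getIndex above and the mediawiki helper module are importable, and that the config dict is built elsewhere; every key besides 'wikiengine' shown here is an illustrative assumption.

# Hypothetical usage sketch (config keys other than 'wikiengine' are assumptions)
config = {'wikiengine': 'mediawiki', 'wiki': 'http://wiki.domain.org'}
index = getIndex(config=config)   # delegates to mediawiki.mwGetIndex
print(index or 'index.php not found')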
Example #2
def getParameters(params=[]):
    """ Import parameters into variable """

    if not params:
        params = sys.argv

    config = {}
    parser = argparse.ArgumentParser(
        description='Tools for downloading and preserving wikis.')

    # General params
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=getVersion())
    parser.add_argument('--cookies',
                        metavar="cookies.txt",
                        help="Path to a cookies.txt file.")
    parser.add_argument('--delay',
                        metavar=5,
                        default=0,
                        type=float,
                        help="Adds a delay (in seconds).")
    parser.add_argument('--retries',
                        metavar=5,
                        default=5,
                        help="Maximum number of retries.")
    parser.add_argument('--path', help='Path to store wiki dump at.')
    parser.add_argument(
        '--resume',
        action='store_true',
        help='Resumes previous incomplete dump (requires --path).')
    parser.add_argument('--force', action='store_true', help='')
    parser.add_argument('--user',
                        help='Username if authentication is required.')
    parser.add_argument('--pass',
                        dest='password',
                        help='Password if authentication is required.')

    # URL params
    # This script should work with any general URL, finding out
    # API, index.php or whatever by itself when necessary
    groupWiki = parser.add_argument_group()
    groupWiki.add_argument('wiki',
                           default='',
                           nargs='?',
                           help="URL to wiki (e.g. http://wiki.domain.org).")
    # URL params for MediaWiki
    groupWiki.add_argument(
        '--mwapi',
        help="URL to MediaWiki API (e.g. http://wiki.domain.org/w/api.php).")
    groupWiki.add_argument(
        '--mwindex',
        help=
        "URL to MediaWiki index.php (e.g. http://wiki.domain.org/w/index.php)."
    )

    # Download params
    groupDownload = parser.add_argument_group(
        'Data to download', 'What info to download from the wiki')
    groupDownload.add_argument(
        '--pages',
        action='store_true',
        help=
        "Generates a dump of pages (--pages --curonly for current revisions only)."
    )
    groupDownload.add_argument('--curonly',
                               action='store_true',
                               help='Store only the current version of pages.')
    groupDownload.add_argument('--images',
                               action='store_true',
                               help="Generates an image dump.")
    groupDownload.add_argument(
        '--namespaces',
        metavar="1,2,3",
        help='Comma-separated value of namespaces to include (all by default).'
    )
    groupDownload.add_argument(
        '--exnamespaces',
        metavar="1,2,3",
        help='Comma-separated value of namespaces to exclude.')

    # Meta info params
    groupMeta = parser.add_argument_group(
        'Meta info', 'What meta info to retrieve from the wiki')
    groupMeta.add_argument('--get-api',
                           action='store_true',
                           help="Returns wiki API when available.")
    groupMeta.add_argument('--get-index',
                           action='store_true',
                           help="Returns wiki Index.php when available.")
    groupMeta.add_argument('--get-page-titles',
                           action='store_true',
                           help="Returns wiki page titles.")
    groupMeta.add_argument('--get-image-names',
                           action='store_true',
                           help="Returns wiki image names.")
    groupMeta.add_argument('--get-namespaces',
                           action='store_true',
                           help="Returns wiki namespaces.")
    groupMeta.add_argument('--get-wiki-engine',
                           action='store_true',
                           help="Returns wiki engine.")

    args = parser.parse_args()
    #sys.stderr.write(args)

    # Not wiki? Exit
    if not args.wiki:
        sys.stderr.write('ERROR: Provide a URL to a wiki\n')
        parser.print_help()
        sys.exit(1)

    # Don't mix download params and meta info params
    if (args.pages or args.images) and \
       (args.get_api or args.get_index or args.get_page_titles or args.get_image_names or args.get_namespaces or args.get_wiki_engine):
        sys.stderr.write(
            'ERROR: Don\'t mix download params and meta info params\n')
        parser.print_help()
        sys.exit(1)

    # No download params and no meta info params? Exit
    if (not args.pages and not args.images) and \
       (not args.get_api and not args.get_index and not args.get_page_titles and not args.get_image_names and not args.get_namespaces and not args.get_wiki_engine):
        sys.stderr.write(
            'ERROR: Use at least one download param or meta info param\n')
        parser.print_help()
        sys.exit(1)

    # Load cookies
    cj = cookielib.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        sys.stderr.write('Using cookies from %s\n' % args.cookies)

    # check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        sys.stderr.write(
            'ERROR: Both --user and --pass are required for authentication.\n')
        parser.print_help()
        sys.exit(1)

    session = None
    if args.user and args.password:
        import requests
        session = requests.Session()
        session.cookies = cj
        session.headers.update({'User-Agent': getUserAgent()})
        session.auth = (args.user, args.password)
        #session.mount(args.mw_api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret)) Mediawiki-centric, be careful

    # check URLs
    for url in [args.mwapi, args.mwindex, args.wiki]:
        if url and (not url.startswith('http://')
                    and not url.startswith('https://')):
            sys.stderr.write(url)
            sys.stderr.write(
                'ERROR: URLs must start with http:// or https://\n')
            parser.print_help()
            sys.exit(1)

    # Meta info params
    metainfo = ''  # only one allowed, so we don't mix output
    if args.get_api:
        metainfo = 'get_api'
    elif args.get_index:
        metainfo = 'get_index'
    elif args.get_page_titles:
        metainfo = 'get_page_titles'
    elif args.get_image_names:
        metainfo = 'get_image_names'
    elif args.get_namespaces:
        metainfo = 'get_namespaces'
    elif args.get_wiki_engine:
        metainfo = 'get_wiki_engine'

    namespaces = ['all']
    exnamespaces = []
    # Process namespace inclusions
    if args.namespaces:
        # TODO: why is '-' allowed here? And does '--namespaces= all' (with a space) work?
        if re.search(r'[^\d, \-]',
                     args.namespaces) and args.namespaces.lower() != 'all':
            sys.stderr.write(
                "Invalid namespace values.\nValid format is integer(s) separated by commas\n"
            )
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.namespaces)
            if ns.lower() == 'all':
                namespaces = ['all']
            else:
                namespaces = [int(i) for i in ns.split(',')]

    # Process namespace exclusions
    if args.exnamespaces:
        if re.search(r'[^\d, \-]', args.exnamespaces):
            sys.stderr.write(
                "Invalid namespace values.\nValid format is integer(s) separated by commas\n"
            )
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.exnamespaces)
            if ns.lower() == 'all':
                sys.stderr.write('You cannot exclude all namespaces.\n')
                sys.exit(1)
            else:
                exnamespaces = [int(i) for i in ns.split(',')]

    # --curonly requires --pages
    if args.curonly and not args.pages:
        sys.stderr.write("--curonly requires --pages\n")
        parser.print_help()
        sys.exit(1)

    config = {
        'cookies': args.cookies or '',
        'curonly': args.curonly,
        'date': datetime.datetime.now().strftime('%Y%m%d'),
        'delay': args.delay,
        'exnamespaces': exnamespaces,
        'images': args.images,
        'logs': False,
        'metainfo': metainfo,
        'namespaces': namespaces,
        'pages': args.pages,
        'path': args.path and os.path.normpath(args.path) or '',
        'retries': int(args.retries),
        'wiki': args.wiki,
        'wikicanonical': '',
        'wikiengine': getWikiEngine(args.wiki),
        'other': {
            'configfilename': 'config.txt',
            'filenamelimit': 100,  # do not change
            'force': args.force,
            'resume': args.resume,
            'session': session,
        }
    }

    # Set up engine-specific variables (API for MediaWiki, etc.)
    if config['wikiengine'] == 'mediawiki':
        import mediawiki
        config['mwexport'] = 'Special:Export'
        if not args.mwapi:
            config['mwapi'] = mediawiki.mwGetAPI(config=config)
            if not config['mwapi']:
                sys.stderr.write('ERROR: Provide a URL to API\n')
                sys.exit(1)
            else:
                data = {
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'namespaces',
                    'format': 'json'
                }
                r = getURL(config['mwapi'], data=data)
                config['mwexport'] = getJSON(r)['query']['namespaces']['-1']['*'] \
                    + ':Export'
        if not args.mwindex:
            config['mwindex'] = mediawiki.mwGetIndex(config=config)
            if not config['mwindex']:
                sys.stderr.write('ERROR: Provide a URL to Index.php\n')
                sys.exit(1)
    elif config['wikiengine'] == 'wikispaces':
        import wikispaces
        # use wikicanonical for base url for Wikispaces?

    # calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config),
                                               config['date'])

    return config
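
A minimal sketch of how getParameters could be driven, assuming it lives in a dumpgenerator-style module alongside the helpers it calls (getVersion, getWikiEngine, getURL, getJSON, domain2prefix); the script name and URL are illustrative only, and building the config probes the wiki over the network.

# Hypothetical usage sketch: simulate a command line, then build the config.
# Note that parse_args() reads sys.argv directly, so the params argument of
# getParameters() does not influence parsing; we patch sys.argv instead.
import sys

sys.argv = ['dumpgenerator.py', 'http://wiki.domain.org', '--pages', '--curonly']
config = getParameters()
print(config['wikiengine'], config['path'])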
Example #3
def getParameters(params=[]):
    """ Import parameters into variable """
    
    if not params:
        params = sys.argv
    
    config = {}
    parser = argparse.ArgumentParser(description='Tools for downloading and preserving wikis.')

    # General params
    parser.add_argument(
        '-v', '--version', action='version', version=getVersion())
    parser.add_argument(
        '--cookies', metavar="cookies.txt", help="Path to a cookies.txt file.")
    parser.add_argument(
        '--delay',
        metavar=5,
        default=0,
        type=float,
        help="Adds a delay (in seconds).")
    parser.add_argument(
        '--retries',
        metavar=5,
        default=5,
        help="Maximum number of retries.")
    parser.add_argument('--path', help='Path to store wiki dump at.')
    parser.add_argument(
        '--resume',
        action='store_true',
        help='Resumes previous incomplete dump (requires --path).')
    parser.add_argument('--force', action='store_true', help='')
    parser.add_argument(
        '--user', help='Username if authentication is required.')
    parser.add_argument(
        '--pass',
        dest='password',
        help='Password if authentication is required.')

    # URL params
    # This script should work with any general URL, finding out
    # API, index.php or whatever by itself when necessary
    groupWiki = parser.add_argument_group()
    groupWiki.add_argument(
        'wiki',
        default='',
        nargs='?',
        help="URL to wiki (e.g. http://wiki.domain.org).")
    # URL params for MediaWiki
    groupWiki.add_argument(
        '--mwapi',
        help="URL to MediaWiki API (e.g. http://wiki.domain.org/w/api.php).")
    groupWiki.add_argument(
        '--mwindex',
        help="URL to MediaWiki index.php (e.g. http://wiki.domain.org/w/index.php).")

    # Download params
    groupDownload = parser.add_argument_group(
        'Data to download',
        'What info to download from the wiki')
    groupDownload.add_argument(
        '--pages',
        action='store_true',
        help="Generates a dump of pages (--pages --curonly for current revisions only).")
    groupDownload.add_argument('--curonly', action='store_true',
                               help='Store only the current version of pages.')
    groupDownload.add_argument(
        '--images', action='store_true', help="Generates an image dump.")
    groupDownload.add_argument(
        '--namespaces',
        metavar="1,2,3",
        help='Comma-separated value of namespaces to include (all by default).')
    groupDownload.add_argument(
        '--exnamespaces',
        metavar="1,2,3",
        help='Comma-separated value of namespaces to exclude.')

    # Meta info params
    groupMeta = parser.add_argument_group(
        'Meta info',
        'What meta info to retrieve from the wiki')
    groupMeta.add_argument(
        '--get-api',
        action='store_true',
        help="Returns wiki API when available.")
    groupMeta.add_argument(
        '--get-index',
        action='store_true',
        help="Returns wiki Index.php when available.")
    groupMeta.add_argument(
        '--get-page-titles',
        action='store_true',
        help="Returns wiki page titles.")
    groupMeta.add_argument(
        '--get-image-names',
        action='store_true',
        help="Returns wiki image names.")
    groupMeta.add_argument(
        '--get-namespaces',
        action='store_true',
        help="Returns wiki namespaces.")
    groupMeta.add_argument(
        '--get-wiki-engine',
        action='store_true',
        help="Returns wiki engine.")

    args = parser.parse_args()
    #sys.stderr.write(args)
    
    # Not wiki? Exit
    if not args.wiki:
        sys.stderr.write('ERROR: Provide a URL to a wiki\n')
        parser.print_help()
        sys.exit(1)
    
    # Don't mix download params and meta info params
    if (args.pages or args.images) and \
       (args.get_api or args.get_index or args.get_page_titles or args.get_image_names or args.get_namespaces or args.get_wiki_engine):
        sys.stderr.write('ERROR: Don\'t mix download params and meta info params\n')
        parser.print_help()
        sys.exit(1)

    # No download params and no meta info params? Exit
    if (not args.pages and not args.images) and \
       (not args.get_api and not args.get_index and not args.get_page_titles and not args.get_image_names and not args.get_namespaces and not args.get_wiki_engine):
        sys.stderr.write('ERROR: Use at least one download param or meta info param\n')
        parser.print_help()
        sys.exit(1)

    # Load cookies
    cj = cookielib.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        sys.stderr.write('Using cookies from %s\n' % args.cookies)

    # check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        sys.stderr.write('ERROR: Both --user and --pass are required for authentication.\n')
        parser.print_help()
        sys.exit(1)
    
    session = None
    if args.user and args.password:
        import requests
        session = requests.Session()
        session.cookies = cj
        session.headers.update({'User-Agent': getUserAgent()})
        session.auth = (args.user, args.password)
        #session.mount(args.mw_api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret)) Mediawiki-centric, be careful

    # check URLs
    for url in [args.mwapi, args.mwindex, args.wiki]:
        if url and (not url.startswith('http://') and not url.startswith('https://')):
            sys.stderr.write(url)
            sys.stderr.write('ERROR: URLs must start with http:// or https://\n')
            parser.print_help()
            sys.exit(1)
    
    # Meta info params
    metainfo = '' # only one allowed, so we don't mix output
    if args.get_api:
        metainfo = 'get_api'
    elif args.get_index:
        metainfo = 'get_index'
    elif args.get_page_titles:
        metainfo = 'get_page_titles'
    elif args.get_image_names:
        metainfo = 'get_image_names'
    elif args.get_namespaces:
        metainfo = 'get_namespaces'
    elif args.get_wiki_engine:
        metainfo = 'get_wiki_engine'

    namespaces = ['all']
    exnamespaces = []
    # Process namespace inclusions
    if args.namespaces:
        # TODO: why is '-' allowed here? And does '--namespaces= all' (with a space) work?
        if re.search(
                r'[^\d, \-]',
                args.namespaces) and args.namespaces.lower() != 'all':
            sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n")
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.namespaces)
            if ns.lower() == 'all':
                namespaces = ['all']
            else:
                namespaces = [int(i) for i in ns.split(',')]

    # Process namespace exclusions
    if args.exnamespaces:
        if re.search(r'[^\d, \-]', args.exnamespaces):
            sys.stderr.write("Invalid namespace values.\nValid format is integer(s) separated by commas\n")
            sys.exit(1)
        else:
            ns = re.sub(' ', '', args.exnamespaces)
            if ns.lower() == 'all':
                sys.stderr.write('You cannot exclude all namespaces.\n')
                sys.exit(1)
            else:
                exnamespaces = [int(i) for i in ns.split(',')]

    # --curonly requires --pages
    if args.curonly and not args.pages:
        sys.stderr.write("--curonly requires --pages\n")
        parser.print_help()
        sys.exit(1)
    
    config = {
        'cookies': args.cookies or '', 
        'curonly': args.curonly, 
        'date': datetime.datetime.now().strftime('%Y%m%d'), 
        'delay': args.delay, 
        'exnamespaces': exnamespaces, 
        'images': args.images, 
        'logs': False, 
        'metainfo': metainfo, 
        'namespaces': namespaces, 
        'pages': args.pages, 
        'path': args.path and os.path.normpath(args.path) or '', 
        'retries': int(args.retries), 
        'wiki': args.wiki, 
        'wikicanonical': '', 
        'wikiengine': getWikiEngine(args.wiki), 
        'other': {
            'configfilename': 'config.txt', 
            'filenamelimit': 100,  # do not change
            'force': args.force, 
            'resume': args.resume, 
            'session': session, 
        }
    }
    
    # Set up engine-specific variables (API for MediaWiki, etc.)
    if config['wikiengine'] == 'mediawiki':
        import mediawiki
        config['mwexport'] = 'Special:Export'
        if not args.mwapi:
            config['mwapi'] = mediawiki.mwGetAPI(config=config)
            if not config['mwapi']:
                sys.stderr.write('ERROR: Provide a URL to API\n')
                sys.exit(1)
            else:
                data = {
                    'action': 'query',
                    'meta': 'siteinfo',
                    'siprop': 'namespaces',
                    'format': 'json'}
                r = getURL(config['mwapi'], data=data)
                config['mwexport'] = getJSON(r)['query']['namespaces']['-1']['*'] \
                    + ':Export'
        if not args.mwindex:
            config['mwindex'] = mediawiki.mwGetIndex(config=config)
            if not config['mwindex']:
                sys.stderr.write('ERROR: Provide a URL to Index.php\n')
                sys.exit(1)
    elif config['wikiengine'] == 'wikispaces':
        import wikispaces
        # use wikicanonical for base url for Wikispaces?
    
    # calculating path, if not defined by user with --path=
    if not config['path']:
        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])

    return config