def do_abstractsbackup(wikidb, output_files, variants,
                       wikiconf, start, end, dryrun, verbose):
    '''
    produce the abstracts xml dump for a wiki, one output file
    per language variant

    each entry in variants is paired with the entry at the same
    position in output_files. dumpBackup.php is set up to write
    every variant to its own uncompressed temporary file, and
    do_xml_stream is then called three times in a row: once with
    header=True, once with neither flag, once with footer=True

    args:
        wikidb:       name of the wiki database
        output_files: list of final output file paths, in the same
                      order as variants
        variants:     list of variant names; each one is appended to
                      "--filter=abstract" for its own output
        wikiconf:     config object supplying php, temp_dir, wiki_dir
        start, end:   handed unchanged to do_xml_stream
        dryrun:       if true, None is registered in place of the
                      compression function for each output file
        verbose:      handed unchanged to do_xml_stream
    '''
    # variant name -> info about its output file
    outfiles = dict(
        (variant, {'name': output_files[position]})
        for position, variant in enumerate(variants))

    for info in outfiles.values():
        info['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(info['name']) + "_tmp")
        compressor = None if dryrun else gzippit_append
        info['compr'] = [compressor, info['name']]

    script_pieces = MultiVersion.mw_script_as_array(wikiconf,
                                                    "dumpBackup.php")
    command = [wikiconf.php] + script_pieces

    # the extension lives under the versioned MediaWiki directory if
    # a version is reported for this wiki
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version

    # AbstractFilter.php may be at the top of the extension or
    # under includes/; use the latter if the former is missing
    filter_path = os.path.join(
        abstract_cmd_dir, "extensions/ActiveAbstract/AbstractFilter.php")
    if not os.path.exists(filter_path):
        filter_path = os.path.join(
            abstract_cmd_dir,
            "extensions/ActiveAbstract/includes/AbstractFilter.php")

    command += ["--wiki=%s" % wikidb, abstract_cmd_dir,
                "--plugin=AbstractFilter:" + filter_path,
                "--current", "--report=1000", "--namespaces=0"]

    # each output gets its own set of filters
    for variant, info in outfiles.items():
        command += ["--output=file:%s" % info['temp'],
                    "--filter=namespace:NS_MAIN",
                    "--filter=noredirect",
                    "--filter=abstract%s" % variant]

    # header pass, body pass, footer pass, in that order
    for stage_flags in ({'header': True}, {}, {'footer': True}):
        do_xml_stream(wikidb, outfiles, command, wikiconf,
                      start, end, dryrun, 'page_id', 'page',
                      5000, 10000, '</doc>\n', verbose=verbose,
                      **stage_flags)
Example #2
0
 def dump_revs(self):
     '''
     dump revision content corresponding to previously-dumped
     stubs (revision metadata)
     '''
     if not self.steps['revs']['run']:
         return True
     dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
     outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
     revsfile = RevsFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
     outputfile = revsfile.get_filename()
     script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                      "dumpTextPass.php")
     command = [self.wiki.config.php]
     command.extend(script_command)
     stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
     stuboutputfile = stubfile.get_filename()
     command.extend(["--wiki=%s" % self.wiki.db_name,
                     "--stub=gzip:%s" % os.path.join(outputdir, stuboutputfile),
                     "--quiet",
                     "--spawn=%s" % self.wiki.config.php,
                     "--output=bzip2:%s" % os.path.join(outputdir, outputfile)])
     if self.dryrun:
         print "would run command for revs dump:", command
     else:
         log.info("running with no output: " + " ".join(command))
         success = RunSimpleCommand.run_with_no_output(
             command, shell=False, timeout=self.get_lock_timeout_interval(),
             timeout_callback=self.periodic_callback)
         if not success:
             log.warning("error producing revision text files"
                         " for wiki %s", self.wiki.db_name)
             return False
     return True
Example #3
0
    def dump_stub(self, start_revid, end_revid):
        '''
        dump stubs (metadata) for revs from start_revid
        up to but not including end_revid
        '''
        if not self.steps['stubs']['run']:
            return True

        dumpdir = MiscDumpDir(self.wiki.config, self.wiki.date)
        outputdir = dumpdir.get_dumpdir(self.wiki.db_name, self.wiki.date)
        stubfile = StubFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        outputfile = stubfile.get_filename()
        script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                         "dumpBackup.php")
        command = [self.wiki.config.php]
        command.extend(script_command)
        command.extend(["--wiki=%s" % self.wiki.db_name, "--stub", "--quiet",
                        "--output=gzip:%s" % os.path.join(outputdir, outputfile),
                        "--revrange", "--revstart=%s" % start_revid,
                        "--revend=%s" % end_revid])
        if self.dryrun:
            print "would run command for stubs dump:", command
        else:
            log.info("running with no output: " + " ".join(command))
            success = RunSimpleCommand.run_with_no_output(
                command, shell=False, timeout=self.get_lock_timeout_interval(),
                timeout_callback=self.periodic_callback)
            if not success:
                log.warning("error producing stub files for wiki %s", self.wiki.db_name)
                return False
        return True
Example #4
0
    def run(self, runner):
        '''
        clean up old files, then run the Flow dumpBackup.php
        maintenance script, writing bzip2 output to the public path
        of the single output file for this step

        raises BackupError if the step lists more than one output
        file, if the configured php command does not exist, or if
        runner.run_command reports an error
        '''
        self.cleanup_old_files(runner.dump_dir, runner)

        outfiles = self.list_outfiles_for_build_command(runner.dump_dir)
        if len(outfiles) > 1:
            raise BackupError("flow content step wants to produce more than one output file")
        output_file_obj = outfiles[0]

        php = runner.wiki.config.php
        if not os.path.exists(php):
            raise BackupError("php command %s not found" % php)

        flow_output_file = runner.dump_dir.filename_public_path(output_file_obj)
        script_pieces = MultiVersion.mw_script_as_array(
            runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")

        command = [runner.wiki.config.php]
        command.extend(script_pieces)
        command += ["--wiki=%s" % runner.db_name,
                    "--current", "--report=1000",
                    "--output=bzip2:%s" % flow_output_file]
        if self.history:
            command.append("--full")

        # one command -> one pipeline -> one series -> list of series
        all_series = [[[command]]]
        failed = runner.run_command(all_series,
                                    callback_stderr=self.progress_callback,
                                    callback_stderr_arg=runner)
        if failed:
            raise BackupError("error dumping flow page files")
Example #5
0
def dologsbackup(wikidb, outfile,
                 wikiconf, start, end, dryrun):
    '''
    do a logs xml dump via dumpBackup.php --logs, having it write
    to an uncompressed temporary file, and hand everything to
    do_xml_stream to be processed by 'log_id' on 'logging'

    args:
        wikidb:     name of the wiki database
        outfile:    path of the final output file
        wikiconf:   config object supplying php and temp_dir
        start, end: handed unchanged to do_xml_stream
        dryrun:     if true, no compressor is set up (None is
                    recorded instead of the result of gzippit)
    '''
    logs_info = {'name': outfile}
    logs_info['temp'] = os.path.join(
        wikiconf.temp_dir, os.path.basename(outfile) + "_tmp")
    logs_info['compr'] = None if dryrun else gzippit(outfile)
    outfiles = {'logs': logs_info}

    script_pieces = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_pieces
    command += ["--wiki=%s" % wikidb,
                "--logs", "--report=1000",
                "--output=file:%s" % logs_info['temp']]

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'log_id', 'logging',
                  50000, 100000, '</logitem>\n')
Example #6
0
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun, verbose):
    '''
    do a stubs xml dump via dumpBackup.php --full --stub, with up
    to three outputs (history, current, articles), each written to
    its own uncompressed temporary file

    any of history_file, current_file, articles_file may be None,
    in which case that output is skipped altogether

    args:
        wikidb:        name of the wiki database
        history_file:  final path for the history stubs, or None
        current_file:  final path for the current stubs, or None
        articles_file: final path for the articles stubs, or None
        wikiconf:      config object supplying php, temp_dir and
                       stubs_orderrevs
        start, end:    handed unchanged to do_xml_stream
        dryrun:        if true, None is registered in place of the
                       compression function for each output file
        verbose:       handed unchanged to do_xml_stream
    '''
    # output type, requested file name, filters that follow its
    # --output option on the command line
    requested = [
        ('history', history_file, []),
        ('current', current_file, ["--filter=latest"]),
        ('articles', articles_file, ["--filter=latest", "--filter=notalk",
                                     "--filter=namespace:!NS_USER"]),
    ]

    outfiles = {}
    for filetype, filename, _filters in requested:
        if filename is not None:
            outfiles[filetype] = {'name': filename}

    for info in outfiles.values():
        info['temp'] = os.path.join(
            FileUtils.wiki_tempdir(wikidb, wikiconf.temp_dir),
            os.path.basename(info['name']) + "_tmp")
        compressor = None if dryrun else gzippit_append
        info['compr'] = [compressor, info['name']]

    script_pieces = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_pieces
    command += ["--wiki=%s" % wikidb,
                "--full", "--stub", "--report=1000"]

    for filetype, _filename, filters in requested:
        if filetype not in outfiles:
            continue
        command.append("--output=file:%s" % outfiles[filetype]['temp'])
        command.extend(filters)

    callback = None
    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval

    # the xml header, the body, and the xml footer should be separate gzipped
    # streams all concatted together
    # note that do_xml_stream exits on failure after cleaning up all output files
    # so the parent process must simply retry later
    for stage_flags in ({'header': True}, {}, {'footer': True}):
        do_xml_stream(wikidb, outfiles, command, wikiconf,
                      start, end, dryrun, 'page_id', 'page',
                      5000, 20000, '</page>\n', verbose=verbose,
                      callback=callback, **stage_flags)
Example #7
0
def do_abstractsbackup(wikidb, output_files, variants,
                       wikiconf, start, end, dryrun):
    '''
    produce the abstracts xml dump for a wiki, one output file
    per language variant

    each entry in variants is paired with the entry at the same
    position in output_files. dumpBackup.php is set up to write
    every variant to its own uncompressed temporary file, and the
    whole thing is handed to do_xml_stream

    args:
        wikidb:       name of the wiki database
        output_files: list of final output file paths, in the same
                      order as variants
        variants:     list of variant names; each one is appended to
                      "--filter=abstract" for its own output
        wikiconf:     config object supplying php, temp_dir, wiki_dir
        start, end:   handed unchanged to do_xml_stream
        dryrun:       if true, None is recorded instead of the
                      result of catit for each output file
    '''
    # variant name -> info about its output file
    outfiles = dict(
        (variant, {'name': output_files[position]})
        for position, variant in enumerate(variants))

    for info in outfiles.values():
        info['temp'] = os.path.join(
            wikiconf.temp_dir,
            os.path.basename(info['name']) + "_tmp")
        info['compr'] = None if dryrun else catit(info['name'])

    script_pieces = MultiVersion.mw_script_as_array(wikiconf,
                                                    "dumpBackup.php")
    command = [wikiconf.php] + script_pieces

    # the extension lives under the versioned MediaWiki directory if
    # a version is reported for this wiki
    version = MultiVersion.mw_version(wikiconf, wikidb)
    abstract_cmd_dir = wikiconf.wiki_dir
    if version:
        abstract_cmd_dir = abstract_cmd_dir + "/" + version

    abstract_filter = ("--plugin=AbstractFilter:"
                       "%s/extensions/ActiveAbstract/AbstractFilter.php"
                       % abstract_cmd_dir)
    command += ["--wiki=%s" % wikidb, abstract_cmd_dir,
                abstract_filter,
                "--current", "--report=1000"]

    # each output gets its own set of filters
    for variant, info in outfiles.items():
        command += ["--output=file:%s" % info['temp'],
                    "--filter=namespace:NS_MAIN",
                    "--filter=noredirect",
                    "--filter=abstract%s" % variant]

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  20000, 30000, '</doc>\n')
Example #8
0
 def get_domain_from_wikidbname(self):
     '''
     given the name of the wiki db, turn this into the
     fqdn of the wiki project (i.e. enwiki -> en.wikipedia.org)

     this is done by piping 'echo $wgCanonicalServer;' into
     MediaWiki's eval.php for the wiki, through the shell, and
     taking the text that follows the first '//' in the output

     returns the domain with trailing whitespace stripped, or
     None (after logging a warning) if the script produced no
     output or the output contains no '//' to split on
     '''
     script_command = MultiVersion.mw_script_as_array(self.wiki.config,
                                                      "eval.php")
     # echo $wgCanonicalServer | php "$multiversionscript" eval.php $wiki
     command = ["echo", "'echo $wgCanonicalServer;'", "|", self.wiki.config.php]
     command.extend(script_command)
     command.append(self.wiki.db_name)
     # the pipe means this must go through the shell as one string
     command_text = " ".join(command)
     self.log.info("running with no output: %s", command_text)
     output = RunSimpleCommand.run_with_output(command_text, shell=True)
     if not output:
         self.log.warning("error retrieving domain for wiki %s", self.wiki.db_name)
         return None

     pieces = output.decode('utf-8').split('//')
     if len(pieces) < 2:
         # output without a protocol separator (e.g. an error message
         # instead of a url) used to blow up with IndexError here;
         # treat it like any other failure to get the domain
         self.log.warning("unexpected output retrieving domain for wiki %s",
                          self.wiki.db_name)
         return None
     # rstrip gets rid of any trailing newlines from eval.php
     return pieces[1].rstrip()
Example #9
0
    def get_db_user_and_password(self):
        '''
        retrieve the db user and password for this wiki by running
        MediaWiki's getConfiguration.php through the shell and
        parsing its json output

        yes, this means you need a full installation of MediaWiki
        (but not web service) in order to use this method

        returns the tuple (wgDBuser value, wgDBpassword value)
        '''
        script_pieces = MultiVersion.mw_script_as_array(
            self.config, "getConfiguration.php")
        wanted_settings = ["wgDBuser", "wgDBpassword"]

        template = "{php} {command} --wiki={dbname} --format=json --regex='{vars}'"
        command = template.format(
            php=MiscUtils.shell_escape(self.config.php),
            command=" ".join(script_pieces),
            dbname=MiscUtils.shell_escape(self.db_name),
            vars="|".join(wanted_settings))

        raw_output = RunSimpleCommand.run_with_output(command, shell=True).strip()
        settings = json.loads(raw_output.decode('utf-8'))
        return settings['wgDBuser'], settings['wgDBpassword']
    def build_command(self, runner, stub_dfname, prefetch, output_dfname):
        """
        Build the dumpTextPass.php command line for the dump, with the
        output/filter options from build_filters() and the option from
        build_eta() placed at the end.

        args:
            runner:        Runner; supplies the wiki config, db name
                           and dump directory
            stub_dfname:   DumpFilename of the stub file to read
            prefetch:      prefetch option for the command, formatted
                           into the command line with "%s"
            output_dfname: DumpFilename handed to build_filters()
        returns:
            a command series consisting of one pipeline which has
            this one command in it
        raises:
            BackupError if the configured php command does not exist
        """
        temp_stub_path = os.path.join(
            FileUtils.wiki_tempdir(self.wiki.db_name, self.wiki.config.temp_dir),
            stub_dfname.filename)
        if os.path.exists(temp_stub_path):
            # if this is a pagerange stub file in temp dir, use that
            stub_source = temp_stub_path
        elif runner.wiki.is_private():
            # otherwise use the regular stub file
            stub_source = runner.dump_dir.filename_private_path(stub_dfname)
        else:
            stub_source = runner.dump_dir.filename_public_path(stub_dfname)
        stub_option = "--stub=gzip:%s" % stub_source

        spawn = "--spawn=%s" % (self.wiki.config.php) if self.jobinfo['spawn'] else ""

        if not exists(self.wiki.config.php):
            raise BackupError("php command %s not found" % self.wiki.config.php)

        script_pieces = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
        dump_command = [self.wiki.config.php] + script_pieces
        dump_command += ["--wiki=%s" % runner.db_name,
                         stub_option,
                         "%s" % prefetch,
                         "--report=1000",
                         spawn]

        # NOTE(review): the option entries above are all strings by
        # this point (an empty spawn stays in the list as ""), so this
        # looks like it only drops a None coming from the php or
        # script entries -- confirm whether empty options were meant
        # to be removed here too
        dump_command = [arg for arg in dump_command if arg is not None]
        dump_command.extend([self.build_filters(runner, output_dfname), self.build_eta()])

        # return a command series of one pipeline
        return [[dump_command]]
Example #11
0
 def get_command(self, wiki, output_dir, outfile_base, base):
     '''
     put together and return the command to run, as a list

     for a script name ending in .php, the list starts with the
     php command and the multiversion invocation of the script,
     followed by "--wiki" and the db name, all taken from base
     (or from wiki, if base is None); for anything else the list
     starts with just the script name

     any configured args are added after that, and finally every
     entry has {DIR}, {FILE} and {w} filled in with output_dir,
     outfile_base and wiki.db_name respectively
     '''
     source = wiki if base is None else base

     if self.scriptname.endswith('.php'):
         script_pieces = MultiVersion.mw_script_as_array(
             source.config, self.scriptname)
         pieces = [source.config.php] + script_pieces
         pieces += ["--wiki", source.db_name]
     else:
         pieces = [self.scriptname]

     if self.args is not None:
         pieces.extend(self.args)

     filled_in = []
     for piece in pieces:
         filled_in.append(
             piece.format(DIR=output_dir, FILE=outfile_base, w=wiki.db_name))
     return filled_in
Example #12
0
    def build_command(self, runner, output_dfname):
        '''
        build the command for running the Flow dumpBackup.php
        maintenance script with bzip2 output going to the
        in-progress name of the output file, in the private or
        the public dump directory depending on the wiki

        returns a command series consisting of one pipeline
        which has this one command in it

        raises BackupError if the configured php command does
        not exist
        '''
        php = runner.wiki.config.php
        if not os.path.exists(php):
            raise BackupError("php command %s not found" % php)

        if runner.wiki.is_private():
            path_for = runner.dump_dir.filename_private_path
        else:
            path_for = runner.dump_dir.filename_public_path
        flow_output_fpath = path_for(output_dfname)

        script_pieces = MultiVersion.mw_script_as_array(
            runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")

        command = [runner.wiki.config.php] + script_pieces
        command += ["--wiki=%s" % runner.db_name,
                    "--current", "--report=1000",
                    "--output=bzip2:%s" % DumpFilename.get_inprogress_name(flow_output_fpath)]
        if self.history:
            command.append("--full")

        # one command -> one pipeline -> one series
        return [[command]]
Example #13
0
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, start, end, dryrun):
    '''
    do a stubs xml dump via dumpBackup.php --full --stub, with
    three outputs (history, current, articles), each written to
    its own uncompressed temporary file, and hand everything to
    do_xml_stream

    args:
        wikidb:        name of the wiki database
        history_file:  final path for the history stubs
        current_file:  final path for the current stubs
        articles_file: final path for the articles stubs
        wikiconf:      config object supplying php, temp_dir and
                       stubs_orderrevs
        start, end:    handed unchanged to do_xml_stream
        dryrun:        if true, None is recorded instead of the
                       result of gzippit for each output file
    '''
    outfiles = {
        'history': {'name': history_file},
        'current': {'name': current_file},
        'articles': {'name': articles_file},
    }
    for info in outfiles.values():
        info['temp'] = os.path.join(
            wikiconf.temp_dir, os.path.basename(info['name']) + "_tmp")
        info['compr'] = None if dryrun else gzippit(info['name'])

    script_pieces = MultiVersion.mw_script_as_array(wikiconf, "dumpBackup.php")
    command = [wikiconf.php] + script_pieces
    command += ["--wiki=%s" % wikidb,
                "--full", "--stub", "--report=1000"]
    # filters apply to the --output option they come after
    command += ["--output=file:%s" % outfiles['history']['temp']]
    command += ["--output=file:%s" % outfiles['current']['temp'],
                "--filter=latest"]
    command += ["--output=file:%s" % outfiles['articles']['temp'],
                "--filter=latest", "--filter=notalk",
                "--filter=namespace:!NS_USER"]

    callback = None
    if wikiconf.stubs_orderrevs:
        command.append("--orderrevs")
        callback = get_page_interval

    do_xml_stream(wikidb, outfiles, command, wikiconf,
                  start, end, dryrun, 'page_id', 'page',
                  5000, 100000, '</page>\n', callback)
Example #14
0
    def build_command(self, runner, stub_file):
        """
        Build the dumpTextPass.php command line for the dump, with the
        results of build_filters() and build_eta() placed at the end.

        args:
            runner:    Runner; supplies the wiki config, db name and
                       dump directory, and is told about progress
                       via show_runner_state()
            stub_file: DumpFilename of the stub file to read
        returns:
            a command series consisting of one pipeline which has
            this one command in it
        raises:
            BackupError if the configured php command does not exist
        """
        # we write a temp file, it will be checkpointed every so often.
        write_temp = bool(self._checkpoints_enabled)

        dump_output = DumpFilename(self.wiki, stub_file.date, self.dumpname,
                                   self.get_filetype(), self.file_ext, stub_file.partnum,
                                   DumpFilename.make_checkpoint_string(stub_file.first_page_id,
                                                                       stub_file.last_page_id),
                                   write_temp)

        temp_stub_path = os.path.join(self.wiki.config.temp_dir, stub_file.filename)
        if os.path.exists(temp_stub_path):
            # if this is a partial stub file in temp dir, use that
            stub_source = temp_stub_path
        else:
            # use regular stub file
            stub_source = runner.dump_dir.filename_public_path(stub_file)
        stub_option = "--stub=gzip:%s" % stub_source

        # Try to pull text from the previous run; most stuff hasn't changed
        # Source=$OutputDir/pages_$section.xml.bz2
        sources = []
        if self._prefetch:
            candidates = self._find_previous_dump(runner, dump_output.partnum)
            # there may be more than one candidate; check existence for
            # each, the ones that exist get put together in a string below
            for candidate in candidates or []:
                # if we are doing partial stub run, include only the analogous
                # checkpointed prefetch files, if there are checkpointed files
                # otherwise we'll use the all the sourcefiles reported
                if self.chkptfile_in_pagerange(stub_file, candidate):
                    candidate_path = runner.dump_dir.filename_public_path(
                        candidate, candidate.date)
                    if exists(candidate_path):
                        sources.append(candidate_path)

        partnum_str = "%s" % stub_file.partnum if dump_output.partnum else ""

        if sources:
            source = "bzip2:%s" % (";".join(sources))
            runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..." %
                                     (self._subset, partnum_str, source))
            prefetch = "--prefetch=%s" % (source)
        else:
            runner.show_runner_state("... building %s %s XML dump, no text prefetch..." %
                                     (self._subset, partnum_str))
            prefetch = ""

        spawn = "--spawn=%s" % (self.wiki.config.php) if self._spawn else ""

        if not exists(self.wiki.config.php):
            raise BackupError("php command %s not found" % self.wiki.config.php)

        checkpoint_time = ""
        checkpoint_file = ""
        if self._checkpoints_enabled:
            checkpoint_time = "--maxtime=%s" % (self.wiki.config.checkpoint_time)
            # the "p%sp%s" is left in the name as given here
            checkpoint_file = "--checkpointfile=%s" % dump_output.new_filename(
                dump_output.dumpname, dump_output.file_type, dump_output.file_ext,
                dump_output.date, dump_output.partnum, "p%sp%s", None)

        script_pieces = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
        command = [self.wiki.config.php]
        command.extend(script_pieces)
        command += ["--wiki=%s" % runner.db_name,
                    stub_option,
                    prefetch,
                    checkpoint_time,
                    checkpoint_file,
                    "--report=1000",
                    spawn]

        # options that were not wanted are "" rather than None at this
        # point, so they remain in the command as empty entries
        command = [arg for arg in command if arg is not None]

        filters = self.build_filters(runner, dump_output)
        eta = self.build_eta(runner)
        command.extend([filters, eta])

        # one command -> one pipeline -> one series
        return [[command]]