Python DumpFilename Examples, dumps.fileutils.DumpFilename Python Examples

Example #1

0

Show file

File: apijobs.py Project: wikimedia/operations-dumps

    def run(self, runner):
        """
        Dump the siteinfo properties to a single output file, retrying on error.

        The output is written under an in-progress name; the completion
        callback handed to runner.save_command() is presumably what moves it
        to its final name once the command series has succeeded.

        args:
            runner: Runner object; this method uses its wiki, dump_dir,
                    get_save_command_series() and save_command()
        raises:
            BackupError if more than one output file would be produced, or if
            the command still fails after max_retries additional attempts
        """
        maxretries = runner.wiki.config.max_retries
        dfnames = self.list_outfiles_for_build_command(runner.dump_dir)
        if len(dfnames) > 1:
            raise BackupError("siteinfo dump %s trying to produce more than one file" %
                              self.dumpname)
        output_dfname = dfnames[0]
        commands = self.build_command(runner)

        # private wikis write into the private tree, everything else is public
        if runner.wiki.is_private():
            output_path = runner.dump_dir.filename_private_path(output_dfname)
        else:
            output_path = runner.dump_dir.filename_public_path(output_dfname)
        command_series = runner.get_save_command_series(
            commands, DumpFilename.get_inprogress_name(output_path))
        self.setup_command_info(runner, command_series, [output_dfname])

        error, _broken = runner.save_command(command_series, self.command_completion_callback)
        retries = 0
        while error and retries < maxretries:
            retries += 1
            time.sleep(5)
            # retries must get the completion callback too; without it a
            # successful retry would leave the output under its in-progress name
            error, _broken = runner.save_command(
                command_series, self.command_completion_callback)
        if error:
            raise BackupError("error dumping siteinfo props %s" % ','.join(self._properties))

Example #2

0

Show file

    def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                      checkpoint=None, onlyparts=False):
        """
        Remove symlinks in the 'latest' directory whose targets carry a date
        earlier than date_string.

        A link is kept when dump_name, partnum (or onlyparts) or checkpoint
        is given and the target's corresponding field does not match.
        Nothing is done unless SymLinks.NAME is among the enabled features.
        """
        # fixme
        # this needs to do more work if there are file parts or checkpoint files linked in here from
        # earlier dates. checkpoint ranges change, and configuration of parallel jobs for file parts
        # changes too, so maybe old files still exist and the links need to be removed because we
        # have newer files for the same phase of the dump.

        if SymLinks.NAME not in self._enabled:
            return

        latest_dir = self.dump_dir.latest_dir()
        for entry in os.listdir(latest_dir):
            link_path = os.path.join(latest_dir, entry)
            if not os.path.islink(link_path):
                continue

            target = os.readlink(link_path)
            target_dfname = DumpFilename(self.dump_dir._wiki)
            target_dfname.new_from_filename(os.path.basename(target))
            if not target_dfname.date < date_string:
                # link points at a file from this run or a later one
                continue

            # fixme check that these are ok if the value is None
            if ((dump_name and (target_dfname.dumpname != dump_name)) or
                    ((partnum or onlyparts) and (target_dfname.partnum != partnum)) or
                    (checkpoint and (target_dfname.checkpoint != checkpoint))):
                continue

            self.debugfn("Removing old symlink %s -> %s" % (link_path, target))
            os.remove(link_path)

Example #3

0

Show file

File: recompressjobs.py Project: wikimedia/operations-dumps

    def build_command(self, runner, output_dfname):
        '''
        Build the command that decompresses the input file and feeds it to
        the recompressxml program, producing a multistream file and its index.

        arguments:
        runner: Runner object
        output_dfname: output file that will be produced

        returns: a list with one pipeline; the pipeline is a list holding
        one command, itself a list with a single shell string
        '''
        # the input is the same file as the output except for the extension
        input_dfname = DumpFilename(self.wiki, None, output_dfname.dumpname,
                                    output_dfname.file_type,
                                    self.item_for_recompression.file_ext,
                                    output_dfname.partnum, output_dfname.checkpoint)

        if runner.wiki.is_private():
            path_for = runner.dump_dir.filename_private_path
        else:
            path_for = runner.dump_dir.filename_public_path

        outfilepath = path_for(self.get_multistream_dfname(output_dfname))
        outfilepath_index = path_for(self.get_multistream_index_dfname(output_dfname))
        infilepath = path_for(input_dfname)

        # both outputs are written under their in-progress names
        shell_command = ("%s -dc %s | %s --pagesperstream 100 --buildindex %s -o %s" %
                         (self.wiki.config.bzip2, infilepath, self.wiki.config.recompressxml,
                          DumpFilename.get_inprogress_name(outfilepath_index),
                          DumpFilename.get_inprogress_name(outfilepath)))
        return [[[shell_command]]]

Example #4

0

Show file

File: checksummers.py Project: wikimedia/operations-dumps

 def get_per_file_path(self, htype, filename):
     '''
     given a checksum type and a filename, return the full path (in the
     public tree) of the file that holds that checksum for that filename.
     only the txt format is handled here
     '''
     checksum_basename = Checksummer.get_checksum_basename_perfile(htype, filename)
     checksum_dfname = DumpFilename(self.wiki, None)
     # fixme check to see if this is right or what
     checksum_dfname.new_from_filename(checksum_basename)
     return self.dump_dir.filename_public_path(checksum_dfname)

Example #5

0

Show file

File: symlinks.py Project: wikimedia/operations-dumps

    def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                      checkpoint=None, onlyparts=False):
        """
        Remove symlinks from the 'latest' directory for (some) links that point to
        files from other runs than the current one (of 'date_string').
        If dump_name, part_num, checkpoint are False or None, we remove all the old symlinks
        for all values of the arg in the filename.
        example: if partnum is False or None then we remove all old values for all file parts

        This needs to do more work if there are file parts or checkpoint files linked in here from
        earlier dates. checkpoint ranges change, and configuration of parallel jobs for file parts
        changes too, so maybe old files still exist and the links need to be removed because we
        have newer files for the same phase of the dump. So we keep symlinks to files from
        one older run only, and clean up the rest. We do this because here at WMF we do partial
        and full runs alternating, and we like to keep the links to files from the full runs around
        until a new full run is in place. Really the number of keeps should be configurable
        (FIXME later I guess).

        The "one older run" is the most recent run date strictly before date_string
        among the link targets. Links to that run are removed only if they also
        match partnum/onlyparts and checkpoint; links to any earlier run are removed
        as long as they pass the dump_name filter.

        Does nothing unless SymLinks.NAME is among the enabled features.
        """
        if SymLinks.NAME not in self._enabled:
            return

        latest_dir = self.dump_dir.latest_dir()
        files_for_cleanup = []
        for filename in os.listdir(latest_dir):
            link = os.path.join(latest_dir, filename)
            if not os.path.islink(link):
                continue
            realfilepath = os.readlink(link)
            dfname = DumpFilename(self.dump_dir._wiki)
            dfname.new_from_filename(os.path.basename(realfilepath))
            files_for_cleanup.append({'link': link, 'dfname': dfname, 'path': realfilepath})

        # os.listdir() returns entries in arbitrary order and there is one link
        # per file, not per run, so the previous run cannot be found by position
        # in the list; take the latest date that is older than the current run
        prev_run_date = max((item['dfname'].date for item in files_for_cleanup
                             if item['dfname'].date < date_string), default=None)

        for item in files_for_cleanup:
            dfname = item['dfname']
            if not dfname.date < date_string:
                # link to a file from the current (or a later) run: keep it
                continue
            if dump_name and (dfname.dumpname != dump_name):
                continue
            if dfname.date == prev_run_date:
                # for the previous run we are very careful. For all older runs
                # we pretty much want to toss everything

                # fixme check that these are ok if the value is None
                if (partnum or onlyparts) and (dfname.partnum != partnum):
                    continue
                if checkpoint and (dfname.checkpoint != checkpoint):
                    continue
            self.debugfn("Removing old symlink %s -> %s" % (item['link'], item['path']))
            os.remove(item['link'])

Example #6

0

Show file

File: tablesjobs.py Project: wikimedia/operations-dumps

 def build_command(self, runner, query, out_dfname):
     """
     Return the command series that runs the given sql query, passes the
     result through gzip and saves it under the in-progress name of
     out_dfname, in the private or public tree depending on the wiki.

     raises BackupError if the configured gzip command does not exist
     """
     gzip_command = runner.wiki.config.gzip
     if not exists(gzip_command):
         raise BackupError("gzip command %s not found" % gzip_command)

     series = runner.db_server_info.build_sql_command(query, gzip_command)
     if runner.wiki.is_private():
         output_path = runner.dump_dir.filename_private_path(out_dfname)
     else:
         output_path = runner.dump_dir.filename_public_path(out_dfname)
     return runner.get_save_command_series(
         series, DumpFilename.get_inprogress_name(output_path))

Example #7

0

Show file

File: tablesjobs.py Project: wikimedia/operations-dumps

 def build_command(self, runner, output_dfname):
     """
     Return the command series that dumps this job's table through gzip and
     saves it under the in-progress name of output_dfname. The private tree
     is used when either the job or the wiki is private.
     """
     commands = runner.db_server_info.build_sqldump_command(self._table, runner.wiki.config.gzip)
     use_private_tree = self.private or runner.wiki.is_private()
     path_for = (runner.dump_dir.filename_private_path if use_private_tree
                 else runner.dump_dir.filename_public_path)
     inprogress_path = DumpFilename.get_inprogress_name(path_for(output_dfname))
     return runner.get_save_command_series(commands, inprogress_path)

Example #8

0

Show file

File: xmlcontentjobs.py Project: wikimedia/operations-dumps

    def build_filters(self, runner, input_dfname):
        """
        Build the --output option passed to dumpTextPass.php: the
        compression mode plus the in-progress path of the output file.
        args:
            Runner, DumpFilename
        raises:
            BackupError if the lbzip2 or bzip2 command that would be used
            does not exist (no check is made in the dbzip2 case)
        """
        # do we need checkpoints? ummm
        xmlbz2_path = (runner.dump_dir.filename_private_path(input_dfname)
                       if runner.wiki.is_private()
                       else runner.dump_dir.filename_public_path(input_dfname))

        # we will use lbzip2 for compression of pages-meta-history for this wiki
        # if configured
        lbzip2_wanted = ('history' in self.jobinfo['subset'] and
                         runner.wiki.config.lbzip2forhistory)
        if lbzip2_wanted:
            if not exists(self.wiki.config.lbzip2):
                raise BackupError("lbzip2 command %s not found" % self.wiki.config.lbzip2)
            bz2mode = "lbzip2"
        elif self.wiki.config.bzip2.endswith("dbzip2"):
            bz2mode = "dbzip2"
        else:
            if not exists(self.wiki.config.bzip2):
                raise BackupError("bzip2 command %s not found" % self.wiki.config.bzip2)
            bz2mode = "bzip2"

        inprogress_path = DumpFilename.get_inprogress_name(xmlbz2_path)
        return "--output=%s:%s" % (bz2mode, inprogress_path)

Example #9

0

Show file

File: xmljobs.py Project: wikimedia/operations-dumps

    def build_command(self, runner, output_dfname):
        """
        Build the xmllogs.py invocation that writes the log dump to the
        in-progress name of output_dfname.

        When output_dfname has a part number, --start (and, except for the
        last part, --end) options are added, computed from the per-part
        counts in self._parts.

        returns: a series holding one pipeline holding one command (a list)
        raises: BackupError if the configured php command does not exist
        """
        php_command = runner.wiki.config.php
        if not os.path.exists(php_command):
            raise BackupError("php command %s not found" % php_command)

        path_for = (runner.dump_dir.filename_private_path if runner.wiki.is_private()
                    else runner.dump_dir.filename_public_path)
        logging_path = path_for(output_dfname)

        config_file_arg = runner.wiki.config.files[0]
        override_section = runner.wiki.config.override_section
        if override_section:
            config_file_arg += ":" + override_section

        command = ["/usr/bin/python3", "xmllogs.py", "--config",
                   config_file_arg, "--wiki", runner.db_name,
                   "--outfile", DumpFilename.get_inprogress_name(logging_path)]

        if output_dfname.partnum:
            partnum_int = output_dfname.partnum_int
            # set up start and end item ids for this piece
            # note there is no item id 0 I guess. so we start with 1
            start = sum(int(self._parts[idx]) for idx in range(partnum_int - 1)) + 1
            command.append("--start=%s" % start)
            # if we are on the last file part, we should get up to the last log item id,
            # whatever that is, so no end option is given in that case
            if partnum_int < len(self._parts):
                end = sum(int(self._parts[idx]) for idx in range(partnum_int)) + 1
                command.append("--end=%s" % end)

        return [[command]]

Example #10

0

Show file

 def chkpt_file_from_page_range(self, page_range, partnum):
     """
     Return the (non-temp) DumpFilename of this job's checkpoint file for
     the given file part, covering page_range[0] through page_range[1].
     """
     first_page_id, last_page_id = page_range[0], page_range[1]
     page_range_string = DumpFilename.make_checkpoint_string(first_page_id, last_page_id)
     return DumpFilename(
         self.wiki, self.wiki.date, self.dumpname,
         self.get_filetype(), self.get_file_ext(),
         partnum, checkpoint=page_range_string, temp=False)

Example #11

0

Show file

File: xmljobs.py Project: wikimedia/operations-dumps

    def build_command(self, runner, novariant_dfname, output_dfnames):
        """
        Build the xmlabstracts.py invocation that writes one output file per
        entry in output_dfnames, each under its in-progress name.

        args:
            Runner, DumpFilename for output without any language variant,
            list of DumpFilename (one per variant to be produced)
        returns:
            a series holding one pipeline holding one command (a list)
        """
        config_file_arg = runner.wiki.config.files[0]
        override_section = runner.wiki.config.override_section
        if override_section:
            config_file_arg += ":" + override_section
        command = ["/usr/bin/python3", "xmlabstracts.py", "--config",
                   config_file_arg, "--wiki", self.db_name]

        # the n-th output path goes with the n-th variant option
        output_paths = []
        variants = []
        for dfname in output_dfnames:
            variant_option = self._variant_option(
                self.get_variant_from_dumpname(dfname.dumpname))
            if runner.wiki.is_private():
                final_path = runner.dump_dir.filename_private_path(dfname)
            else:
                final_path = runner.dump_dir.filename_public_path(dfname)
            output_paths.append(DumpFilename.get_inprogress_name(final_path))
            variants.append(variant_option)

        command.append("--outfiles=%s" % ",".join(output_paths))
        command.append("--variants=%s" % ",".join(variants))

        if novariant_dfname.partnum:
            partnum_int = novariant_dfname.partnum_int
            # set up start and end pageids for this piece
            # note there is no page id 0 I guess. so we start with 1
            start = sum(int(self._parts[idx]) for idx in range(partnum_int - 1)) + 1
            command.append("--start=%s" % start)
            # if we are on the last file part, we should get up to the last pageid,
            # whatever that is, so no end option is given in that case
            if partnum_int < len(self._parts):
                end = sum(int(self._parts[idx]) for idx in range(partnum_int)) + 1
                command.append("--end=%s" % end)

        return [[command]]

Example #12

0

Show file

File: xmljobs.py Project: wikimedia/operations-dumps

    def build_command(self, runner, output_dfname, history_dfname, current_dfname):
        """
        Build the xmlstubs.py invocation for whichever of the articles,
        history and current stub files are requested (not None); each is
        written under its in-progress name in this job's output directory.

        The part number of the first requested file, if any, determines the
        --start (and, except for the last part, --end) page id options.

        returns: a series holding one pipeline holding one command (a list)
        raises: BackupError if the configured php command does not exist
        """
        php_command = runner.wiki.config.php
        if not os.path.exists(php_command):
            raise BackupError("php command %s not found" % php_command)

        config_file_arg = runner.wiki.config.files[0]
        override_section = runner.wiki.config.override_section
        if override_section:
            config_file_arg += ":" + override_section
        command = ["/usr/bin/python3", "xmlstubs.py", "--config", config_file_arg,
                   "--wiki", runner.db_name]
        output_dir = self.get_output_dir(runner)

        candidates = (("--articles", output_dfname),
                      ("--history", history_dfname),
                      ("--current", current_dfname))
        requested = [(option, dfname) for option, dfname in candidates if dfname is not None]
        for option, dfname in requested:
            inprogress_path = DumpFilename.get_inprogress_name(
                os.path.join(output_dir, dfname.filename))
            command.extend([option, inprogress_path])

        partnum = requested[0][1].partnum if requested else None
        if partnum is not None:
            part_index = int(partnum)
            # set up start and end pageids for this piece
            # note there is no page id 0 I guess. so we start with 1
            start = sum(int(self._parts[idx]) for idx in range(part_index - 1)) + 1
            command.append("--start=%s" % start)
            # if we are on the last file part, we should get up to the last pageid,
            # whatever that is, so no end option is given in that case
            if part_index < len(self._parts):
                end = sum(int(self._parts[idx]) for idx in range(part_index)) + 1
                command.append("--end=%s" % end)

        return [[command]]

Example #13

0

Show file

File: specialfileinfo.py Project: wikimedia/operations-dumps

    def write_specialfilesinfo_file(self):
        """
        collect status, size and relative url for each of the special
        files (those that don't contain dump job output) of the most
        current dump of the wiki, which may still be in progress, and
        write that info out.

        files that can't be sized are recorded as 'missing'. a failure to
        write the info is reported but not raised.
        """
        if SpecialFileInfo.NAME not in self._enabled:
            return

        dump_dir = DumpDir(self.wiki, self.wiki.db_name)
        fileinfo = {}
        for filename in self.get_special_filenames():
            entry = {}
            fileinfo[filename] = entry
            path = os.path.join(self.wiki.public_dir(), self.wiki.date, filename)
            entry['status'] = 'present'
            try:
                entry['size'] = os.path.getsize(path)
            except Exception:
                entry['status'] = 'missing'
                continue

            dfname = DumpFilename(self.wiki)
            dfname.new_from_filename(os.path.basename(path))
            entry['url'] = dump_dir.web_path_relative(dfname)

        contents = {'files': fileinfo, 'version': SpecialFileInfo.VERSION}

        try:
            self.write_contents(contents)
        except Exception:
            if self.verbose:
                sys.stderr.write(repr(traceback.format_exception(*sys.exc_info())))
            message = "Couldn't write special files info. Continuing anyways"
            if not self.error_callback:
                sys.stderr.write("%s\n" % message)
            else:
                self.error_callback(message)

Example #14

0

Show file

File: jobs.py Project: wikimedia/operations-dumps

    def command_completion_callback(self, series):
        """
        once a series of commands has run successfully to completion,
        rename the output files it produced from their in-progress
        names to their permanent names, then check each renamed file
        for truncation.

        output is written to in-progress names first so that each
        command series can publish its files as soon as it is done,
        without waiting for the other parallel processes of the same
        dump step.

        args: CommandSeries for which all commands have
              completed
        """
        if not series.exited_successfully():
            return

        for submitted in self.commands_submitted:
            if submitted['series'] == series._command_series:
                if not submitted['output_files']:
                    return
                for inprog_name in submitted['output_files']:
                    if not inprog_name.endswith(DumpFilename.INPROG):
                        # not an in-progress file, nothing to rename
                        continue
                    # the permanent name is the in-progress name minus the suffix
                    permanent_name = inprog_name[:-len(DumpFilename.INPROG)]
                    final_dfname = DumpFilename(submitted['runner'].wiki)
                    final_dfname.new_from_filename(permanent_name)

                    source_path = os.path.join(submitted['output_dir'], inprog_name)
                    dest_path = os.path.join(submitted['output_dir'], final_dfname.filename)
                    try:
                        os.rename(source_path, dest_path)
                    except Exception:
                        if self.verbose:
                            sys.stderr.write(repr(
                                traceback.format_exception(*sys.exc_info())))
                        continue
                    # sanity check of file contents, move if bad
                    self.move_if_truncated(submitted['runner'], final_dfname)

Example #15

0

Show file

File: xmlcontentjobs.py Project: wikimedia/operations-dumps

 def make_dfname_from_pagerange(self, pagerange, partnum):
     """
     return the (non-temp) output DumpFilename of this job's page content
     dump type for the given file part and page range
     args: (startpage<str>, endpage<str>), string
     """
     startpage, endpage = pagerange[0], pagerange[1]
     page_range_string = DumpFilename.make_checkpoint_string(startpage, endpage)
     return DumpFilename(
         self.wiki, self.wiki.date, self.get_dumpname(),
         self.get_filetype(), self.get_file_ext(),
         partnum, checkpoint=page_range_string, temp=False)

Example #16

0

Show file

File: xmlcontentjobs.py Project: wikimedia/operations-dumps

 def get_pagerange_stub_dfname(self, wanted, runner):
     """
     build the DumpFilename of the stub file covering the page range of
     wanted['outfile']: same fields as the stub for wanted['partnum'],
     plus a checkpoint string made from that page range
     """
     whole_stub = self.get_stub_dfname(wanted['partnum'], runner)
     outfile = wanted['outfile']
     page_range_string = DumpFilename.make_checkpoint_string(
         outfile.first_page_id, outfile.last_page_id)
     return DumpFilename(
         self.wiki, whole_stub.date, whole_stub.dumpname,
         whole_stub.file_type, whole_stub.file_ext, whole_stub.partnum,
         page_range_string, temp=False)

Example #17

0

Show file

File: jobs.py Project: wikimedia/operations-dumps

    def cleanup_inprog_files(self, dump_dir, runner):
        """
        Remove leftover in-progress output files of this job.

        If a checkpoint file is set, its in-progress file is removed from
        the public tree, or else from the private tree, whichever exists
        first. Then every file returned by list_inprog_files_for_cleanup()
        is removed. On a dry run nothing is removed; what would be removed
        is printed instead.
        """
        dryrun = runner.dryrun

        if self.checkpoint_file is not None:
            # we only rerun this one, so just remove this one
            candidates = (
                DumpFilename.get_inprogress_name(
                    dump_dir.filename_public_path(self.checkpoint_file)),
                DumpFilename.get_inprogress_name(
                    dump_dir.filename_private_path(self.checkpoint_file)),
            )
            for inprog_path in candidates:
                if not os.path.exists(inprog_path):
                    continue
                if dryrun:
                    print("would remove", inprog_path)
                else:
                    os.remove(inprog_path)
                # public takes precedence; private is only tried if public is absent
                break

        dfnames = self.list_inprog_files_for_cleanup(dump_dir)
        if not dryrun:
            for dfname in dfnames:
                self.remove_output_file(dump_dir, dfname)
            return
        print("would remove ", [dfname.filename for dfname in dfnames])

Example #18

0

Show file

 def get_chkptfile_from_pageids(self):
     """
     Return the filename of the checkpoint file that corresponds to
     self.page_id_range.

     page_id_range is either "first,last" or just "first"; in the second
     case "00000" is used as the last page id. The file part is
     self._partnum_todo when that is set, otherwise None.
     """
     if ',' in self.page_id_range:
         first_page_id, last_page_id = self.page_id_range.split(',', 1)
     else:
         first_page_id = self.page_id_range
         last_page_id = "00000"  # indicates no last page id specified, go to end of stub
     checkpoint_string = DumpFilename.make_checkpoint_string(first_page_id, last_page_id)
     # fixme is that right? maybe NOT
     partnum = self._partnum_todo if self._partnum_todo else None
     # DumpFilename takes (wiki, date, dumpname, filetype, ext, partnum, checkpoint),
     # as in the other places that construct checkpoint filenames; the wiki
     # argument used to be missing here, which shifted every later argument
     fileobj = DumpFilename(self.wiki, self.wiki.date, self.get_dumpname(),
                            self.get_filetype(), self.get_file_ext(),
                            partnum, checkpoint_string)
     return fileobj.filename

Example #19

0

Show file

File: recompressjobs.py Project: wikimedia/operations-dumps

    def build_command(self, runner, output_dfnames):
        '''
        Build one pipeline per output file; each decompresses the matching
        input file with lbzip2 or bzip2 and pipes it into 7zip, which writes
        to the in-progress name of the output file.

        arguments:
        runner: Runner object
        output_dfnames: if checkpointing of files is enabled, this should be a
                        list of checkpoint files (DumpFilename), otherwise it
                        should be a list of the one file that will be produced
                        by the dump
        Note that checkpoint files get done one at a time, not in parallel

        raises: BackupError if a needed compression command does not exist
        '''
        # FIXME need shell escape
        config = self.wiki.config
        use_lbzip2 = config.lbzip2threads
        if use_lbzip2:
            if not exists(config.lbzip2):
                raise BackupError("lbzip2 command %s not found" % config.lbzip2)
        elif not exists(config.bzip2):
            raise BackupError("bzip2 command %s not found" % config.bzip2)
        if not exists(config.sevenzip):
            raise BackupError("7zip command %s not found" % config.sevenzip)

        if runner.wiki.is_private():
            path_for = runner.dump_dir.filename_private_path
        else:
            path_for = runner.dump_dir.filename_public_path

        command_series = []
        for out_dfname in output_dfnames:
            # the input is the same file as the output except for the extension
            input_dfname = DumpFilename(self.wiki, None, out_dfname.dumpname, out_dfname.file_type,
                                        self.item_for_recompression.file_ext, out_dfname.partnum,
                                        out_dfname.checkpoint)
            outfilepath = path_for(out_dfname)
            infilepath = path_for(input_dfname)

            if use_lbzip2:
                # one thread only, as these already run in parallel
                decompr_command = "{lbzip2} -dc -n 1 {infile}".format(
                    lbzip2=config.lbzip2, infile=infilepath)
            else:
                decompr_command = "{bzip2} -dc {infile}".format(
                    bzip2=config.bzip2, infile=infilepath)

            shell_command = "{decompr} | {sevenzip} a -mx=4 -si {ofile}".format(
                decompr=decompr_command, sevenzip=config.sevenzip,
                ofile=DumpFilename.get_inprogress_name(outfilepath))
            command_series.append([[shell_command]])
        return command_series

Example #20

0

Show file

File: flowjob.py Project: wikimedia/operations-dumps

    def build_command(self, runner, output_dfname):
        """
        Build the php invocation of the Flow dumpBackup.php maintenance
        script, writing bzip2 output to the in-progress name of output_dfname.
        "--full" is added when this job dumps history.

        returns: a series holding one pipeline holding one command (a list)
        raises: BackupError if the configured php command does not exist
        """
        php_command = runner.wiki.config.php
        if not os.path.exists(php_command):
            raise BackupError("php command %s not found" % php_command)

        path_for = (runner.dump_dir.filename_private_path if runner.wiki.is_private()
                    else runner.dump_dir.filename_public_path)
        flow_output_fpath = path_for(output_dfname)
        script_command = MultiVersion.mw_script_as_array(
            runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")

        command = [runner.wiki.config.php] + list(script_command)
        command += ["--wiki=%s" % runner.db_name,
                    "--current", "--report=1000",
                    "--output=bzip2:%s" % DumpFilename.get_inprogress_name(flow_output_fpath)]
        if self.history:
            command.append("--full")
        return [[command]]

Example #21

0

Show file

    def run(self, runner):
        """
        Work out which output files still need to be produced, prepare a stub
        file for each of them, and run one command series per stub.

        The list of files to do ('todo') is determined as follows:
        - if a page id range was given, it is converted to a checkpoint file
          and handled like an explicitly requested checkpoint file
        - if a checkpoint file is set, only that one is done
        - otherwise, with checkpoints enabled, the page ranges of the stubs
          are compared against the ranges of existing checkpoint files and
          only the missing parts/ranges are done; with checkpoints disabled,
          every regular output file not yet present in the public tree is done

        Returns without running anything when there are no stubs to process.

        raises: BackupError if running the commands reports an error
        """
        # here we will either clean up or not depending on how we were called FIXME
        self.cleanup_old_files(runner.dump_dir, runner)
        commands = []

        todo = []

        if self.page_id_range is not None:
            # convert to checkpoint filename, handle the same way
            self.checkpoint_file = self.get_chkptfile_from_pageids()

        if self.checkpoint_file:
            todo = [self.checkpoint_file]
        else:
            # list all the output files that would be produced w/o
            # checkpoint files on
            outfiles = self.get_reg_files_for_filepart_possible(
                runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
            if self._checkpoints_enabled:

                # get the stub list that would be used for the current run
                stubs = self.get_stub_files(runner)
                stubs = sorted(stubs, key=lambda thing: thing.filename)

                # get the page ranges covered by stubs
                # each entry is (first page id, last page id, partnum)
                stub_ranges = []
                for stub in stubs:
                    fname = DumpFile(self.wiki,
                                     runner.dump_dir.filename_public_path(stub, stub.date),
                                     stub, self.verbose)
                    stub_ranges.append((fname.find_first_page_id_in_file(),
                                        self.find_last_page_id(stub, runner), stub.partnum))

                # get list of existing checkpoint files
                chkpt_files = self.list_checkpt_files(
                    runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
                chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)
                # get the page ranges covered by existing checkpoint files
                # same (first, last, partnum) layout as stub_ranges
                checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id,
                                      chkptfile.partnum)
                                     for chkptfile in chkpt_files]
                if self.verbose:
                    print "checkpoint_ranges is", checkpoint_ranges
                    print "stub_ranges is", stub_ranges

                if not checkpoint_ranges:
                    # no page ranges covered by checkpoints. do all output files
                    # the usual way
                    todo = outfiles
                else:
                    todo = []
                    parts = self.get_fileparts_list()
                    for partnum in parts:
                        if not [int(chkpt_range[2]) for chkpt_range in checkpoint_ranges
                                if int(chkpt_range[2]) == int(partnum)]:
                            # no page ranges covered by checkpoints for a particular
                            # file part (subjob) so do that output file the
                            # regular way
                            todo.extend([outfile for outfile in outfiles
                                         if int(outfile.partnum) == int(partnum)])

                    # add a checkpoint file for each range that
                    # find_missing_ranges() reports as missing
                    missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                    todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                                 for (first, last, partnum) in missing])

            else:
                # do the missing files only
                # FIXME public or private depending on the wiki!
                todo = [outfile for outfile in outfiles
                        if not os.path.exists(runner.dump_dir.filename_public_path(outfile))]

        # stubs that will actually be fed to build_command()
        partial_stubs = []
        if self.verbose:
            print "todo is", [to.filename for to in todo]
        for fileobj in todo:

            stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

            if fileobj.first_page_id is None:
                # no page range on this output file: use the stub as is
                partial_stubs.append(stub_for_file)
            else:
                # output file covers a page range: write a temp stub holding
                # just that range and use it unless it has no entries
                stub_output_file = DumpFilename(
                    self.wiki, fileobj.date, fileobj.dumpname,
                    self.item_for_stubs.get_filetype(),
                    self.item_for_stubs.get_file_ext(),
                    fileobj.partnum,
                    DumpFilename.make_checkpoint_string(
                        fileobj.first_page_id, fileobj.last_page_id), temp=True)

                self.write_partial_stub(stub_for_file, stub_output_file, runner)
                if not self.has_no_entries(stub_output_file, runner):
                    partial_stubs.append(stub_output_file)

        if self.verbose:
            print "partial_stubs is", [ps.filename for ps in partial_stubs]
        if partial_stubs:
            stub_files = partial_stubs
        else:
            # nothing to do
            return

        # one command series per stub; all of them are handed to the runner together
        for stub_file in stub_files:
            series = self.build_command(runner, stub_file)
            commands.append(series)

        error = runner.run_command(commands, callback_stderr=self.progress_callback,
                                   callback_stderr_arg=runner)
        if error:
            raise BackupError("error producing xml file(s) %s" % self.dumpname)

Example #22

0

Show file

    def __init__(self, wiki, prefetch=True, prefetchdate=None, spawn=True,
                 job=None, skip_jobs=None,
                 restart=False, notice="", dryrun=False, enabled=None,
                 partnum_todo=None, checkpoint_file=None, page_id_range=None,
                 skipdone=False, cleanup=False, do_prereqs=False, verbose=False):
        """Store the run options and build the helper objects for a dump run.

        Args:
            wiki: wiki object for this run; its db_name, config, date and
                private_dir() are read here
            prefetch, prefetchdate, spawn: stored and handed on to DumpItemList
            job: name of the one job requested, or None; "latestlinks",
                "createdirs" and "noop" each switch off some settings, and
                any job name switches off failure emails
            skip_jobs: handed on to DumpItemList; None is treated as []
            restart: stored as self.restart
            notice: handed on to DumpRunJobData
            dryrun: switches off the status/bookkeeping settings as well as
                "check_trunc_files" and "logging"
            enabled: dict of setting name -> True, or None for a new dict.
                The standard settings are always switched on in it first and
                then removed again depending on the other options. A dict
                passed in by the caller is modified in place. "logging" is
                not among the standard settings, so the log is only set up
                if the caller's dict contains it.
            partnum_todo: file part number to do, or None
            checkpoint_file: filename of a checkpoint file to redo, or None;
                replaced on self by a DumpFilename parsed from it
            page_id_range: handed on to DumpItemList
            skipdone, do_prereqs, verbose: stored on self
            cleanup: when false, "cleanup_old_files" is switched off

        Raises:
            BackupError: partnum_todo and the part number found in the
                checkpoint filename are both set and differ
        """
        self.wiki = wiki
        self.db_name = wiki.db_name
        self.prefetch = prefetch
        self.prefetchdate = prefetchdate
        self.spawn = spawn
        self.filepart_info = FilePartInfo(wiki, self.db_name, self.log_and_print)
        self.restart = restart
        self.html_notice_file = None
        self.log = None
        self.dryrun = dryrun
        self._partnum_todo = partnum_todo
        self.checkpoint_file = checkpoint_file
        self.page_id_range = page_id_range
        self.skipdone = skipdone
        self.verbose = verbose
        self.enabled = enabled
        self.cleanup_old_files = cleanup
        self.do_prereqs = do_prereqs

        if self.checkpoint_file is not None:
            fname = DumpFilename(self.wiki)
            fname.new_from_filename(checkpoint_file)
            # we should get file partnum if any
            if self._partnum_todo is None and fname.partnum_int:
                self._partnum_todo = fname.partnum_int
            elif (self._partnum_todo is not None and fname.partnum_int and
                  self._partnum_todo != fname.partnum_int):
                # self.checkpoint_file is still the plain filename here; it
                # must be interpolated with %, not passed as a second
                # exception argument
                raise BackupError("specifed partnum to do does not match part number "
                                  "of checkpoint file %s to redo" % self.checkpoint_file)
            self.checkpoint_file = fname

        if self.enabled is None:
            self.enabled = {}
        for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                        RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                        Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps",
                        "cleanup_old_files", "check_trunc_files"]:
            self.enabled[setting] = True

        def disable(settings):
            """Remove each of the named settings from self.enabled if present."""
            for name in settings:
                self.enabled.pop(name, None)

        if not self.cleanup_old_files:
            disable(["cleanup_old_files"])

        # dry runs and runs limited to one file part or one checkpoint file
        # get none of the status/bookkeeping settings
        if self.dryrun or self._partnum_todo is not None or self.checkpoint_file is not None:
            disable([StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                     RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                     Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps"])

        if self.dryrun:
            disable(["check_trunc_files", "logging"])

        self.job_requested = job

        if self.job_requested == "latestlinks":
            disable([StatusHtml.NAME, IndexHtml.NAME, RunInfoFile.NAME])

        if self.job_requested == "createdirs":
            disable([SymLinks.NAME, Feeds.NAME, RunSettings.NAME])

        if self.job_requested in ("latestlinks", "createdirs"):
            disable([Checksummer.NAME, NoticeFile.NAME, "makedir",
                     "clean_old_dumps", "check_trunc_files"])

        if self.job_requested == "noop":
            disable(["clean_old_dumps", "check_trunc_files"])

        self.skip_jobs = skip_jobs
        if skip_jobs is None:
            self.skip_jobs = []

        self.db_server_info = DbServerInfo(self.wiki, self.db_name, self.log_and_print)
        self.dump_dir = DumpDir(self.wiki, self.db_name)

        # these must come after the dumpdir setup so we know which directory we are in
        if "logging" in self.enabled and "makedir" in self.enabled:
            file_obj = DumpFilename(self.wiki)
            file_obj.new_from_filename(self.wiki.config.log_file)
            self.log_filename = self.dump_dir.filename_private_path(file_obj)
            self.make_dir(os.path.join(self.wiki.private_dir(), self.wiki.date))
            self.log = Logger(self.log_filename)
            # thread should die horribly when main script dies. no exceptions.
            self.log.daemon = True
            self.log.start()

        self.dumpjobdata = DumpRunJobData(self.wiki, self.dump_dir, notice,
                                          self.log_and_print, self.debug, self.enabled,
                                          self.verbose)

        # some or all of these dump_items will be marked to run
        self.dump_item_list = DumpItemList(self.wiki, self.prefetch, self.prefetchdate,
                                           self.spawn,
                                           self._partnum_todo, self.checkpoint_file,
                                           self.job_requested, self.skip_jobs,
                                           self.filepart_info, self.page_id_range,
                                           self.dumpjobdata, self.dump_dir, self.verbose)
        # only send email failure notices for full runs
        email = not self.job_requested
        self.failurehandler = FailureHandler(self.wiki, email)
        self.statushtml = StatusHtml(self.wiki, self.dump_dir,
                                     self.dump_item_list.dump_items,
                                     self.dumpjobdata, self.enabled,
                                     self.failurehandler,
                                     self.log_and_print, self.verbose)
        self.indexhtml = IndexHtml(self.wiki, self.dump_dir,
                                   self.dump_item_list.dump_items,
                                   self.dumpjobdata, self.enabled,
                                   self.failurehandler,
                                   self.log_and_print, self.verbose)

Example #23

0

Show file

File: xmlcontentjobs.py Project: wikimedia/operations-dumps

    def run(self, runner):
        """Produce the page content output file(s) for this job.

        Works out which output files are still to be written: the requested
        page range, the requested checkpoint file, the page ranges not yet
        covered when checkpoints are enabled, or otherwise the regular
        output files that do not exist yet. It then has the stubber write
        the page range stubs that are needed, builds one command series per
        output file and runs the commands in batches. Commands that fail are
        collected and rerun, after a wait of config.retry_wait, until
        config.max_retries rounds have been used up.

        Args:
            runner: the runner for this dump run; its dump_dir and wiki are
                read, and run_command() and log_and_print() are called

        Raises:
            BackupError: some commands still failed in the last round
        """
        # here we will either clean up or not depending on how we were called
        # FIXME callers should set this appropriately and they don't right now
        self.cleanup_old_files(runner.dump_dir, runner)

        # in cases where we have request of specific file, do it as asked,
        # no splitting it up into smaller pieces
        do_bitesize = False

        # clean up all tmp output files from previous attempts of this job
        # for this dump wiki and date, otherwise we'll wind up indexing
        # them and hashsumming them etc.
        # they may have been left around from an interrupted or failed earlier
        # run
        self.cleanup_tmp_files(runner.dump_dir, runner)

        commands = []

        dfnames_todo = []
        if self.jobinfo['pageid_range'] is not None:
            # convert to checkpoint filename, handle the same way
            dfnames_todo = [self.get_pagerange_output_dfname()]
        elif self.checkpoint_file:
            dfnames_todo = [self.checkpoint_file]
        elif self._checkpoints_enabled:
            do_bitesize = True
            stub_pageranges = self.get_ranges_covered_by_stubs(runner)
            stub_pageranges = sorted(stub_pageranges, key=lambda x: int(x[0]))
            dfnames_todo = self.get_dfnames_for_missing_pranges(runner, stub_pageranges)
            # replace stub ranges for output files that cover smaller
            # ranges, with just those numbers
            new_stub_ranges = []
            for dfname in dfnames_todo:
                if dfname.is_checkpoint_file:
                    new_stub_ranges.append((dfname.first_page_id,
                                            dfname.last_page_id, dfname.partnum))
                else:
                    for srange in stub_pageranges:
                        if srange[2] == dfname.partnum:
                            new_stub_ranges.append(srange)
            stub_pageranges = new_stub_ranges
        else:
            output_dfnames = self.get_reg_files_for_filepart_possible(
                runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
            # just do those output files that do not exist yet, looking in
            # the private or the public directory depending on the wiki
            if runner.wiki.is_private():
                path_for = runner.dump_dir.filename_private_path
            else:
                path_for = runner.dump_dir.filename_public_path
            dfnames_todo = [dfname for dfname in output_dfnames
                            if not os.path.exists(path_for(dfname))]
        if self._checkpoints_enabled and do_bitesize:
            dfnames_todo = self.make_bitesize_jobs(dfnames_todo, stub_pageranges)

        # must be defined even when prefetch is off, because it is passed to
        # setup_wanted for every output file below
        # NOTE(review): confirm that setup_wanted accepts prefetcher=None
        prefetcher = None
        if self.jobinfo['prefetch']:
            if runner.wiki.config.sevenzip_prefetch:
                file_exts = ['7z', self.file_ext]
            else:
                file_exts = [self.file_ext]
            prefetcher = PrefetchFinder(
                self.wiki,
                {'name': self.name(), 'desc': self.jobinfo['desc'],
                 'dumpname': self.get_dumpname(),
                 'ftype': self.file_type, 'fexts': file_exts,
                 'subset': self.jobinfo['subset']},
                {'date': self.jobinfo['prefetchdate'], 'parts': self._parts},
                self.verbose)

        wanted = [self.setup_wanted(dfname, runner, prefetcher) for dfname in dfnames_todo]

        to_generate = [(entry['stub_input'], entry['stub'])
                       for entry in wanted if entry['generate']]
        # NOTE(review): this is 0 when there is exactly one part; confirm
        # that write_pagerange_stubs copes with a batch size of 0
        if self._parts:
            batchsize = int(len(self._parts) / 2)
        else:
            batchsize = 1
        self.stubber.write_pagerange_stubs(to_generate, runner, batchsize, self.move_if_truncated)

        for entry in wanted:
            if entry['generate']:
                if self.stubber.has_no_pages(entry['stub'], runner, tempdir=True):
                    # this page range has no pages in it (all deleted?) so we need not
                    # keep info on how to generate it
                    continue
            output_dfname = DumpFilename(self.wiki, entry['stub'].date, self.get_dumpname(),
                                         self.get_filetype(), self.file_ext, entry['stub'].partnum,
                                         DumpFilename.make_checkpoint_string(
                                             entry['stub'].first_page_id,
                                             entry['stub'].last_page_id),
                                         False)
            entry['command'] = self.build_command(runner, entry['stub'],
                                                  entry['prefetch'], output_dfname)
            self.setup_command_info(runner, entry['command'], [output_dfname])
            commands.append(entry['command'])

        # don't do them all at once, do only up to _parts commands at the same time
        if self._parts:
            batchsize = len(self._parts)
        else:
            batchsize = 1
        errors = False
        failed_commands = []
        max_retries = self.wiki.config.max_retries
        retries = 0
        # the "retries == 0" clause makes sure the commands get run once even
        # if max_retries is 0
        while commands and (retries < max_retries or retries == 0):
            command_batch = commands[:batchsize]
            error, broken = runner.run_command(
                command_batch, callback_stderr=self.progress_callback,
                callback_stderr_arg=runner,
                callback_on_completion=self.command_completion_callback)
            if error:
                for series in broken:
                    for pipeline in series:
                        runner.log_and_print("error from commands: %s" % " ".join(pipeline))
                # broken is walked above as a list of series; extend keeps the
                # retry queue a flat list of series, the same shape as commands
                failed_commands.extend(broken)
                errors = True
            commands = commands[batchsize:]
            if not commands and failed_commands:
                retries += 1
                if retries < max_retries:
                    # retry failed commands
                    commands = failed_commands
                    failed_commands = []
                    # no instant retries, give the servers a break
                    time.sleep(self.wiki.config.retry_wait)
                    errors = False
        if errors:
            raise BackupError("error producing xml file(s) %s" % self.get_dumpname())

Example #24

0

Show file

    def build_command(self, runner, stub_file):
        """Put together the dumpTextPass.php command for one stub file.

        The result is a command series: a list with one pipeline, which is
        a list with one command, which is the list of arguments followed by
        the values returned by build_filters() and build_eta().

        Raises BackupError if the configured php command does not exist.
        """
        # with checkpoints on, output goes to a temp file that gets
        # checkpointed every so often
        write_to_temp = bool(self._checkpoints_enabled)
        page_range = DumpFilename.make_checkpoint_string(stub_file.first_page_id,
                                                         stub_file.last_page_id)
        out_dfname = DumpFilename(self.wiki, stub_file.date, self.dumpname,
                                  self.get_filetype(), self.file_ext, stub_file.partnum,
                                  page_range, write_to_temp)

        # a partial stub file in the temp dir wins over the regular stub file
        partial_stub_path = os.path.join(self.wiki.config.temp_dir, stub_file.filename)
        if not os.path.exists(partial_stub_path):
            stub_source = runner.dump_dir.filename_public_path(stub_file)
        else:
            stub_source = partial_stub_path
        stub_option = "--stub=gzip:%s" % stub_source

        # text that has not changed can be pulled from files of a previous
        # run; collect the ones that match this stub's page range and exist
        prefetch_paths = []
        if self._prefetch:
            candidates = self._find_previous_dump(runner, out_dfname.partnum)
            for candidate in candidates or []:
                if self.chkptfile_in_pagerange(stub_file, candidate):
                    candidate_path = runner.dump_dir.filename_public_path(
                        candidate, candidate.date)
                    if exists(candidate_path):
                        prefetch_paths.append(candidate_path)

        partnum_str = ("%s" % stub_file.partnum) if out_dfname.partnum else ""

        if prefetch_paths:
            source = "bzip2:%s" % ";".join(prefetch_paths)
            runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..." %
                                     (self._subset, partnum_str, source))
            prefetch = "--prefetch=%s" % source
        else:
            runner.show_runner_state("... building %s %s XML dump, no text prefetch..." %
                                     (self._subset, partnum_str))
            prefetch = ""

        spawn = ("--spawn=%s" % self.wiki.config.php) if self._spawn else ""

        if not exists(self.wiki.config.php):
            raise BackupError("php command %s not found" % self.wiki.config.php)

        checkpoint_time = ""
        checkpoint_file = ""
        if self._checkpoints_enabled:
            checkpoint_time = "--maxtime=%s" % self.wiki.config.checkpoint_time
            # the page range part of the name is left as the pattern "p%sp%s"
            checkpoint_file = "--checkpointfile=%s" % out_dfname.new_filename(
                out_dfname.dumpname, out_dfname.file_type, out_dfname.file_ext,
                out_dfname.date, out_dfname.partnum, "p%sp%s", None)

        script_args = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
        raw_args = [self.wiki.config.php]
        raw_args.extend(script_args)
        raw_args.extend(["--wiki=%s" % runner.db_name,
                         stub_option,
                         prefetch,
                         checkpoint_time,
                         checkpoint_file,
                         "--report=1000",
                         spawn])

        # only None entries are dropped; options left as "" stay in the list
        command = [arg for arg in raw_args if arg is not None]
        filters = self.build_filters(runner, out_dfname)
        eta = self.build_eta(runner)
        command.append(filters)
        command.append(eta)
        return [[command]]