def run(self, runner):
    """
    Dump the requested siteinfo properties into a single output file.

    The command series writes to the "in progress" name of the output
    file (in the private or public dump directory, depending on the wiki).
    If the command fails it is retried up to runner.wiki.config.max_retries
    times, with a five second pause before each retry.

    args:
        runner: Runner object

    raises:
        BackupError if more than one output file would be produced, or if
        the command still fails after all retries
    """
    retries = 0
    maxretries = runner.wiki.config.max_retries
    dfnames = self.list_outfiles_for_build_command(runner.dump_dir)
    if len(dfnames) > 1:
        raise BackupError("siteinfo dump %s trying to produce more than one file" %
                          self.dumpname)

    output_dfname = dfnames[0]
    commands = self.build_command(runner)
    if runner.wiki.is_private():
        command_series = runner.get_save_command_series(
            commands, DumpFilename.get_inprogress_name(
                runner.dump_dir.filename_private_path(output_dfname)))
    else:
        command_series = runner.get_save_command_series(
            commands, DumpFilename.get_inprogress_name(
                runner.dump_dir.filename_public_path(output_dfname)))
    self.setup_command_info(runner, command_series, [output_dfname])

    error, _broken = runner.save_command(command_series, self.command_completion_callback)
    while error and retries < maxretries:
        retries = retries + 1
        # no instant retries
        time.sleep(5)
        # pass the completion callback on retries too; the first attempt
        # does, and without it a successful retry would not trigger the
        # same post-completion handling as a successful first attempt
        error, _broken = runner.save_command(command_series,
                                             self.command_completion_callback)
    if error:
        raise BackupError("error dumping siteinfo props %s" % ','.join(self._properties))
def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                  checkpoint=None, onlyparts=False):
    """
    Remove symlinks in the 'latest' directory that point to files from
    runs dated earlier than date_string.

    Links are filtered by dump_name, partnum (or onlyparts) and checkpoint
    when those are supplied; a link that does not match a supplied filter
    is left in place. Does nothing unless the symlinks feature is enabled.
    """
    # fixme
    # this needs to do more work if there are file parts or checkpoint files linked in here from
    # earlier dates. checkpoint ranges change, and configuration of parallel jobs for file parts
    # changes too, so maybe old files still exist and the links need to be removed because we
    # have newer files for the same phase of the dump.
    if SymLinks.NAME not in self._enabled:
        return

    latest_dir = self.dump_dir.latest_dir()
    for entry in os.listdir(latest_dir):
        link_path = os.path.join(latest_dir, entry)
        if not os.path.islink(link_path):
            continue

        target = os.readlink(link_path)
        target_dfname = DumpFilename(self.dump_dir._wiki)
        target_dfname.new_from_filename(os.path.basename(target))
        if not target_dfname.date < date_string:
            # link points into the current (or a later) run, keep it
            continue

        # fixme check that these are ok if the value is None
        if ((dump_name and (target_dfname.dumpname != dump_name)) or
                ((partnum or onlyparts) and (target_dfname.partnum != partnum)) or
                (checkpoint and (target_dfname.checkpoint != checkpoint))):
            continue

        self.debugfn("Removing old symlink %s -> %s" % (link_path, target))
        os.remove(link_path)
def build_command(self, runner, output_dfname):
    """
    Build the shell pipeline that decompresses the bz2 input file and feeds
    it to the recompression tool, producing the multistream file and its
    index under their "in progress" names.

    arguments:
        runner: Runner object
        output_dfname: output file that will be produced

    returns:
        a list containing one command pipe (one series)
    """
    input_dfname = DumpFilename(self.wiki, None, output_dfname.dumpname,
                                output_dfname.file_type,
                                self.item_for_recompression.file_ext,
                                output_dfname.partnum, output_dfname.checkpoint)

    # all three files live in the same tree, private or public
    if runner.wiki.is_private():
        path_for = runner.dump_dir.filename_private_path
    else:
        path_for = runner.dump_dir.filename_public_path

    outfilepath = path_for(self.get_multistream_dfname(output_dfname))
    outfilepath_index = path_for(self.get_multistream_index_dfname(output_dfname))
    infilepath = path_for(input_dfname)

    shell_command = "%s -dc %s | %s --pagesperstream 100 --buildindex %s -o %s" % (
        self.wiki.config.bzip2, infilepath, self.wiki.config.recompressxml,
        DumpFilename.get_inprogress_name(outfilepath_index),
        DumpFilename.get_inprogress_name(outfilepath))
    return [[[shell_command]]]
def get_per_file_path(self, htype, filename):
    """
    Return the public-directory path of the per-file checksum file
    of type htype for the given filename.

    this is only in txt format
    """
    # fixme check to see if this is right or what
    checksum_dfname = DumpFilename(self.wiki, None)
    checksum_basename = Checksummer.get_checksum_basename_perfile(htype, filename)
    checksum_dfname.new_from_filename(checksum_basename)
    return self.dump_dir.filename_public_path(checksum_dfname)
def remove_symlinks_from_old_runs(self, date_string, dump_name=None, partnum=None,
                                  checkpoint=None, onlyparts=False):
    """
    Remove symlinks from the 'latest' directory for (some) links that point
    to files from other runs than the current one (of 'date_string').

    If dump_name, partnum, checkpoint are False or None, we remove all the
    old symlinks for all values of the arg in the filename.
    example: if partnum is False or None then we remove all old values for
    all file parts

    This needs to do more work if there are file parts or checkpoint files
    linked in here from earlier dates. Checkpoint ranges change, and
    configuration of parallel jobs for file parts changes too, so maybe old
    files still exist and the links need to be removed because we have newer
    files for the same phase of the dump.

    So we keep symlinks to files from one older run only, and clean up the
    rest. We do this because here at WMF we do partial and full runs
    alternating, and we like to keep the links to files from the full runs
    around until a new full run is in place. Really the number of keeps
    should be configurable (FIXME later I guess).

    The "one older run" is the most recent link-target date that is strictly
    older than date_string. It is computed from the set of dates seen, so
    it does not depend on the order in which os.listdir returns entries or
    on how many links share a date.
    """
    if SymLinks.NAME not in self._enabled:
        return

    latest_dir = self.dump_dir.latest_dir()
    files_for_cleanup = []
    for filename in os.listdir(latest_dir):
        link = os.path.join(latest_dir, filename)
        if not os.path.islink(link):
            continue
        realfilepath = os.readlink(link)
        dfname = DumpFilename(self.dump_dir._wiki)
        dfname.new_from_filename(os.path.basename(realfilepath))
        files_for_cleanup.append({'link': link, 'dfname': dfname, 'path': realfilepath})

    # the previous run is the newest date that is older than the current run
    older_dates = sorted(set(
        item['dfname'].date for item in files_for_cleanup
        if item['dfname'].date is not None and item['dfname'].date < date_string))
    prev_run_date = older_dates[-1] if older_dates else None

    for item in files_for_cleanup:
        dfname = item['dfname']
        if dfname.date < date_string:
            if dump_name and (dfname.dumpname != dump_name):
                continue
            if prev_run_date is None or dfname.date == prev_run_date:
                # for the previous run, or the only existing run, if different
                # from the current one, we are very careful. For all older runs
                # we pretty much want to toss everything

                # fixme check that these are ok if the value is None
                if (partnum or onlyparts) and (dfname.partnum != partnum):
                    continue
                if checkpoint and (dfname.checkpoint != checkpoint):
                    continue
            self.debugfn("Removing old symlink %s -> %s" % (item['link'], item['path']))
            os.remove(item['link'])
def build_command(self, runner, query, out_dfname):
    """
    Build the command series that runs the sql query and saves the
    gzipped output to the "in progress" name of out_dfname, in the private
    or public dump directory depending on the wiki.

    raises:
        BackupError if the configured gzip command does not exist
    """
    gzip = runner.wiki.config.gzip
    if not exists(gzip):
        raise BackupError("gzip command %s not found" % gzip)

    series = runner.db_server_info.build_sql_command(query, gzip)
    if runner.wiki.is_private():
        outpath = runner.dump_dir.filename_private_path(out_dfname)
    else:
        outpath = runner.dump_dir.filename_public_path(out_dfname)
    return runner.get_save_command_series(
        series, DumpFilename.get_inprogress_name(outpath))
def build_command(self, runner, output_dfname):
    """
    Build the command series that dumps this job's table through gzip and
    saves it to the "in progress" name of output_dfname.

    The output goes to the private dump directory if either this job or
    the wiki is private, otherwise to the public one.
    """
    commands = runner.db_server_info.build_sqldump_command(
        self._table, runner.wiki.config.gzip)

    if self.private or runner.wiki.is_private():
        outpath = runner.dump_dir.filename_private_path(output_dfname)
    else:
        outpath = runner.dump_dir.filename_public_path(output_dfname)

    return runner.get_save_command_series(
        commands, DumpFilename.get_inprogress_name(outpath))
def build_filters(self, runner, input_dfname):
    """
    Construct the output filter option for dumpTextPass.php

    The compressor is lbzip2 for 'history' subsets when configured,
    dbzip2 when the configured bzip2 command name ends with "dbzip2",
    and bzip2 otherwise.

    args:
        Runner, DumpFilename
    returns:
        the "--output=..." option string, pointing at the "in progress"
        name of the output file
    raises:
        BackupError if the lbzip2 or bzip2 command to be used is not found
    """
    # do we need checkpoints? ummm
    if runner.wiki.is_private():
        xmlbz2_path = runner.dump_dir.filename_private_path(input_dfname)
    else:
        xmlbz2_path = runner.dump_dir.filename_public_path(input_dfname)

    use_lbzip2 = ('history' in self.jobinfo['subset'] and
                  runner.wiki.config.lbzip2forhistory)
    if use_lbzip2:
        # we will use lbzip2 for compression of pages-meta-history for this wiki
        # if configured
        if not exists(self.wiki.config.lbzip2):
            raise BackupError("lbzip2 command %s not found" % self.wiki.config.lbzip2)
        bz2mode = "lbzip2"
    elif self.wiki.config.bzip2.endswith("dbzip2"):
        bz2mode = "dbzip2"
    else:
        if not exists(self.wiki.config.bzip2):
            raise BackupError("bzip2 command %s not found" % self.wiki.config.bzip2)
        bz2mode = "bzip2"

    inprogress_path = DumpFilename.get_inprogress_name(xmlbz2_path)
    return "--output=%s:%s" % (bz2mode, inprogress_path)
def build_command(self, runner, output_dfname):
    """
    Build the command series that runs xmllogs.py for this wiki, writing
    to the "in progress" name of output_dfname.

    When output_dfname has a part number, --start (and, for all but the
    last part, --end) options are added, computed from the per-part
    counts in self._parts.

    raises:
        BackupError if the configured php command does not exist
    """
    php = runner.wiki.config.php
    if not os.path.exists(php):
        raise BackupError("php command %s not found" % php)

    if runner.wiki.is_private():
        logging_path = runner.dump_dir.filename_private_path(output_dfname)
    else:
        logging_path = runner.dump_dir.filename_public_path(output_dfname)

    config_file_arg = runner.wiki.config.files[0]
    override_section = runner.wiki.config.override_section
    if override_section:
        config_file_arg = config_file_arg + ":" + override_section

    command = ["/usr/bin/python3", "xmllogs.py", "--config", config_file_arg,
               "--wiki", runner.db_name,
               "--outfile", DumpFilename.get_inprogress_name(logging_path)]

    if output_dfname.partnum:
        part = output_dfname.partnum_int
        # set up start end end pageids for this piece
        # note there is no item id 0 I guess. so we start with 1
        start = 1 + sum(int(self._parts[i]) for i in range(0, part - 1))
        command.append("--start=%s" % start)
        # if we are on the last file part, we should get up to the last log item id,
        # whatever that is.
        if part < len(self._parts):
            end = 1 + sum(int(self._parts[i]) for i in range(0, part))
            command.append("--end=%s" % end)

    # one series holding one pipeline holding one command
    return [[command]]
def chkpt_file_from_page_range(self, page_range, partnum):
    """
    Return a (non-temp) DumpFilename for this job's checkpoint file
    covering the given page range and file part.

    args:
        page_range: (first page id, last page id)
        partnum: file part number
    """
    first_page, last_page = page_range[0], page_range[1]
    checkpoint_string = DumpFilename.make_checkpoint_string(first_page, last_page)
    return DumpFilename(self.wiki, self.wiki.date, self.dumpname,
                        self.get_filetype(), self.get_file_ext(),
                        partnum, checkpoint=checkpoint_string, temp=False)
def build_command(self, runner, novariant_dfname, output_dfnames):
    """
    Build the command series that runs xmlabstracts.py, producing one
    output file per language variant.

    args:
        Runner,
        DumpFilename for output without any language variant,
        list of DumpFilename, one per variant output file
    """
    config_file_arg = runner.wiki.config.files[0]
    override_section = runner.wiki.config.override_section
    if override_section:
        config_file_arg = config_file_arg + ":" + override_section

    command = ["/usr/bin/python3", "xmlabstracts.py", "--config",
               config_file_arg, "--wiki", self.db_name]

    output_paths = []
    variants = []
    for dfname in output_dfnames:
        variant = self.get_variant_from_dumpname(dfname.dumpname)
        variant_option = self._variant_option(variant)
        if runner.wiki.is_private():
            path = runner.dump_dir.filename_private_path(dfname)
        else:
            path = runner.dump_dir.filename_public_path(dfname)
        output_paths.append(DumpFilename.get_inprogress_name(path))
        variants.append(variant_option)

    command.append("--outfiles=%s" % ",".join(output_paths))
    command.append("--variants=%s" % ",".join(variants))

    if novariant_dfname.partnum:
        part = novariant_dfname.partnum_int
        # set up start end end pageids for this piece
        # note there is no page id 0 I guess. so we start with 1
        start = 1 + sum(int(self._parts[i]) for i in range(0, part - 1))
        command.append("--start=%s" % start)
        # if we are on the last file part, we should get up to the last pageid,
        # whatever that is.
        if part < len(self._parts):
            end = 1 + sum(int(self._parts[i]) for i in range(0, part))
            command.append("--end=%s" % end)

    # one series holding one pipeline holding one command
    return [[command]]
def build_command(self, runner, output_dfname, history_dfname, current_dfname):
    """
    Build the command series that runs xmlstubs.py for this wiki.

    Each of output_dfname (--articles), history_dfname (--history) and
    current_dfname (--current) may be None, in which case the
    corresponding option is left out. The part number, if any, is taken
    from the first of these that is not None, and is used to compute
    --start and (for all but the last part) --end from self._parts.

    raises:
        BackupError if the configured php command does not exist
    """
    php = runner.wiki.config.php
    if not os.path.exists(php):
        raise BackupError("php command %s not found" % php)

    config_file_arg = runner.wiki.config.files[0]
    override_section = runner.wiki.config.override_section
    if override_section:
        config_file_arg = config_file_arg + ":" + override_section

    command = ["/usr/bin/python3", "xmlstubs.py", "--config", config_file_arg,
               "--wiki", runner.db_name]
    output_dir = self.get_output_dir(runner)

    outputs = (("--articles", output_dfname),
               ("--history", history_dfname),
               ("--current", current_dfname))
    for option, dfname in outputs:
        if dfname is not None:
            command.extend([option, DumpFilename.get_inprogress_name(
                os.path.join(output_dir, dfname.filename))])

    # the first file we were given determines the part number
    partnum = None
    for _option, dfname in outputs:
        if dfname is not None:
            partnum = dfname.partnum
            break

    if partnum is not None:
        part = int(partnum)
        # set up start end end pageids for this piece
        # note there is no page id 0 I guess. so we start with 1
        start = 1 + sum(int(self._parts[i]) for i in range(0, part - 1))
        command.append("--start=%s" % start)
        # if we are on the last file part, we should get up to the last pageid,
        # whatever that is.
        if part < len(self._parts):
            end = 1 + sum(int(self._parts[i]) for i in range(0, part))
            command.append("--end=%s" % end)

    # one series holding one pipeline holding one command
    return [[command]]
def write_specialfilesinfo_file(self):
    """
    get info about all files for the most current dump of a given
    wiki, possibly in progress, that don't contain dump job
    output; write this info to an output file

    For each special file we record its status ('present' or 'missing')
    and, when present, its size and relative web path. Failure to write
    the result is reported but does not raise.
    """
    if SpecialFileInfo.NAME not in self._enabled:
        return

    dump_dir = DumpDir(self.wiki, self.wiki.db_name)
    fileinfo = {}
    for filename in self.get_special_filenames():
        info = {}
        fileinfo[filename] = info
        path = os.path.join(self.wiki.public_dir(), self.wiki.date, filename)
        info['status'] = 'present'
        try:
            info['size'] = os.path.getsize(path)
        except Exception:
            # can't stat it, treat it as not there
            info['status'] = 'missing'
            continue
        dfname = DumpFilename(self.wiki)
        dfname.new_from_filename(os.path.basename(path))
        info['url'] = dump_dir.web_path_relative(dfname)

    contents = {'files': fileinfo, 'version': SpecialFileInfo.VERSION}

    try:
        self.write_contents(contents)
    except Exception:
        if self.verbose:
            sys.stderr.write(repr(traceback.format_exception(*sys.exc_info())))
        message = "Couldn't write special files info. Continuing anyways"
        if self.error_callback:
            self.error_callback(message)
        else:
            sys.stderr.write("%s\n" % message)
def command_completion_callback(self, series):
    """
    if the series of commands ran successfully to completion,
    mv produced output files from temporary to permanent
    names

    we write the data into temporary locations initially so that
    as each command series completes, the output files can
    be made available as done immediately, rather than waiting
    for all the parallel processes of a dump step to complete
    first.

    args: CommandSeries for which all commands have
          completed
    """
    if not series.exited_successfully():
        return

    for submitted in self.commands_submitted:
        if not submitted['series'] == series._command_series:
            continue
        if not submitted['output_files']:
            return

        suffix = DumpFilename.INPROG
        for inprogress_filename in submitted['output_files']:
            if not inprogress_filename.endswith(suffix):
                continue

            # strip the in-progress suffix to get the permanent name
            final_dfname = DumpFilename(submitted['runner'].wiki)
            final_dfname.new_from_filename(inprogress_filename[:-len(suffix)])

            source_path = os.path.join(submitted['output_dir'], inprogress_filename)
            dest_path = os.path.join(submitted['output_dir'], final_dfname.filename)
            try:
                os.rename(source_path, dest_path)
            except Exception:
                if self.verbose:
                    sys.stderr.write(repr(
                        traceback.format_exception(*sys.exc_info())))
                # nothing was moved, so there is nothing to check
                continue

            # sanity check of file contents, move if bad
            self.move_if_truncated(submitted['runner'], final_dfname)
def make_dfname_from_pagerange(self, pagerange, partnum):
    """
    given pagerange, make output file for appropriate type
    of page content dumps

    args: (startpage<str>, endpage<str>), string
    returns: (non-temp) DumpFilename with the checkpoint string for the range
    """
    first_page, last_page = pagerange[0], pagerange[1]
    checkpoint_string = DumpFilename.make_checkpoint_string(first_page, last_page)
    return DumpFilename(self.wiki, self.wiki.date, self.get_dumpname(),
                        self.get_filetype(), self.get_file_ext(),
                        partnum, checkpoint=checkpoint_string, temp=False)
def get_pagerange_stub_dfname(self, wanted, runner):
    """
    return the dumpfilename for stub file that would have
    the page range in 'wanted'

    The result copies date, dumpname, file type, extension and part number
    from the stub for wanted['partnum'], and takes its page range from
    wanted['outfile'].
    """
    stub_dfname = self.get_stub_dfname(wanted['partnum'], runner)
    page_range = DumpFilename.make_checkpoint_string(
        wanted['outfile'].first_page_id, wanted['outfile'].last_page_id)
    return DumpFilename(
        self.wiki, stub_dfname.date, stub_dfname.dumpname,
        stub_dfname.file_type, stub_dfname.file_ext, stub_dfname.partnum,
        page_range, temp=False)
def cleanup_inprog_files(self, dump_dir, runner):
    """
    Remove "in progress" output files left over for this job.

    If a specific checkpoint file was requested, its in-progress file is
    removed from the public directory, or else from the private one.
    Then every file reported by list_inprog_files_for_cleanup is removed.
    In dry run mode nothing is removed; what would be removed is printed.
    """
    if self.checkpoint_file is not None:
        # we only rerun this one, so just remove this one
        pub_path = DumpFilename.get_inprogress_name(
            dump_dir.filename_public_path(self.checkpoint_file))
        priv_path = DumpFilename.get_inprogress_name(
            dump_dir.filename_private_path(self.checkpoint_file))
        for candidate in (pub_path, priv_path):
            if not os.path.exists(candidate):
                continue
            if runner.dryrun:
                print("would remove", candidate)
            else:
                os.remove(candidate)
            # only the first location found is handled
            break

    dfnames = self.list_inprog_files_for_cleanup(dump_dir)
    if runner.dryrun:
        print("would remove ", [dfname.filename for dfname in dfnames])
        return
    for dfname in dfnames:
        self.remove_output_file(dump_dir, dfname)
def get_chkptfile_from_pageids(self):
    """
    Return the filename of the checkpoint file covering self.page_id_range.

    self.page_id_range is either "first,last" or just "first"; in the
    latter case the last page id is given as "00000".

    returns:
        the filename (string) of the corresponding checkpoint file
    """
    if ',' in self.page_id_range:
        first_page_id, last_page_id = self.page_id_range.split(',', 1)
    else:
        first_page_id = self.page_id_range
        last_page_id = "00000"  # indicates no last page id specified, go to end of stub
    checkpoint_string = DumpFilename.make_checkpoint_string(first_page_id, last_page_id)

    if self._partnum_todo:
        partnum = self._partnum_todo
    else:
        # fixme is that right? maybe NOT
        partnum = None

    # DumpFilename takes the wiki as its first argument, as everywhere else
    # it is constructed; leaving it out shifts every argument by one place
    fileobj = DumpFilename(self.wiki, self.wiki.date, self.get_dumpname(),
                           self.get_filetype(), self.get_file_ext(),
                           partnum, checkpoint_string)
    return fileobj.filename
def build_command(self, runner, output_dfnames):
    """
    Build one decompress-and-7zip shell pipeline per output file.

    arguments:
        runner: Runner object
        output_dfnames: if checkpointing of files is enabled, this should be
                        a list of checkpoint files (DumpFilename), otherwise
                        it should be a list of the one file that will be
                        produced by the dump

    Note that checkpoint files get done one at a time, not in parallel

    raises:
        BackupError if the decompressor to be used (lbzip2 or bzip2) or
        7zip is not found
    """
    config = self.wiki.config

    # FIXME need shell escape
    if config.lbzip2threads:
        if not exists(config.lbzip2):
            raise BackupError("lbzip2 command %s not found" % config.lbzip2)
    elif not exists(config.bzip2):
        raise BackupError("bzip2 command %s not found" % config.bzip2)
    if not exists(config.sevenzip):
        raise BackupError("7zip command %s not found" % config.sevenzip)

    command_series = []
    for out_dfname in output_dfnames:
        input_dfname = DumpFilename(self.wiki, None, out_dfname.dumpname,
                                    out_dfname.file_type,
                                    self.item_for_recompression.file_ext,
                                    out_dfname.partnum, out_dfname.checkpoint)
        if runner.wiki.is_private():
            path_for = runner.dump_dir.filename_private_path
        else:
            path_for = runner.dump_dir.filename_public_path
        outfilepath = path_for(out_dfname)
        infilepath = path_for(input_dfname)

        if config.lbzip2threads:
            # one thread only, as these already run in parallel
            decompr_command = "{lbzip2} -dc -n 1 {infile}".format(
                lbzip2=config.lbzip2, infile=infilepath)
        else:
            decompr_command = "{bzip2} -dc {infile}".format(
                bzip2=config.bzip2, infile=infilepath)

        shell_command = "{decompr} | {sevenzip} a -mx=4 -si {ofile}".format(
            decompr=decompr_command, sevenzip=config.sevenzip,
            ofile=DumpFilename.get_inprogress_name(outfilepath))
        command_series.append([[shell_command]])
    return command_series
def build_command(self, runner, output_dfname):
    """
    Build the command series that runs the Flow extension's dumpBackup.php
    for this wiki, writing bzip2 output to the "in progress" name of
    output_dfname. "--full" is added when self.history is set.

    raises:
        BackupError if the configured php command does not exist
    """
    php = runner.wiki.config.php
    if not os.path.exists(php):
        raise BackupError("php command %s not found" % php)

    if runner.wiki.is_private():
        flow_output_fpath = runner.dump_dir.filename_private_path(output_dfname)
    else:
        flow_output_fpath = runner.dump_dir.filename_public_path(output_dfname)

    script_command = MultiVersion.mw_script_as_array(
        runner.wiki.config, "extensions/Flow/maintenance/dumpBackup.php")

    command = [runner.wiki.config.php] + list(script_command)
    command += ["--wiki=%s" % runner.db_name,
                "--current", "--report=1000",
                "--output=bzip2:%s" % DumpFilename.get_inprogress_name(flow_output_fpath)]
    if self.history:
        command.append("--full")

    # one series holding one pipeline holding one command
    return [[command]]
def run(self, runner):
    """
    Produce the xml page content files for this job.

    Works out which output files still need to be produced:
    - a specific checkpoint file (given directly or via a page id range), or
    - with checkpoints enabled, the page ranges of the stubs not yet
      covered by existing checkpoint files, or
    - otherwise, the regular output files that do not exist yet
      (looked for in the private or public directory depending on the wiki).
    A (possibly partial) stub is set up for each, a command series is
    built per stub, and all of them are run.

    args:
        runner: Runner object

    raises:
        BackupError if running the commands reports an error
    """
    # here we will either clean up or not depending on how we were called FIXME
    self.cleanup_old_files(runner.dump_dir, runner)
    commands = []

    todo = []

    if self.page_id_range is not None:
        # convert to checkpoint filename, handle the same way
        self.checkpoint_file = self.get_chkptfile_from_pageids()

    if self.checkpoint_file:
        todo = [self.checkpoint_file]
    else:
        # list all the output files that would be produced w/o
        # checkpoint files on
        outfiles = self.get_reg_files_for_filepart_possible(
            runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
        if self._checkpoints_enabled:

            # get the stub list that would be used for the current run
            stubs = self.get_stub_files(runner)
            stubs = sorted(stubs, key=lambda thing: thing.filename)

            # get the page ranges covered by stubs
            stub_ranges = []
            for stub in stubs:
                fname = DumpFile(self.wiki,
                                 runner.dump_dir.filename_public_path(stub, stub.date),
                                 stub, self.verbose)
                stub_ranges.append((fname.find_first_page_id_in_file(),
                                    self.find_last_page_id(stub, runner), stub.partnum))

            # get list of existing checkpoint files
            chkpt_files = self.list_checkpt_files(
                runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
            chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)
            # get the page ranges covered by existing checkpoint files
            checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id,
                                  chkptfile.partnum)
                                 for chkptfile in chkpt_files]
            if self.verbose:
                # single pre-formatted argument: same output whether print
                # is a statement or a function
                print("checkpoint_ranges is %s" % (checkpoint_ranges,))
                print("stub_ranges is %s" % (stub_ranges,))

            if not checkpoint_ranges:
                # no page ranges covered by checkpoints. do all output files
                # the usual way
                todo = outfiles
            else:
                todo = []
                parts = self.get_fileparts_list()
                for partnum in parts:
                    if not [int(chkpt_range[2]) for chkpt_range in checkpoint_ranges
                            if int(chkpt_range[2]) == int(partnum)]:
                        # no page ranges covered by checkpoints for a particular
                        # file part (subjob) so do that output file the
                        # regular way
                        todo.extend([outfile for outfile in outfiles
                                     if int(outfile.partnum) == int(partnum)])

                missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                             for (first, last, partnum) in missing])

        else:
            # do the missing files only
            # look where the files are actually written: private wikis
            # have their output in the private directory
            if runner.wiki.is_private():
                path_for = runner.dump_dir.filename_private_path
            else:
                path_for = runner.dump_dir.filename_public_path
            todo = [outfile for outfile in outfiles
                    if not os.path.exists(path_for(outfile))]

    partial_stubs = []
    if self.verbose:
        print("todo is %s" % ([to.filename for to in todo],))
    for fileobj in todo:

        stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

        if fileobj.first_page_id is None:
            # whole file part wanted, the regular stub will do
            partial_stubs.append(stub_for_file)
        else:
            stub_output_file = DumpFilename(
                self.wiki, fileobj.date, fileobj.dumpname,
                self.item_for_stubs.get_filetype(),
                self.item_for_stubs.get_file_ext(),
                fileobj.partnum,
                DumpFilename.make_checkpoint_string(
                    fileobj.first_page_id, fileobj.last_page_id), temp=True)

            self.write_partial_stub(stub_for_file, stub_output_file, runner)
            if not self.has_no_entries(stub_output_file, runner):
                partial_stubs.append(stub_output_file)

    if self.verbose:
        print("partial_stubs is %s" % ([ps.filename for ps in partial_stubs],))
    if not partial_stubs:
        # nothing to produce
        return

    for stub_file in partial_stubs:
        series = self.build_command(runner, stub_file)
        commands.append(series)

    error = runner.run_command(commands, callback_stderr=self.progress_callback,
                               callback_stderr_arg=runner)
    if error:
        raise BackupError("error producing xml file(s) %s" % self.dumpname)
def __init__(self, wiki, prefetch=True, prefetchdate=None, spawn=True, job=None,
             skip_jobs=None, restart=False, notice="", dryrun=False, enabled=None,
             partnum_todo=None, checkpoint_file=None, page_id_range=None,
             skipdone=False, cleanup=False, do_prereqs=False, verbose=False):
    """
    Set up a dump run for the given wiki.

    Stores the run options, works out which features are enabled for this
    kind of run (full run, dry run, single part or checkpoint file,
    specific job), starts the logger if logging is enabled, and creates
    the job data, dump item list, failure handler and status/index html
    writers.

    args (those used beyond being stored or passed along):
        wiki: Wiki object for the wiki to dump
        job: name of the one job requested, if any; "latestlinks",
             "createdirs" and "noop" each turn off some features
        skip_jobs: list of jobs to skip; defaults to an empty list
        dryrun: if True, features that would write anything are turned off
        enabled: dict of feature name -> True; a new dict is made if None.
                 Note that the dict passed in is modified in place.
        partnum_todo: file part number to do, if only one is wanted
        checkpoint_file: filename of the one checkpoint file to redo, if any
        cleanup: if False, the "cleanup_old_files" feature is turned off

    raises:
        BackupError if partnum_todo does not match the part number in the
        name of checkpoint_file
    """
    self.wiki = wiki
    self.db_name = wiki.db_name
    self.prefetch = prefetch
    self.prefetchdate = prefetchdate
    self.spawn = spawn
    self.filepart_info = FilePartInfo(wiki, self.db_name, self.log_and_print)
    self.restart = restart
    self.html_notice_file = None
    self.log = None
    self.dryrun = dryrun
    self._partnum_todo = partnum_todo
    self.checkpoint_file = checkpoint_file
    self.page_id_range = page_id_range
    self.skipdone = skipdone
    self.verbose = verbose
    self.enabled = enabled
    self.cleanup_old_files = cleanup
    self.do_prereqs = do_prereqs

    if self.checkpoint_file is not None:
        fname = DumpFilename(self.wiki)
        fname.new_from_filename(checkpoint_file)
        # we should get file partnum if any
        if self._partnum_todo is None and fname.partnum_int:
            self._partnum_todo = fname.partnum_int
        elif (self._partnum_todo is not None and fname.partnum_int and
              self._partnum_todo != fname.partnum_int):
            # interpolate the filename into the message; passing it as a
            # second argument would leave the %s unfilled
            raise BackupError("specifed partnum to do does not match part number "
                              "of checkpoint file %s to redo" % self.checkpoint_file)
        # from here on we keep the parsed DumpFilename, not the string
        self.checkpoint_file = fname

    if self.enabled is None:
        self.enabled = {}
    for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                    RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                    Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps",
                    "cleanup_old_files", "check_trunc_files"]:
        self.enabled[setting] = True

    if not self.cleanup_old_files:
        if "cleanup_old_files" in self.enabled:
            del self.enabled["cleanup_old_files"]

    # partial runs and dry runs don't update any of the run-wide files
    if self.dryrun or self._partnum_todo is not None or self.checkpoint_file is not None:
        for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                        RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                        Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps"]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.dryrun:
        for setting in ["check_trunc_files"]:
            if setting in self.enabled:
                del self.enabled[setting]
        if "logging" in self.enabled:
            del self.enabled["logging"]

    self.job_requested = job

    if self.job_requested == "latestlinks":
        for setting in [StatusHtml.NAME, IndexHtml.NAME, RunInfoFile.NAME]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.job_requested == "createdirs":
        for setting in [SymLinks.NAME, Feeds.NAME, RunSettings.NAME]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.job_requested == "latestlinks" or self.job_requested == "createdirs":
        for setting in [Checksummer.NAME, NoticeFile.NAME, "makedir",
                        "clean_old_dumps", "check_trunc_files"]:
            if setting in self.enabled:
                del self.enabled[setting]

    if self.job_requested == "noop":
        for setting in ["clean_old_dumps", "check_trunc_files"]:
            if setting in self.enabled:
                del self.enabled[setting]

    self.skip_jobs = skip_jobs
    if skip_jobs is None:
        self.skip_jobs = []

    self.db_server_info = DbServerInfo(self.wiki, self.db_name, self.log_and_print)
    self.dump_dir = DumpDir(self.wiki, self.db_name)

    # these must come after the dumpdir setup so we know which directory we are in
    if "logging" in self.enabled and "makedir" in self.enabled:
        file_obj = DumpFilename(self.wiki)
        file_obj.new_from_filename(self.wiki.config.log_file)
        self.log_filename = self.dump_dir.filename_private_path(file_obj)
        self.make_dir(os.path.join(self.wiki.private_dir(), self.wiki.date))
        self.log = Logger(self.log_filename)
        # thread should die horribly when main script dies. no exceptions.
        self.log.daemon = True
        self.log.start()

    self.dumpjobdata = DumpRunJobData(self.wiki, self.dump_dir, notice,
                                      self.log_and_print, self.debug, self.enabled,
                                      self.verbose)

    # some or all of these dump_items will be marked to run
    self.dump_item_list = DumpItemList(self.wiki, self.prefetch, self.prefetchdate,
                                       self.spawn,
                                       self._partnum_todo, self.checkpoint_file,
                                       self.job_requested, self.skip_jobs,
                                       self.filepart_info, self.page_id_range,
                                       self.dumpjobdata, self.dump_dir, self.verbose)
    # only send email failure notices for full runs
    if self.job_requested:
        email = False
    else:
        email = True
    self.failurehandler = FailureHandler(self.wiki, email)
    self.statushtml = StatusHtml(self.wiki, self.dump_dir,
                                 self.dump_item_list.dump_items,
                                 self.dumpjobdata, self.enabled,
                                 self.failurehandler,
                                 self.log_and_print, self.verbose)
    self.indexhtml = IndexHtml(self.wiki, self.dump_dir,
                               self.dump_item_list.dump_items,
                               self.dumpjobdata, self.enabled,
                               self.failurehandler,
                               self.log_and_print, self.verbose)
def run(self, runner):
    """
    Produce the xml page content files for this job.

    Works out which output files are to be produced (a requested page
    range, a requested checkpoint file, the missing page ranges when
    checkpoints are enabled, or else the regular output files that don't
    exist yet), writes page range stubs where needed, builds one command
    series per output file and runs them in batches. Failed commands are
    retried, after a pause of self.wiki.config.retry_wait, until
    self.wiki.config.max_retries is reached.

    args:
        runner: Runner object

    raises:
        BackupError if errors remain when the retries are used up
    """
    # here we will either clean up or not depending on how we were called
    # FIXME callers should set this appropriately and they don't right now
    self.cleanup_old_files(runner.dump_dir, runner)

    # clean up all tmp output files from previous attempts of this job
    # for this dump wiki and date, otherwise we'll wind up indexing
    # them and hashsumming them etc.
    # they may have been left around from an interrupted or failed earlier
    # run

    # in cases where we have request of specific file, do it as asked,
    # no splitting it up into smaller pieces
    do_bitesize = False

    self.cleanup_tmp_files(runner.dump_dir, runner)

    commands = []

    dfnames_todo = []
    if self.jobinfo['pageid_range'] is not None:
        # convert to checkpoint filename, handle the same way
        dfnames_todo = [self.get_pagerange_output_dfname()]
    elif self.checkpoint_file:
        dfnames_todo = [self.checkpoint_file]
    elif self._checkpoints_enabled:
        do_bitesize = True
        stub_pageranges = self.get_ranges_covered_by_stubs(runner)
        stub_pageranges = sorted(stub_pageranges, key=lambda x: int(x[0]))
        dfnames_todo = self.get_dfnames_for_missing_pranges(runner, stub_pageranges)
        # replace stub ranges for output files that cover smaller
        # ranges, with just those numbers
        new_stub_ranges = []
        for dfname in dfnames_todo:
            if dfname.is_checkpoint_file:
                new_stub_ranges.append((dfname.first_page_id,
                                        dfname.last_page_id, dfname.partnum))
            else:
                for srange in stub_pageranges:
                    if srange[2] == dfname.partnum:
                        new_stub_ranges.append(srange)
        stub_pageranges = new_stub_ranges
    else:
        output_dfnames = self.get_reg_files_for_filepart_possible(
            runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
        # at least some page ranges are covered, just do those that
        # don't exist yet
        if runner.wiki.is_private():
            dfnames_todo = [
                dfname for dfname in output_dfnames if not os.path.exists(
                    runner.dump_dir.filename_private_path(dfname))]
        else:
            dfnames_todo = [
                dfname for dfname in output_dfnames if not os.path.exists(
                    runner.dump_dir.filename_public_path(dfname))]

    if self._checkpoints_enabled and do_bitesize:
        dfnames_todo = self.make_bitesize_jobs(dfnames_todo, stub_pageranges)

    # without prefetch there is no finder; make sure the name is bound
    # either way since it is passed along below
    # NOTE(review): assumes setup_wanted accepts None for the prefetcher
    # when prefetch is off - confirm
    prefetcher = None
    if self.jobinfo['prefetch']:
        if runner.wiki.config.sevenzip_prefetch:
            file_exts = ['7z', self.file_ext]
        else:
            file_exts = [self.file_ext]
        prefetcher = PrefetchFinder(
            self.wiki,
            {'name': self.name(), 'desc': self.jobinfo['desc'],
             'dumpname': self.get_dumpname(),
             'ftype': self.file_type, 'fexts': file_exts,
             'subset': self.jobinfo['subset']},
            {'date': self.jobinfo['prefetchdate'], 'parts': self._parts},
            self.verbose)

    wanted = [self.setup_wanted(dfname, runner, prefetcher) for dfname in dfnames_todo]

    to_generate = []
    for entry in wanted:
        if entry['generate']:
            to_generate.append((entry['stub_input'], entry['stub']))
    if self._parts:
        batchsize = int(len(self._parts) / 2)
    else:
        batchsize = 1
    self.stubber.write_pagerange_stubs(to_generate, runner, batchsize,
                                       self.move_if_truncated)

    for entry in wanted:
        if entry['generate']:
            if self.stubber.has_no_pages(entry['stub'], runner, tempdir=True):
                # this page range has no pages in it (all deleted?) so we need not
                # keep info on how to generate it
                continue
        output_dfname = DumpFilename(self.wiki, entry['stub'].date, self.get_dumpname(),
                                     self.get_filetype(), self.file_ext,
                                     entry['stub'].partnum,
                                     DumpFilename.make_checkpoint_string(
                                         entry['stub'].first_page_id,
                                         entry['stub'].last_page_id),
                                     False)
        entry['command'] = self.build_command(runner, entry['stub'],
                                              entry['prefetch'], output_dfname)
        self.setup_command_info(runner, entry['command'], [output_dfname])
        commands.append(entry['command'])

    # don't do them all at once, do only up to _parts commands at the same time
    if self._parts:
        batchsize = len(self._parts)
    else:
        batchsize = 1
    errors = False
    failed_commands = []
    max_retries = self.wiki.config.max_retries
    retries = 0
    # the first pass always runs, even if max_retries is 0
    while commands and (retries < max_retries or retries == 0):
        command_batch = commands[:batchsize]
        error, broken = runner.run_command(
            command_batch, callback_stderr=self.progress_callback,
            callback_stderr_arg=runner,
            callback_on_completion=self.command_completion_callback)
        if error:
            for series in broken:
                for pipeline in series:
                    runner.log_and_print("error from commands: %s" % " ".join(
                        [entry for entry in pipeline]))
            failed_commands.append(broken)
            errors = True
        commands = commands[batchsize:]
        if not commands and failed_commands:
            retries += 1
            if retries < max_retries:
                # retry failed commands
                commands = failed_commands
                failed_commands = []
                # no instant retries, give the servers a break
                time.sleep(self.wiki.config.retry_wait)
                errors = False
    if errors:
        raise BackupError("error producing xml file(s) %s" % self.get_dumpname())
def build_command(self, runner, stub_file):
    """Assemble the dumpTextPass.php invocation for one stub file.

    Args:
        runner: the dump runner; its dump_dir, db_name, wiki.config and
            show_runner_state() are used here.
        stub_file: description of the stub to read; its date, partnum,
            filename, first_page_id and last_page_id are consulted.

    Returns:
        A command series: a list holding one pipeline, which holds one
        command (a list of arguments). The values returned by
        build_filters() and build_eta() are the last two entries of
        that command.

    Raises:
        BackupError: if the configured php executable does not exist.
    """
    wiki_config = self.wiki.config

    # when checkpoints are enabled the output file is created with the
    # temp flag set; it will be checkpointed every so often
    use_temp_name = bool(self._checkpoints_enabled)
    output_file = DumpFilename(
        self.wiki, stub_file.date, self.dumpname, self.get_filetype(),
        self.file_ext, stub_file.partnum,
        DumpFilename.make_checkpoint_string(stub_file.first_page_id,
                                            stub_file.last_page_id),
        use_temp_name)

    # a (partial) stub file present in the temp dir takes precedence over
    # the regular stub file in the public dump directory
    temp_stub_path = os.path.join(wiki_config.temp_dir, stub_file.filename)
    if not os.path.exists(temp_stub_path):
        stub_location = runner.dump_dir.filename_public_path(stub_file)
    else:
        stub_location = temp_stub_path
    stub_option = "--stub=gzip:%s" % stub_location

    # Try to pull text from the previous run; most stuff hasn't changed
    candidates = None
    if self._prefetch:
        candidates = self._find_previous_dump(runner, output_file.partnum)

    # keep only the candidates that cover this stub's page range and that
    # actually exist on disk
    sources = []
    for candidate in candidates or []:
        if not self.chkptfile_in_pagerange(stub_file, candidate):
            continue
        candidate_path = runner.dump_dir.filename_public_path(candidate, candidate.date)
        if exists(candidate_path):
            sources.append(candidate_path)

    partnum_str = ("%s" % stub_file.partnum) if output_file.partnum else ""

    if not sources:
        runner.show_runner_state(
            "... building %s %s XML dump, no text prefetch..." %
            (self._subset, partnum_str))
        prefetch = ""
    else:
        source = "bzip2:%s" % ";".join(sources)
        runner.show_runner_state(
            "... building %s %s XML dump, with text prefetch from %s..." %
            (self._subset, partnum_str, source))
        prefetch = "--prefetch=%s" % source

    spawn = ("--spawn=%s" % wiki_config.php) if self._spawn else ""

    if not exists(wiki_config.php):
        raise BackupError("php command %s not found" % wiki_config.php)

    checkpoint_time = ""
    checkpoint_file = ""
    if self._checkpoints_enabled:
        checkpoint_time = "--maxtime=%s" % wiki_config.checkpoint_time
        # the "p%sp%s" placeholder is passed through unformatted as the
        # checkpoint part of the file name
        checkpoint_file = "--checkpointfile=%s" % output_file.new_filename(
            output_file.dumpname, output_file.file_type, output_file.file_ext,
            output_file.date, output_file.partnum, "p%sp%s", None)

    script_args = MultiVersion.mw_script_as_array(runner.wiki.config, "dumpTextPass.php")
    command = [wiki_config.php]
    command += script_args
    command += [
        "--wiki=%s" % runner.db_name,
        stub_option,
        prefetch,
        checkpoint_time,
        checkpoint_file,
        "--report=1000",
        spawn,
    ]

    # only None entries are dropped; empty option strings stay in the list
    command = [arg for arg in command if arg is not None]
    command.extend([self.build_filters(runner, output_file), self.build_eta(runner)])

    # one command -> one pipeline -> one series
    return [[command]]