def get_lineno_last_page(self, fileobj, runner):
    """Return the line number (as a string) of the last '<page>' tag in the
    output file, or None if the file is missing or the pipeline failed."""
    if not fileobj.filename or not exists(runner.dump_dir.filename_public_path(fileobj)):
        return None
    dumpfile = DumpFile(self.wiki,
                        runner.dump_dir.filename_public_path(fileobj, self.wiki.date),
                        fileobj, self.verbose)
    pipeline = dumpfile.setup_uncompression_command()
    grep = self.wiki.config.grep
    if not exists(grep):
        raise BackupError("grep command %s not found" % grep)
    pipeline.append([grep, "-n", "<page>"])
    tail = self.wiki.config.tail
    if not exists(tail):
        raise BackupError("tail command %s not found" % tail)
    pipeline.append([tail, "-1"])
    # without shell
    proc = CommandPipeline(pipeline, quiet=True)
    proc.run_pipeline_get_output()
    # accept success, or the decompressor having been killed by SIGPIPE
    if (proc.exited_successfully() or
            (proc.get_failed_cmds_with_retcode() ==
             [[-signal.SIGPIPE, pipeline[0]]]) or
            (proc.get_failed_cmds_with_retcode() ==
             [[signal.SIGPIPE + 128, pipeline[0]]])):
        output = proc.output()
        # 339915646: <page>
        if ':' in output:
            linecount = output.split(':')[0]
            if linecount.isdigit():
                return linecount
    return None
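# --- Standalone sketch (not part of the dump code): the same idea as
# get_lineno_last_page() above, expressed with only the standard library.
# It pipes "zcat <path> | grep -n '<page>' | tail -1" without a shell,
# tolerates the decompressor being cut off with SIGPIPE, and parses grep's
# "NNN:<match>" output. Using zcat as the decompressor and the exit-code
# handling shown here are assumptions for illustration only; the real code
# builds the pipeline via DumpFile.setup_uncompression_command() and
# CommandPipeline.
import signal
import subprocess


def lineno_of_last_page_tag(path):
    zcat = subprocess.Popen(["zcat", path], stdout=subprocess.PIPE)
    grep = subprocess.Popen(["grep", "-n", "<page>"],
                            stdin=zcat.stdout, stdout=subprocess.PIPE)
    tail = subprocess.Popen(["tail", "-1"],
                            stdin=grep.stdout, stdout=subprocess.PIPE)
    # close our copies of the intermediate pipes so upstream processes
    # can receive SIGPIPE if a downstream one exits early
    zcat.stdout.close()
    grep.stdout.close()
    output = tail.communicate()[0]
    grep.wait()
    zcat.wait()
    # accept success, or the decompressor killed by SIGPIPE (reported either
    # as a negative signal number or as 128 + signal, depending on the wrapper)
    if zcat.returncode not in (0, -signal.SIGPIPE, 128 + signal.SIGPIPE):
        return None
    # output looks like "339915646:  <page>"; the line number precedes the colon
    linecount = output.split(':')[0]
    return linecount if linecount.isdigit() else None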
def has_no_entries(self, xmlfile, runner):
    '''
    see if the file has a page id in it or not. no? then return True
    '''
    if xmlfile.is_temp_file:
        path = os.path.join(self.wiki.config.temp_dir, xmlfile.filename)
    else:
        path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
    fname = DumpFile(self.wiki, path, xmlfile, self.verbose)
    return fname.find_first_page_id_in_file() is None
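# --- Standalone sketch (not the DumpFile implementation): what
# find_first_page_id_in_file() amounts to for an uncompressed MediaWiki XML
# stub or dump -- the first <id> element appearing after a <page> element is
# the page id. Reading plain text is an assumption for illustration; the real
# code also handles compressed files.
import re


def find_first_page_id(path):
    in_page = False
    with open(path) as xml_in:
        for line in xml_in:
            if '<page>' in line:
                in_page = True
            elif in_page:
                match = re.search(r'<id>(\d+)</id>', line)
                if match:
                    return int(match.group(1))
    return None  # no pages at all, i.e. has_no_entries() would be True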
def checksums(self, file_obj, dumpjobdata):
    """Run checksum for an output file, and append to the list."""
    if Checksummer.NAME in self._enabled:
        for htype in Checksummer.HASHTYPES:
            checksum_filename = self._get_checksum_filename_tmp(htype)
            output = open(checksum_filename, "a")
            dumpjobdata.debugfn("Checksumming %s via %s" % (file_obj.filename, htype))
            dumpfile = DumpFile(self.wiki,
                                dumpjobdata.dump_dir.filename_public_path(file_obj),
                                None, self.verbose)
            checksum = dumpfile.checksum(htype)
            if checksum is not None:
                output.write("%s %s\n" % (checksum, file_obj.filename))
            output.close()
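# --- Standalone sketch (not the DumpFile implementation): computing a file
# checksum in a streaming fashion and appending "<hash> <filename>" to a
# per-hashtype list, which is what the loop above does through
# DumpFile.checksum(). The hash type names ("md5", "sha1") are assumptions
# for illustration.
import hashlib
import os


def append_checksum(list_path, file_path, htype="md5"):
    hasher = hashlib.new(htype)
    with open(file_path, "rb") as infile:
        # hash in 1 MB blocks so large dump files never have to fit in memory
        for block in iter(lambda: infile.read(1024 * 1024), b""):
            hasher.update(block)
    with open(list_path, "a") as listfile:
        listfile.write("%s %s\n" % (hasher.hexdigest(), os.path.basename(file_path)))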
def get_last_lines_from_n(self, fileobj, runner, count):
    """Return the output file's content from line 'count' onward
    (via 'tail -n +count'), or None on failure."""
    if not fileobj.filename or not exists(runner.dump_dir.filename_public_path(fileobj)):
        return None
    dumpfile = DumpFile(self.wiki,
                        runner.dump_dir.filename_public_path(fileobj, self.wiki.date),
                        fileobj, self.verbose)
    pipeline = dumpfile.setup_uncompression_command()
    tail = self.wiki.config.tail
    if not exists(tail):
        raise BackupError("tail command %s not found" % tail)
    pipeline.append([tail, "-n", "+%s" % count])
    # without shell, so no escaping of the command is needed
    proc = CommandPipeline(pipeline, quiet=True)
    proc.run_pipeline_get_output()
    if (proc.exited_successfully() or
            (proc.get_failed_cmds_with_retcode() ==
             [[-signal.SIGPIPE, pipeline[0]]]) or
            (proc.get_failed_cmds_with_retcode() ==
             [[signal.SIGPIPE + 128, pipeline[0]]])):
        last_lines = proc.output()
        return last_lines
    return None
def check_for_truncated_files(self, runner):
    """Returns the number of files that have been detected to be truncated.
    This function expects that all files to check for truncation
    live in the public dir"""
    ret = 0
    if "check_trunc_files" not in runner.enabled or not self._check_truncation:
        return ret
    for dump_fname in self.list_outfiles_to_check_for_truncation(runner.dump_dir):
        dfile = DumpFile(runner.wiki,
                         runner.dump_dir.filename_public_path(dump_fname),
                         dump_fname)
        file_truncated = True
        if exists(dfile.filename):
            if dfile.check_if_empty():
                # file exists and is empty, move it out of the way
                dfile.rename(dfile.filename + ".empty")
            elif dfile.check_if_truncated():
                # The file exists and is truncated; we move it out of the way.
                dfile.rename(dfile.filename + ".truncated")
                # We detected a failure and could abort right now. However,
                # there might still be further file parts that are good.
                # Hence we go on treating the remaining files, so that in the
                # end /all/ truncated files have been moved out of the way and
                # we can see which parts (instead of the whole job) need a rerun.
            else:
                # The file exists and is not truncated. Heck, it's a good file!
                file_truncated = False
        else:
            # file doesn't exist, move on
            file_truncated = False
        if file_truncated:
            ret += 1
    return ret
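# --- Standalone sketch (not the DumpFile implementation): the classification
# used above -- empty files get an ".empty" suffix, files whose compressed
# stream can't be read to the end get ".truncated", anything else is left in
# place. Treating "truncated" as "gzip stream doesn't decompress cleanly" is
# an assumption for illustration; the real check depends on the file type.
import gzip
import os


def quarantine_if_bad(path):
    """Return True if the file was empty or truncated and has been renamed."""
    if not os.path.exists(path):
        return False
    if os.path.getsize(path) == 0:
        os.rename(path, path + ".empty")
        return True
    try:
        with gzip.open(path, "rb") as infile:
            while infile.read(1024 * 1024):
                pass
    except Exception:
        # any read/decompression error here means a damaged or truncated file
        os.rename(path, path + ".truncated")
        return True
    return False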
def get_relevant_prefetch_files(self, file_list, start_page_id, end_page_id, date, runner):
    possibles = []
    if len(file_list):
        # (a) nasty hack, see below (b)
        maxparts = 0
        for file_obj in file_list:
            if file_obj.is_file_part and file_obj.partnum_int > maxparts:
                maxparts = file_obj.partnum_int
            if not file_obj.first_page_id:
                fname = DumpFile(
                    self.wiki, runner.dump_dir.filename_public_path(file_obj, date),
                    file_obj, self.verbose)
                file_obj.first_page_id = fname.find_first_page_id_in_file()

        # get the files that cover our range
        for file_obj in file_list:
            # If some of the file_objs in file_list could not be parsed properly,
            # some of the (int) conversions below will fail. However, it is of
            # little use to us to know which conversion failed. If /any/ conversion
            # fails, it means that we do not understand how to make sense of the
            # current file_obj. Hence we cannot use it as a prefetch object and we
            # have to drop it, to avoid passing a useless file to the text pass.
            # (Reading a useless file could take days, as per a comment below; on
            # the other hand, by not passing a possibly useful file, we may have to
            # fetch more texts from the database.)
            #
            # Therefore try...except-ing the whole block is sufficient: whatever
            # error occurs, we do not abort, but skip the file for prefetch.
            try:
                # If we could properly parse
                first_page_id_in_file = int(file_obj.first_page_id)

                # fixme what do we do here? this could be very expensive. is that worth it??
                if not file_obj.last_page_id:
                    # (b) nasty hack, see (a)
                    # it's not a checkpoint file or we'd have the pageid in the filename
                    # so... temporary hack which will give expensive results
                    # if file part, and it's the last one, put none
                    # if it's not the last part, get the first pageid in the next
                    # part and subtract 1
                    # if not file part, put none.
                    if file_obj.is_file_part and file_obj.partnum_int < maxparts:
                        for fname in file_list:
                            if fname.partnum_int == file_obj.partnum_int + 1:
                                # not true! this could be a few past where it really is
                                # (because of deleted pages that aren't included at all)
                                file_obj.last_page_id = str(int(fname.first_page_id) - 1)
                if file_obj.last_page_id:
                    last_page_id_in_file = int(file_obj.last_page_id)
                else:
                    last_page_id_in_file = None

                # FIXME there is no point in including files that have just a
                # few rev ids in them that we need, and having to read through
                # the whole file... could take hours or days (later it won't matter,
                # right? but until a rewrite, this is important)
                # also be sure that if a critical page is deleted by the time we
                # try to figure out ranges, that we don't get hosed
                if ((first_page_id_in_file <= int(start_page_id) and
                     (last_page_id_in_file is None or
                      last_page_id_in_file >= int(start_page_id))) or
                    (first_page_id_in_file >= int(start_page_id) and
                     (end_page_id is None or
                      first_page_id_in_file <= int(end_page_id)))):
                    possibles.append(file_obj)
            except Exception:
                runner.debug(
                    "Couldn't process %s for prefetch. Format update? Corrupt file?"
                    % file_obj.filename)
    return possibles
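# --- Standalone sketch (not part of the dump code): the range test used in
# the big conditional above, pulled out as a pure function over integer page
# ids. A prefetch file is relevant if it starts at or before the requested
# start page and might still contain it, or if it starts inside the requested
# [start_page_id, end_page_id] range.
def covers_requested_range(first_in_file, last_in_file, start_page_id, end_page_id):
    if first_in_file <= start_page_id and (last_in_file is None or
                                           last_in_file >= start_page_id):
        return True
    if first_in_file >= start_page_id and (end_page_id is None or
                                           first_in_file <= end_page_id):
        return True
    return False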
def run(self, runner):
    # here we will either clean up or not depending on how we were called FIXME
    self.cleanup_old_files(runner.dump_dir, runner)
    commands = []
    todo = []

    if self.page_id_range is not None:
        # convert to checkpoint filename, handle the same way
        self.checkpoint_file = self.get_chkptfile_from_pageids()

    if self.checkpoint_file:
        todo = [self.checkpoint_file]
    else:
        # list all the output files that would be produced w/o
        # checkpoint files on
        outfiles = self.get_reg_files_for_filepart_possible(
            runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames())
        if self._checkpoints_enabled:
            # get the stub list that would be used for the current run
            stubs = self.get_stub_files(runner)
            stubs = sorted(stubs, key=lambda thing: thing.filename)

            # get the page ranges covered by stubs
            stub_ranges = []
            for stub in stubs:
                fname = DumpFile(self.wiki,
                                 runner.dump_dir.filename_public_path(stub, stub.date),
                                 stub, self.verbose)
                stub_ranges.append((fname.find_first_page_id_in_file(),
                                    self.find_last_page_id(stub, runner),
                                    stub.partnum))

            # get list of existing checkpoint files
            chkpt_files = self.list_checkpt_files(
                runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None)
            chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename)

            # get the page ranges covered by existing checkpoint files
            checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id,
                                  chkptfile.partnum)
                                 for chkptfile in chkpt_files]
            if self.verbose:
                print "checkpoint_ranges is", checkpoint_ranges
                print "stub_ranges is", stub_ranges

            if not checkpoint_ranges:
                # no page ranges covered by checkpoints. do all output files
                # the usual way
                todo = outfiles
            else:
                todo = []
                parts = self.get_fileparts_list()
                for partnum in parts:
                    if not [int(chkpt_range[2]) for chkpt_range in checkpoint_ranges
                            if int(chkpt_range[2]) == int(partnum)]:
                        # no page ranges covered by checkpoints for a particular
                        # file part (subjob) so do that output file the
                        # regular way
                        todo.extend([outfile for outfile in outfiles
                                     if int(outfile.partnum) == int(partnum)])
                missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges)
                todo.extend([self.chkpt_file_from_page_range((first, last), partnum)
                             for (first, last, partnum) in missing])
        else:
            # do the missing files only
            # FIXME public or private depending on the wiki!
            todo = [outfile for outfile in outfiles
                    if not os.path.exists(runner.dump_dir.filename_public_path(outfile))]

    partial_stubs = []
    if self.verbose:
        print "todo is", [to.filename for to in todo]
    for fileobj in todo:
        stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0]

        if fileobj.first_page_id is None:
            partial_stubs.append(stub_for_file)
        else:
            stub_output_file = DumpFilename(
                self.wiki, fileobj.date, fileobj.dumpname,
                self.item_for_stubs.get_filetype(),
                self.item_for_stubs.get_file_ext(),
                fileobj.partnum,
                DumpFilename.make_checkpoint_string(
                    fileobj.first_page_id, fileobj.last_page_id),
                temp=True)

            self.write_partial_stub(stub_for_file, stub_output_file, runner)
            if not self.has_no_entries(stub_output_file, runner):
                partial_stubs.append(stub_output_file)

    if self.verbose:
        print "partial_stubs is", [ps.filename for ps in partial_stubs]
    if partial_stubs:
        stub_files = partial_stubs
    else:
        return

    for stub_file in stub_files:
        series = self.build_command(runner, stub_file)
        commands.append(series)

    error = runner.run_command(commands, callback_stderr=self.progress_callback,
                               callback_stderr_arg=runner)
    if error:
        raise BackupError("error producing xml file(s) %s" % self.dumpname)
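# --- Standalone sketch (not the real find_missing_ranges): one way to compute,
# per file part, the page ranges a stub covers that no existing checkpoint file
# covers yet. Ranges are (first_page_id, last_page_id, partnum) tuples with
# inclusive integer bounds; assuming well-formed ranges, this simplified
# version is for illustration only.
def missing_ranges(stub_ranges, checkpoint_ranges):
    missing = []
    for (stub_first, stub_last, partnum) in stub_ranges:
        # checkpoints for this part, sorted by starting page id
        covered = sorted(
            (int(first), int(last)) for (first, last, cp_part) in checkpoint_ranges
            if int(cp_part) == int(partnum))
        next_needed = int(stub_first)
        for (cp_first, cp_last) in covered:
            if cp_first > next_needed:
                # gap before this checkpoint: pages next_needed .. cp_first - 1
                missing.append((next_needed, cp_first - 1, partnum))
            next_needed = max(next_needed, cp_last + 1)
        if next_needed <= int(stub_last):
            # tail of the stub range not covered by any checkpoint
            missing.append((next_needed, int(stub_last), partnum))
    return missing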