def has_no_entries(self, xmlfile, runner):
    '''
    check whether the given xml file has any page id in it

    args:
        xmlfile -- file object for the file to check; if its
                   is_temp_file attribute is set, the file is looked up
                   in the configured temp dir, otherwise at its public
                   path for the date of this wiki run
        runner  -- runner object, used for its dump_dir

    returns True if no page id is found in the file, False otherwise
    '''
    if not xmlfile.is_temp_file:
        path = runner.dump_dir.filename_public_path(xmlfile, self.wiki.date)
    else:
        path = os.path.join(self.wiki.config.temp_dir, xmlfile.filename)

    dumpfile = DumpFile(self.wiki, path, xmlfile, self.verbose)
    first_page_id = dumpfile.find_first_page_id_in_file()
    return first_page_id is None
def get_relevant_prefetch_files(self, file_list, start_page_id, end_page_id, date, runner):
    '''
    pick out the files from file_list whose page range overlaps the
    wanted range of pages

    args:
        file_list     -- file objects to choose from
        start_page_id -- first page id of the wanted range (anything
                         that int() accepts)
        end_page_id   -- last page id of the wanted range, or None for
                         no upper limit
        date          -- date used to build the public path of a file
                         when it must be read to find its first page id
        runner        -- runner object; its dump_dir is used for paths
                         and its debug method for reporting skipped files

    side effects:
        file objects with no first_page_id get it filled in by reading
        the file; file parts (other than the last part) with no
        last_page_id get a guessed one filled in, see hack (b) below

    returns a list of the matching file objects, in the order they
    appear in file_list; files that cannot be processed are reported
    via runner.debug and left out
    '''
    possibles = []
    if not file_list:
        return possibles

    # (a) nasty hack, see below (b)
    maxparts = 0
    for file_obj in file_list:
        if file_obj.is_file_part and file_obj.partnum_int > maxparts:
            maxparts = file_obj.partnum_int
        if not file_obj.first_page_id:
            fname = DumpFile(
                self.wiki, runner.dump_dir.filename_public_path(file_obj, date),
                file_obj, self.verbose)
            file_obj.first_page_id = fname.find_first_page_id_in_file()

    # get the files that cover our range
    for file_obj in file_list:
        # If some of the file_objs in file_list could not be properly parsed,
        # some of the (int) conversions below will fail. However, it is of
        # little use to us which conversion failed. /If any/ conversion fails,
        # it means that we do not understand how to make sense of the current
        # file_obj. Hence we cannot use it as prefetch object and we have to
        # drop it, to avoid passing a useless file to the text pass. (Reading
        # it could take days, as per a comment below; but by not passing a
        # likely useless file, we have to fetch more texts from the database.)
        #
        # Therefore try...except-ing the whole block is sufficient: whatever
        # error occurs, we do not abort, but skip the file for prefetch.
        try:
            first_page_id_in_file = int(file_obj.first_page_id)

            # fixme what do we do here? this could be very expensive. is that worth it??
            if not file_obj.last_page_id:
                # (b) nasty hack, see (a)
                # it's not a checkpoint file or we'd have the pageid in the filename
                # so... temporary hack which will give expensive results
                # if file part, and it's the last one, put none
                # if it's not the last part, get the first pageid in the next
                #  part and subtract 1
                # if not file part, put none.
                if file_obj.is_file_part and file_obj.partnum_int < maxparts:
                    for next_obj in file_list:
                        if next_obj.partnum_int == file_obj.partnum_int + 1:
                            # not true! this could be a few past where it really is
                            # (because of deleted pages that aren't included at all)
                            file_obj.last_page_id = str(int(next_obj.first_page_id) - 1)

            if file_obj.last_page_id:
                last_page_id_in_file = int(file_obj.last_page_id)
            else:
                last_page_id_in_file = None

            # FIXME there is no point in including files that have just a
            # few rev ids in them that we need, and having to read through
            # the whole file... could take hours or days (later it won't matter,
            # right? but until a rewrite, this is important)
            # also be sure that if a critical page is deleted by the time we
            # try to figure out ranges, that we don't get hosed
            wanted_start = int(start_page_id)
            if ((first_page_id_in_file <= wanted_start and
                 (last_page_id_in_file is None or
                  last_page_id_in_file >= wanted_start)) or
                    (first_page_id_in_file >= wanted_start and
                     (end_page_id is None or
                      first_page_id_in_file <= int(end_page_id)))):
                possibles.append(file_obj)
        except Exception as ex:
            # deliberately broad, see the comment at the top of the loop;
            # the error itself is included so that the reason for skipping
            # the file is not lost
            runner.debug(
                "Couldn't process %s for prefetch. Format update? Corrupt file? (%r)"
                % (file_obj.filename, ex))
    return possibles
def run(self, runner): # here we will either clean up or not depending on how we were called FIXME self.cleanup_old_files(runner.dump_dir, runner) commands = [] todo = [] if self.page_id_range is not None: # convert to checkpoint filename, handle the same way self.checkpoint_file = self.get_chkptfile_from_pageids() if self.checkpoint_file: todo = [self.checkpoint_file] else: # list all the output files that would be produced w/o # checkpoint files on outfiles = self.get_reg_files_for_filepart_possible( runner.dump_dir, self.get_fileparts_list(), self.list_dumpnames()) if self._checkpoints_enabled: # get the stub list that would be used for the current run stubs = self.get_stub_files(runner) stubs = sorted(stubs, key=lambda thing: thing.filename) # get the page ranges covered by stubs stub_ranges = [] for stub in stubs: fname = DumpFile(self.wiki, runner.dump_dir.filename_public_path(stub, stub.date), stub, self.verbose) stub_ranges.append((fname.find_first_page_id_in_file(), self.find_last_page_id(stub, runner), stub.partnum)) # get list of existing checkpoint files chkpt_files = self.list_checkpt_files( runner.dump_dir, [self.dumpname], runner.wiki.date, parts=None) chkpt_files = sorted(chkpt_files, key=lambda thing: thing.filename) # get the page ranges covered by existing checkpoint files checkpoint_ranges = [(chkptfile.first_page_id, chkptfile.last_page_id, chkptfile.partnum) for chkptfile in chkpt_files] if self.verbose: print "checkpoint_ranges is", checkpoint_ranges print "stub_ranges is", stub_ranges if not checkpoint_ranges: # no page ranges covered by checkpoints. 
do all output files # the usual way todo = outfiles else: todo = [] parts = self.get_fileparts_list() for partnum in parts: if not [int(chkpt_range[2]) for chkpt_range in checkpoint_ranges if int(chkpt_range[2]) == int(partnum)]: # no page ranges covered by checkpoints for a particular # file part (subjob) so do that output file the # regular way todo.extend([outfile for outfile in outfiles if int(outfile.partnum) == int(partnum)]) missing = self.find_missing_ranges(stub_ranges, checkpoint_ranges) todo.extend([self.chkpt_file_from_page_range((first, last), partnum) for (first, last, partnum) in missing]) else: # do the missing files only # FIXME public or private depending on the wiki! todo = [outfile for outfile in outfiles if not os.path.exists(runner.dump_dir.filename_public_path(outfile))] partial_stubs = [] if self.verbose: print "todo is", [to.filename for to in todo] for fileobj in todo: stub_for_file = self.get_stub_files(runner, fileobj.partnum_int)[0] if fileobj.first_page_id is None: partial_stubs.append(stub_for_file) else: stub_output_file = DumpFilename( self.wiki, fileobj.date, fileobj.dumpname, self.item_for_stubs.get_filetype(), self.item_for_stubs.get_file_ext(), fileobj.partnum, DumpFilename.make_checkpoint_string( fileobj.first_page_id, fileobj.last_page_id), temp=True) self.write_partial_stub(stub_for_file, stub_output_file, runner) if not self.has_no_entries(stub_output_file, runner): partial_stubs.append(stub_output_file) if self.verbose: print "partial_stubs is", [ps.filename for ps in partial_stubs] if partial_stubs: stub_files = partial_stubs else: return for stub_file in stub_files: series = self.build_command(runner, stub_file) commands.append(series) error = runner.run_command(commands, callback_stderr=self.progress_callback, callback_stderr_arg=runner) if error: raise BackupError("error producing xml file(s) %s" % self.dumpname)