def get_prev_revid(self, max_revid):
    '''
    work out the revision id this incremental should start from

    looks up the date of the previous incremental run; if there is
    one, the max rev id recorded for that date is read from its file,
    and if the file yields nothing the id is recorded afresh via the
    db and taken from the reader.
    if there is no previous run, the starting point is max_revid - 10,
    but never lower than 1.

    whatever value was found is then bumped by one and returned as
    a string; None is returned when no value could be determined.
    '''
    last_run_date = self.get_prev_incrdate(self.wiki.date)
    log.info("prev_date is %s", safe(last_run_date))

    if not last_run_date:
        log.info("Wiki %s no previous runs, using %s - 10 ",
                 self.wiki.db_name, max_revid)
        # never go below revision 1
        start_revid = str(max(int(max_revid) - 10, 1))
    else:
        reader = MaxRevID(
            self.wiki,
            cutoff_from_date(last_run_date, self.wiki.config),
            self.dryrun)
        start_revid = reader.read_max_revid_from_file(last_run_date)
        if start_revid is None:
            # nothing usable on disk for that run, so have the reader
            # record the value and pick it up from there
            log.info("Wiki %s retrieving prevRevId from db.",
                     self.wiki.db_name)
            reader.record_max_revid()
            start_revid = reader.max_id

    # the previous incremental already covered everything through
    # the max id written out for it, so start one past that.
    if start_revid is not None:
        start_revid = str(int(start_revid) + 1)

    log.info("prev_revid is %s", safe(start_revid))
    return start_revid
def get_outputfile_indextxt(self, filenames_tocheck, expected, wikiname, dump_date):
    '''
    build the index entries for the output files of one wiki's dump

    returns a list of html snippets, sorted by filename, each made of
    a link to the file followed by its date and size. only files for
    which file info reports a date get an entry; the others (the
    original note says: files that do not exist) are left out without
    complaint.

    expected is the list of filenames the dump is supposed to produce.
    a missing expected file is not treated as an error for now; it is
    simply skipped like any other file without a date.
    '''
    dump_path = MiscDumpDir(self.args['config'], dump_date).get_dumpdir(wikiname)

    # collect info for every requested file first, then report on
    # them in sorted order
    info_by_name = {
        name: FileUtils.file_info(os.path.join(dump_path, name))
        for name in filenames_tocheck
    }

    entries = []
    for name in sorted(info_by_name):
        file_date, file_size = info_by_name[name]
        self.log.info("output file %s for %s %s %s",
                      name, wikiname, safe(file_date), safe(file_size))

        # an expected file with no date may get special handling in
        # the future; at present it is skipped along with every other
        # file that has no date
        expected_but_absent = name in expected and file_date is None
        if expected_but_absent or not file_date:
            continue

        link = make_link(os.path.join(wikiname, dump_date, name),
                         os.path.basename(name))
        entries.append("%s: %s (size %s)<br />" % (link, file_date, file_size))
    return entries
def get_stat_text(self, dump_date, wikiname):
    '''
    return the contents of the status file for the dump of the given
    wiki and date, wrapped in parentheses, or None if the status file
    yields nothing
    '''
    status_path = StatusFile(self.args['config'], dump_date, wikiname).get_path()
    status = FileUtils.read_file(status_path)
    self.log.info("status for %s %s", wikiname, safe(status))

    if not status:
        return None
    return "(%s)" % (status)
def dump_max_revid(self):
    '''
    dump maximum rev id from wiki that's older than
    the configured number of seconds (cutoff)
    we have this cutoff so that content really new
    is not dumped; we want to give curators the chance to
    remove problematic entries first.
    a cutoff of some hours is reasonable.

    if the max rev id file for this wiki and date does not exist
    yet, the id is retrieved from the db and written to that file
    (or, in dryrun mode, the write is only announced on stdout).
    if no id came from the db, the file is read instead.

    returns the max rev id plus one, as a string, since the end
    rev id is not included in the dump; returns None if no id
    could be obtained from either the db or the file.
    raises ValueError if the file holds something that is not
    an integer.
    '''
    max_revid = None

    revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    revid_path = revidfile.get_path()

    if exists(revid_path):
        self.log.info("Wiki %s, max rev id file %s already exists",
                      self.wiki.db_name, revid_path)
    else:
        self.log.info("Wiki %s retrieving max revid from db.",
                      self.wiki.db_name)
        query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                 "order by rev_timestamp desc limit 1" % self.cutoff)
        db_info = DbServerInfo(self.wiki, self.wiki.db_name)
        results = db_info.run_sql_and_get_output(query)
        if results:
            lines = results.splitlines()
            # the value is taken from the second line of the output
            # (presumably the first is a column header -- TODO confirm);
            # check the length so that output with a single line does
            # not blow up with IndexError
            if len(lines) > 1 and lines[1] and lines[1].isdigit():
                max_revid = lines[1]
                if self.dryrun:
                    print("would write file {path} with contents {revid}".format(
                        path=revid_path, revid=max_revid))
                else:
                    # the query output may be bytes or text depending
                    # on the db helper; only bytes need decoding
                    if isinstance(max_revid, bytes):
                        revid_text = max_revid.decode('utf-8')
                    else:
                        revid_text = max_revid
                    FileUtils.write_file_in_place(
                        revid_path, revid_text, self.wiki.config.fileperms)

    if not max_revid:
        # either the file was already there or the db gave us
        # nothing; fall back to whatever the file has. this is
        # best effort: a read failure is logged, not raised.
        try:
            contents = FileUtils.read_file(revid_path)
            # strip the contents (not the path), and treat an empty
            # or whitespace-only file the same as no value at all
            stripped = contents.rstrip() if contents else None
            max_revid = stripped or None
        except Exception as ex:
            self.log.info("Error encountered reading maxrevid from %s ", revid_path,
                          exc_info=ex)
            max_revid = None

    # end rev id is not included in dump
    if max_revid is not None:
        max_revid = str(int(max_revid) + 1)

    self.log.info("max_revid is %s", safe(max_revid))
    return max_revid