def get_max_id(wikiconf, wikidb, id_field, table):
    '''
    retrieve the largest id for this wiki from the db for specific table
    pass in name of id field, name of table

    Arguments:
        wikiconf  -- wiki configuration object (passed to Wiki())
        wikidb    -- name of the wiki database
        id_field  -- name of the numeric id column (e.g. page_id)
        table     -- table name without the db prefix

    Returns the max id as an int, or 0 when the column value is not
    numeric (probably NULL, or the table is missing).  If the query
    cannot be run at all after several retries, writes a message to
    stderr and exits the process.
    '''
    wiki = Wiki(wikiconf, wikidb)
    db_info = DbServerInfo(wiki, wikidb)
    query = "select MAX(%s) from %s%s;" % (
        id_field, db_info.db_table_prefix, table)

    maxretries = 5
    retries = 0
    results = db_info.run_sql_and_get_output(query)
    # None means the query itself failed (server hiccup etc.); retry a
    # few times with a pause.  A non-None (even empty) result is final.
    while results is None and retries < maxretries:
        retries += 1
        time.sleep(5)
        results = db_info.run_sql_and_get_output(query)

    if results:
        lines = results.splitlines()
        # lines[0] is the column header, lines[1] holds the value;
        # guard the index so a header-only result doesn't raise
        if len(lines) > 1 and lines[1]:
            if not lines[1].isdigit():
                return 0  # probably NULL or missing table
            return int(lines[1])

    sys.stderr.write("failed to get max page id from db, exiting\n")
    sys.exit(1)
def dump_max_revid(self):
    '''
    dump maximum rev id from wiki that's older than the configured
    number of seconds (cutoff)

    we have this cutoff so that content really new is not dumped;
    we want to give curators the chance to remove problematic
    entries first.  a cutoff of some hours is reasonable.

    Returns the max rev id plus one as a string (the end rev id is
    not included in a dump), or None if it could not be determined.
    '''
    max_revid = None
    revidfile = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
    if exists(revidfile.get_path()):
        self.log.info("Wiki %s, max rev id file %s already exists",
                      self.wiki.db_name, revidfile.get_path())
    else:
        self.log.info("Wiki %s retrieving max revid from db.",
                      self.wiki.db_name)
        query = ("select rev_id from revision where rev_timestamp < \"%s\" "
                 "order by rev_timestamp desc limit 1" % self.cutoff)
        db_info = DbServerInfo(self.wiki, self.wiki.db_name)
        results = db_info.run_sql_and_get_output(query)
        if results:
            lines = results.splitlines()
            # lines[0] is the column header, lines[1] the rev id value
            if len(lines) > 1 and lines[1] and lines[1].isdigit():
                max_revid = lines[1]
        if self.dryrun:
            print("would write file {path} with contents {revid}".format(
                path=revidfile.get_path(), revid=max_revid))
        elif max_revid is not None:
            # fix: the original decoded max_revid unconditionally and so
            # crashed with AttributeError when the query returned nothing;
            # also tolerate str results, not only bytes
            if isinstance(max_revid, bytes):
                contents = max_revid.decode('utf-8')
            else:
                contents = max_revid
            FileUtils.write_file_in_place(
                revidfile.get_path(), contents, self.wiki.config.fileperms)

    if not max_revid:
        # fall back to a previously written max rev id file, if any.
        # construct file_obj before the try so the except handler can
        # always reference it for logging
        file_obj = MaxRevIDFile(self.wiki.config, self.wiki.date, self.wiki.db_name)
        try:
            # NOTE(review): rstrip() is applied to the *path* here, which
            # is almost always a no-op; it may have been intended for the
            # file contents instead — confirm before changing behavior
            max_revid = FileUtils.read_file(file_obj.get_path().rstrip())
        except Exception as ex:
            self.log.info("Error encountered reading maxrevid from %s ",
                          file_obj.get_path(), exc_info=ex)
            max_revid = None

    # end rev id is not included in dump
    if max_revid is not None:
        max_revid = str(int(max_revid) + 1)
    self.log.info("max_revid is %s", safe(max_revid))
    return max_revid
def get_revs_per_page_interval(page_id_start, interval, wiki, db_info):
    '''
    given page id start and the number of pages, get and return
    total number of revisions these pages have

    Arguments:
        page_id_start -- first page id in the range (inclusive)
        interval      -- number of page ids in the range
        wiki          -- Wiki object for the specific wiki (used only to
                         get a fresh db server on retry)
        db_info       -- DbServerInfo object for the specific wiki

    Returns the revision count as an int, or 0 when the value is not
    numeric (probably NULL or a missing table).  If the query cannot
    be run at all after several retries, writes a message to stderr
    and exits the process.
    '''
    query = ("select COUNT(rev_id) from revision where "
             "rev_page >= %s and rev_page < %s;" % (
                 page_id_start, page_id_start + interval))

    maxretries = 5
    retries = 0
    results = db_info.run_sql_and_get_output(query)
    # None means the query itself failed; retry with a pause.
    while results is None and retries < maxretries:
        retries += 1
        time.sleep(5)
        # maybe the server was depooled. if so we will get another one
        db_info = DbServerInfo(wiki, wiki.db_name)
        results = db_info.run_sql_and_get_output(query)

    if results:
        lines = results.splitlines()
        # lines[0] is the column header, lines[1] holds the count;
        # guard the index so a header-only result doesn't raise
        if len(lines) > 1 and lines[1]:
            if not lines[1].isdigit():
                return 0  # probably NULL or missing table
            return int(lines[1])

    sys.stderr.write("failed to get revision count for page range from db, exiting\n")
    sys.exit(1)